From: Oskar Arvidsson <oskar@irock.se>
Date: Fri, 2 Jul 2010 02:06:08 +0000 (+0200)
Subject: Support for 9 and 10-bit encoding
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=c91f43a4b09dab84953f417e6d6662ec0fa7acb1;p=x264

Support for 9 and 10-bit encoding
Output bit depth is specified at compile time via --bit-depth.
There is currently almost no assembly code available for high-bit-depth modes, so encoding will be very slow.
Input is still 8-bit only; this will change in the future.

Note that very few H.264 decoders support >8 bit depth currently.
Also note that the quantizer scale differs for higher bit depths.  For example, for 10-bit, the quantizer (and crf) ranges from 0 to 63 instead of 0 to 51.
---

diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index d294eff4..b1106dd2 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -64,6 +64,19 @@ MC_WEIGHT(_nodenom)
 MC_WEIGHT(_offsetadd)
 MC_WEIGHT(_offsetsub)
 
+void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
+void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
+
+void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
+
+void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
+void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
+void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
+
+#if !X264_HIGH_BIT_DEPTH
 static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
 {
     if( w->i_scale == 1<<w->i_denom )
@@ -85,14 +98,6 @@ static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
         w->weightfn = x264_mc_wtab_neon;
 }
 
-void x264_mc_copy_w4_neon( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w8_neon( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w16_neon( uint8_t *, int, uint8_t *, int, int );
-void x264_mc_copy_w16_aligned_neon( uint8_t *, int, uint8_t *, int, int );
-
-void x264_mc_chroma_neon( uint8_t *, int, uint8_t *, int, int, int, int, int );
-void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int, int, int);
-
 static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =
 {
     NULL,
@@ -174,10 +179,6 @@ static uint8_t *get_ref_neon( uint8_t *dst,   int *i_dst_stride,
     }
 }
 
-void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, int, int );
-void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
-void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
-
 static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                               int stride, int width, int height, int16_t *buf )
 {
@@ -198,18 +199,22 @@ static void hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8
         src  += stride;
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH
 
 void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
 {
     if( !(cpu&X264_CPU_ARMV6) )
         return;
 
+#if !X264_HIGH_BIT_DEPTH
     pf->prefetch_fenc = x264_prefetch_fenc_arm;
     pf->prefetch_ref  = x264_prefetch_ref_arm;
+#endif // !X264_HIGH_BIT_DEPTH
 
     if( !(cpu&X264_CPU_NEON) )
         return;
 
+#if !X264_HIGH_BIT_DEPTH
     pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_neon;
     pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
@@ -229,15 +234,16 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->offsetsub = x264_mc_offsetsub_wtab_neon;
     pf->weight_cache = x264_weight_cache_neon;
 
-// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
-#ifndef SYS_MACOSX
-    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
-#endif
-    pf->memzero_aligned = x264_memzero_aligned_neon;
-
     pf->mc_chroma = x264_mc_chroma_neon;
     pf->mc_luma = mc_luma_neon;
     pf->get_ref = get_ref_neon;
     pf->hpel_filter = hpel_filter_neon;
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+#endif // !X264_HIGH_BIT_DEPTH
+
+// Apple's gcc stupidly cannot align stack variables, and ALIGNED_ARRAY can't work on structs
+#ifndef SYS_MACOSX
+    pf->memcpy_aligned  = x264_memcpy_aligned_neon;
+#endif
+    pf->memzero_aligned = x264_memzero_aligned_neon;
 }
diff --git a/common/arm/predict-c.c b/common/arm/predict-c.c
index fa7b9f7b..b40dc9a1 100644
--- a/common/arm/predict-c.c
+++ b/common/arm/predict-c.c
@@ -51,6 +51,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
     if (!(cpu&X264_CPU_ARMV6))
         return;
 
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_4x4_H]   = x264_predict_4x4_h_armv6;
     pf[I_PRED_4x4_DC]  = x264_predict_4x4_dc_armv6;
     pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_armv6;
@@ -59,6 +60,7 @@ void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] )
         return;
 
     pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
+#endif // !X264_HIGH_BIT_DEPTH
 }
 
 void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
@@ -66,12 +68,14 @@ void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] )
     if (!(cpu&X264_CPU_NEON))
         return;
 
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_CHROMA_DC]      = x264_predict_8x8c_dc_neon;
     pf[I_PRED_CHROMA_DC_TOP]  = x264_predict_8x8c_dc_top_neon;
     pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
     pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
     pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
     pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon;
+#endif // !X264_HIGH_BIT_DEPTH
 }
 
 void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
@@ -79,8 +83,10 @@ void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_
     if (!(cpu&X264_CPU_NEON))
         return;
 
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_8x8_DC]  = x264_predict_8x8_dc_neon;
     pf[I_PRED_8x8_H]   = x264_predict_8x8_h_neon;
+#endif // !X264_HIGH_BIT_DEPTH
 }
 
 void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
@@ -88,10 +94,12 @@ void x264_predict_16x16_init_arm( int cpu, x264_predict_t pf[7] )
     if (!(cpu&X264_CPU_NEON))
         return;
 
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_16x16_DC ]    = x264_predict_16x16_dc_neon;
     pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
     pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
     pf[I_PRED_16x16_H ]     = x264_predict_16x16_h_neon;
     pf[I_PRED_16x16_V ]     = x264_predict_16x16_v_neon;
     pf[I_PRED_16x16_P ]     = x264_predict_16x16_p_neon;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/bitstream.h b/common/bitstream.h
index d10f3a20..b2aa8b89 100644
--- a/common/bitstream.h
+++ b/common/bitstream.h
@@ -53,7 +53,7 @@ typedef struct bs_s
 typedef struct
 {
     int     last;
-    int16_t level[16];
+    dctcoef level[16];
     uint8_t run[16];
 } x264_run_level_t;
 
diff --git a/common/common.c b/common/common.c
index 14dd7167..728dfab0 100644
--- a/common/common.c
+++ b/common/common.c
@@ -91,10 +91,10 @@ void x264_param_default( x264_param_t *param )
     param->rc.i_vbv_max_bitrate = 0;
     param->rc.i_vbv_buffer_size = 0;
     param->rc.f_vbv_buffer_init = 0.9;
-    param->rc.i_qp_constant = 23;
-    param->rc.f_rf_constant = 23;
+    param->rc.i_qp_constant = 23 + QP_BD_OFFSET;
+    param->rc.f_rf_constant = 23 + QP_BD_OFFSET;
     param->rc.i_qp_min = 10;
-    param->rc.i_qp_max = 51;
+    param->rc.i_qp_max = QP_MAX;
     param->rc.i_qp_step = 4;
     param->rc.f_ip_factor = 1.4;
     param->rc.f_pb_factor = 1.3;
@@ -418,6 +418,15 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile )
     if( !profile )
         return 0;
 
+#if BIT_DEPTH > 8
+    if( !strcasecmp( profile, "baseline" ) || !strcasecmp( profile, "main" ) ||
+        !strcasecmp( profile, "high" ) )
+    {
+        x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d.\n", profile, BIT_DEPTH );
+        return -1;
+    }
+#endif
+
     if( !strcasecmp( profile, "baseline" ) )
     {
         param->analyse.b_transform_8x8 = 0;
@@ -441,7 +450,7 @@ int x264_param_apply_profile( x264_param_t *param, const char *profile )
         param->analyse.b_transform_8x8 = 0;
         param->i_cqm_preset = X264_CQM_FLAT;
     }
-    else if( !strcasecmp( profile, "high" ) )
+    else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) )
     {
         /* Default */
     }
diff --git a/common/common.h b/common/common.h
index 7b60811f..ca279683 100644
--- a/common/common.h
+++ b/common/common.h
@@ -52,10 +52,15 @@ do {\
 
 #define X264_BFRAME_MAX 16
 #define X264_THREAD_MAX 128
-#define X264_PCM_COST (386*8)
+#define X264_PCM_COST (384*BIT_DEPTH+16)
 #define X264_LOOKAHEAD_MAX 250
+#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
+#define QP_MAX (51+QP_BD_OFFSET)
+#define QP_MAX_MAX (51+2*6)
+#define LAMBDA_MAX (91 << (BIT_DEPTH-8))
+#define PIXEL_MAX ((1 << BIT_DEPTH)-1)
 // arbitrary, but low because SATD scores are 1/4 normal
-#define X264_LOOKAHEAD_QP 12
+#define X264_LOOKAHEAD_QP (12+QP_BD_OFFSET)
 
 // number of pixels (per thread) in progress at any given time.
 // 16 for the macroblock in progress + 3 for deblocking + 3 for motion compensation filter + 2 for extra safety
@@ -101,17 +106,23 @@ typedef union { x264_uint128_t i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; u
 #define CP64(dst,src) M64(dst) = M64(src)
 #define CP128(dst,src) M128(dst) = M128(src)
 
-typedef uint8_t pixel;
-typedef uint32_t pixel4;
-typedef int16_t dctcoef;
+#if X264_HIGH_BIT_DEPTH
+    typedef uint16_t pixel;
+    typedef uint64_t pixel4;
+    typedef int32_t  dctcoef;
 
-#define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
-#define MPIXEL_X4(src) M32(src)
-#define CPPIXEL_X4(dst,src) CP32(dst,src)
-#define CPPIXEL_X8(dst,src) CP64(dst,src)
-#define MDCT_X2(dct) M32(dct)
-#define CPDCT_X2(dst,src) CP32(dst,src)
-#define CPDCT_X4(dst,src) CP64(dst,src)
+#   define PIXEL_SPLAT_X4(x) ((x)*0x0001000100010001ULL)
+#   define MPIXEL_X4(src) M64(src)
+#else
+    typedef uint8_t  pixel;
+    typedef uint32_t pixel4;
+    typedef int16_t  dctcoef;
+
+#   define PIXEL_SPLAT_X4(x) ((x)*0x01010101U)
+#   define MPIXEL_X4(src) M32(src)
+#endif
+
+#define CPPIXEL_X4(dst,src) MPIXEL_X4(dst) = MPIXEL_X4(src)
 
 #define X264_SCAN8_SIZE (6*8)
 #define X264_SCAN8_LUMA_SIZE (5*8)
@@ -189,7 +200,7 @@ void x264_init_vlc_tables();
 
 static ALWAYS_INLINE pixel x264_clip_pixel( int x )
 {
-    return x&(~255) ? (-x)>>31 : x;
+    return ( (x & ~PIXEL_MAX) ? (-x)>>31 & PIXEL_MAX : x );
 }
 
 static ALWAYS_INLINE int x264_clip3( int v, int i_min, int i_max )
@@ -449,8 +460,8 @@ struct x264_t
     /* mv/ref cost arrays.  Indexed by lambda instead of
      * qp because, due to rounding, some quantizers share
      * lambdas.  This saves memory. */
-    uint16_t *cost_mv[92];
-    uint16_t *cost_mv_fpel[92][4];
+    uint16_t *cost_mv[LAMBDA_MAX+1];
+    uint16_t *cost_mv_fpel[LAMBDA_MAX+1][4];
 
     const uint8_t   *chroma_qp_table; /* includes both the nonlinear luma->chroma mapping and chroma_qp_offset */
 
diff --git a/common/dct.c b/common/dct.c
index 60dbd551..cd273636 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -418,6 +418,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
     dctf->dct4x4dc  = dct4x4dc;
     dctf->idct4x4dc = idct4x4dc;
 
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
     {
@@ -515,6 +516,7 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
         dctf->add16x16_idct8= x264_add16x16_idct8_neon;
     }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }
 
 void x264_dct_init_weights( void )
@@ -599,11 +601,9 @@ static void zigzag_scan_4x4_frame( dctcoef level[16], dctcoef dct[16] )
 
 static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
 {
-    CPDCT_X2( level, dct );
+    memcpy( level, dct, 2 * sizeof(dctcoef) );
     ZIG(2,0,1) ZIG(3,2,0) ZIG(4,3,0) ZIG(5,1,1)
-    CPDCT_X2( level+6, dct+6 );
-    CPDCT_X4( level+8, dct+8 );
-    CPDCT_X4( level+12, dct+12 );
+    memcpy( level+6, dct+6, 10 * sizeof(dctcoef) );
 }
 
 #undef ZIG
@@ -618,6 +618,7 @@ static void zigzag_scan_4x4_field( dctcoef level[16], dctcoef dct[16] )
     CPPIXEL_X4( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
     CPPIXEL_X4( p_dst+2*FDEC_STRIDE, p_src+2*FENC_STRIDE );\
     CPPIXEL_X4( p_dst+3*FDEC_STRIDE, p_src+3*FENC_STRIDE );
+#define CPPIXEL_X8(dst,src) ( CPPIXEL_X4(dst,src), CPPIXEL_X4(dst+4,src+4) )
 #define COPY8x8\
     CPPIXEL_X8( p_dst+0*FDEC_STRIDE, p_src+0*FENC_STRIDE );\
     CPPIXEL_X8( p_dst+1*FDEC_STRIDE, p_src+1*FENC_STRIDE );\
@@ -709,6 +710,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->sub_8x8    = zigzag_sub_8x8_field;
         pf->sub_4x4    = zigzag_sub_4x4_field;
         pf->sub_4x4ac  = zigzag_sub_4x4ac_field;
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
         if( cpu&X264_CPU_MMXEXT )
         {
@@ -726,6 +728,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         if( cpu&X264_CPU_ALTIVEC )
             pf->scan_4x4   = x264_zigzag_scan_4x4_field_altivec;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
     }
     else
     {
@@ -734,6 +737,7 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         pf->sub_8x8    = zigzag_sub_8x8_frame;
         pf->sub_4x4    = zigzag_sub_4x4_frame;
         pf->sub_4x4ac  = zigzag_sub_4x4ac_frame;
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
         if( cpu&X264_CPU_MMX )
             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_mmx;
@@ -759,13 +763,16 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
         if( cpu&X264_CPU_NEON )
             pf->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
     }
 
     pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
     if( cpu&X264_CPU_SHUFFLE_IS_FAST )
         pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/deblock.c b/common/deblock.c
index 9e42d43e..c7298747 100644
--- a/common/deblock.c
+++ b/common/deblock.c
@@ -25,8 +25,9 @@
 #include "common.h"
 
 /* Deblocking filter */
-static const uint8_t i_alpha_table[52+12*2] =
+static const uint8_t i_alpha_table[52+12*3] =
 {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  4,  4,  5,  6,
@@ -36,8 +37,9 @@ static const uint8_t i_alpha_table[52+12*2] =
    255,255,
    255,255,255,255,255,255,255,255,255,255,255,255,
 };
-static const uint8_t i_beta_table[52+12*2] =
+static const uint8_t i_beta_table[52+12*3] =
 {
+     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
      0,  0,  0,  0,  0,  0,  2,  2,  2,  3,
@@ -47,12 +49,14 @@ static const uint8_t i_beta_table[52+12*2] =
     18, 18,
     18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18, 18,
 };
-static const int8_t i_tc0_table[52+12*2][4] =
+static const int8_t i_tc0_table[52+12*3][4] =
 {
     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
+    {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 },
     {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 0 }, {-1, 0, 0, 1 },
     {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 0, 1 }, {-1, 0, 1, 1 }, {-1, 0, 1, 1 }, {-1, 1, 1, 1 },
     {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 1 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 }, {-1, 1, 1, 2 },
@@ -63,9 +67,9 @@ static const int8_t i_tc0_table[52+12*2][4] =
     {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
     {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 }, {-1,13,17,25 },
 };
-#define alpha_table(x) i_alpha_table[(x)+12]
-#define beta_table(x)  i_beta_table[(x)+12]
-#define tc0_table(x)   i_tc0_table[(x)+12]
+#define alpha_table(x) i_alpha_table[(x)+24]
+#define beta_table(x)  i_beta_table[(x)+24]
+#define tc0_table(x)   i_tc0_table[(x)+24]
 
 /* From ffmpeg */
 static inline void deblock_luma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
@@ -265,18 +269,19 @@ static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264
 
 static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
 {
-    int index_a = i_qp + h->sh.i_alpha_c0_offset;
-    int alpha = alpha_table(index_a);
-    int beta  = beta_table(i_qp + h->sh.i_beta_offset);
+    int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
+    int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
+    int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
+    int beta  = beta_table(index_b) << (BIT_DEPTH-8);
     int8_t tc[4];
 
     if( !M32(bS) || !alpha || !beta )
         return;
 
-    tc[0] = tc0_table(index_a)[bS[0]] + b_chroma;
-    tc[1] = tc0_table(index_a)[bS[1]] + b_chroma;
-    tc[2] = tc0_table(index_a)[bS[2]] + b_chroma;
-    tc[3] = tc0_table(index_a)[bS[3]] + b_chroma;
+    tc[0] = (tc0_table(index_a)[bS[0]] << (BIT_DEPTH-8)) + b_chroma;
+    tc[1] = (tc0_table(index_a)[bS[1]] << (BIT_DEPTH-8)) + b_chroma;
+    tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma;
+    tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma;
 
     pf_inter( pix1, i_stride, alpha, beta, tc );
     if( b_chroma )
@@ -285,8 +290,10 @@ static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stri
 
 static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
 {
-    int alpha = alpha_table(i_qp + h->sh.i_alpha_c0_offset);
-    int beta  = beta_table(i_qp + h->sh.i_beta_offset);
+    int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
+    int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
+    int alpha = alpha_table(index_a) << (BIT_DEPTH-8);
+    int beta  = beta_table(index_b) << (BIT_DEPTH-8);
 
     if( !alpha || !beta )
         return;
@@ -450,6 +457,7 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
 #if HAVE_MMX
     if( cpu&X264_CPU_MMXEXT )
     {
+#if !X264_HIGH_BIT_DEPTH
         pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
         pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
         pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
@@ -460,10 +468,12 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
         pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext;
         pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
         pf->deblock_strength = x264_deblock_strength_mmxext;
         if( cpu&X264_CPU_SSE2 )
         {
             pf->deblock_strength = x264_deblock_strength_sse2;
+#if !X264_HIGH_BIT_DEPTH
             if( !(cpu&X264_CPU_STACK_MOD4) )
             {
                 pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
@@ -471,12 +481,14 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
                 pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
                 pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
             }
+#endif // !X264_HIGH_BIT_DEPTH
         }
         if( cpu&X264_CPU_SSSE3 )
             pf->deblock_strength = x264_deblock_strength_ssse3;
     }
 #endif
 
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_ALTIVEC
     if( cpu&X264_CPU_ALTIVEC )
     {
@@ -494,4 +506,5 @@ void x264_deblock_init( int cpu, x264_deblock_function_t *pf )
         pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
    }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/macroblock.c b/common/macroblock.c
index 94df8f6e..386063ac 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -337,7 +337,7 @@ int x264_macroblock_thread_allocate( x264_t *h, int b_lookahead )
     int scratch_size = 0;
     if( !b_lookahead )
     {
-        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(int16_t);
+        int buf_hpel = (h->thread[0]->fdec->i_width[0]+48) * sizeof(dctcoef);
         int buf_ssim = h->param.analyse.b_ssim * 8 * (h->param.i_width/4+3) * sizeof(int);
         int me_range = X264_MIN(h->param.analyse.i_me_range, h->param.analyse.i_mv_range);
         int buf_tesa = (h->param.analyse.i_me_method >= X264_ME_ESA) *
diff --git a/common/macroblock.h b/common/macroblock.h
index b2723da2..68844cc3 100644
--- a/common/macroblock.h
+++ b/common/macroblock.h
@@ -238,17 +238,30 @@ static const uint16_t block_idx_xy_fdec[16] =
     2*4 + 3*4*FDEC_STRIDE, 3*4 + 3*4*FDEC_STRIDE
 };
 
-static const uint8_t i_chroma_qp_table[52+12*2] =
+#define QP(qP) ( (qP)+QP_BD_OFFSET )
+static const uint8_t i_chroma_qp_table[QP_MAX+1+12*2] =
 {
-     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
-     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
-    10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
-    20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
-    29, 30, 31, 32, 32, 33, 34, 34, 35, 35,
-    36, 36, 37, 37, 37, 38, 38, 38, 39, 39,
-    39, 39,
-    39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39, 39,
+         0,      0,      0,      0,      0,      0,
+         0,      0,      0,      0,      0,      0,
+#if BIT_DEPTH > 9
+   QP(-12),QP(-11),QP(-10), QP(-9), QP(-8), QP(-7),
+#endif
+#if BIT_DEPTH > 8
+    QP(-6), QP(-5), QP(-4), QP(-3), QP(-2), QP(-1),
+#endif
+     QP(0),  QP(1),  QP(2),  QP(3),  QP(4),  QP(5),
+     QP(6),  QP(7),  QP(8),  QP(9), QP(10), QP(11),
+    QP(12), QP(13), QP(14), QP(15), QP(16), QP(17),
+    QP(18), QP(19), QP(20), QP(21), QP(22), QP(23),
+    QP(24), QP(25), QP(26), QP(27), QP(28), QP(29),
+    QP(29), QP(30), QP(31), QP(32), QP(32), QP(33),
+    QP(34), QP(34), QP(35), QP(35), QP(36), QP(36),
+    QP(37), QP(37), QP(37), QP(38), QP(38), QP(38),
+    QP(39), QP(39), QP(39), QP(39),
+    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
+    QP(39), QP(39), QP(39), QP(39), QP(39), QP(39),
 };
+#undef QP
 
 enum cabac_ctx_block_cat_e
 {
@@ -340,26 +353,31 @@ static ALWAYS_INLINE uint32_t pack16to32_mask( int a, int b )
    return (a&0xFFFF) + (b<<16);
 #endif
 }
+static ALWAYS_INLINE uint64_t pack32to64( uint32_t a, uint32_t b )
+{
+#ifdef WORDS_BIGENDIAN
+   return b + ((uint64_t)a<<32);
+#else
+   return a + ((uint64_t)b<<32);
+#endif
+}
 
-#define pack_pixel_1to2 pack8to16
-#define pack_pixel_2to4 pack16to32
+#if X264_HIGH_BIT_DEPTH
+#   define pack_pixel_1to2 pack16to32
+#   define pack_pixel_2to4 pack32to64
+#else
+#   define pack_pixel_1to2 pack8to16
+#   define pack_pixel_2to4 pack16to32
+#endif
 
-#define array_non_zero(a) array_non_zero_int(a, sizeof(a))
+#define array_non_zero(a) array_non_zero_int(a, sizeof(a)/sizeof(dctcoef))
 #define array_non_zero_int array_non_zero_int
 static ALWAYS_INLINE int array_non_zero_int( dctcoef *v, int i_count )
 {
-    if(i_count == 8)
-        return !!M64( &v[0] );
-    else if(i_count == 16)
-        return !!(M64( &v[0] ) | M64( &v[4] ));
-    else if(i_count == 32)
-        return !!(M64( &v[0] ) | M64( &v[4] ) | M64( &v[8] ) | M64( &v[12] ));
-    else
-    {
-        for( int i = 0; i < i_count; i+=4 )
-            if( M64( &v[i] ) ) return 1;
-        return 0;
-    }
+    for( int i = 0; i < i_count; i++ )
+        if( v[i] )
+            return 1;
+    return 0;
 }
 static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
 {
diff --git a/common/mc.c b/common/mc.c
index 9776becf..5ef0682e 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -117,11 +117,14 @@ static void x264_weight_cache( x264_t *h, x264_weight_t *w )
 {
     w->weightfn = h->mc.weight;
 }
-#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * weight->i_scale + (1<<(weight->i_denom - 1))) >> weight->i_denom) + weight->i_offset )
-#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * weight->i_scale + weight->i_offset )
-static inline void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
+#define opscale(x) dst[x] = x264_clip_pixel( ((src[x] * scale + (1<<(denom - 1))) >> denom) + offset )
+#define opscale_noden(x) dst[x] = x264_clip_pixel( src[x] * scale + offset )
+static void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int i_width, int i_height )
 {
-    if( weight->i_denom >= 1 )
+    int offset = weight->i_offset << (BIT_DEPTH-8);
+    int scale = weight->i_scale;
+    int denom = weight->i_denom;
+    if( denom >= 1 )
     {
         for( int y = 0; y < i_height; y++, dst += i_dst_stride, src += i_src_stride )
             for( int x = 0; x < i_width; x++ )
@@ -135,21 +138,10 @@ static inline void mc_weight( pixel *dst, int i_dst_stride, pixel *src, int i_sr
     }
 }
 
-#define MC_WEIGHT_C( name, lx ) \
+#define MC_WEIGHT_C( name, width ) \
     static void name( pixel *dst, int i_dst_stride, pixel *src, int i_src_stride, const x264_weight_t *weight, int height ) \
 { \
-    if( weight->i_denom >= 1 ) \
-    { \
-        for( int y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
-            for( int x = 0; x < lx; x++ ) \
-                opscale( x ); \
-    } \
-    else \
-    { \
-        for( int y = 0; y < height; y++, dst += i_dst_stride, src += i_src_stride ) \
-            for( int x = 0; x < lx; x++ ) \
-                opscale_noden( x ); \
-    } \
+    mc_weight( dst, i_dst_stride, src, i_src_stride, weight, width, height );\
 }
 
 MC_WEIGHT_C( mc_weight_w20, 20 )
@@ -182,7 +174,7 @@ static void mc_copy( pixel *src, int i_src_stride, pixel *dst, int i_dst_stride,
 
 #define TAPFILTER(pix, d) ((pix)[x-2*d] + (pix)[x+3*d] - 5*((pix)[x-d] + (pix)[x+2*d]) + 20*((pix)[x] + (pix)[x+d]))
 static void hpel_filter( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
-                         int stride, int width, int height, int16_t *buf )
+                         int stride, int width, int height, dctcoef *buf )
 {
     for( int y = 0; y < height; y++ )
     {
@@ -301,7 +293,12 @@ void x264_plane_copy_c( pixel *dst, int i_dst,
 {
     while( h-- )
     {
+#if X264_HIGH_BIT_DEPTH
+        for( int i = 0; i < w; i++ )
+            dst[i] = src[i] << (BIT_DEPTH-8);
+#else
         memcpy( dst, src, w );
+#endif
         dst += i_dst;
         src += i_src;
     }
diff --git a/common/mc.h b/common/mc.h
index bb16d13e..cbdf1a63 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -82,7 +82,7 @@ typedef struct
                         uint8_t *src, int i_src, int w, int h);
 
     void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
-                         int i_stride, int i_width, int i_height, int16_t *buf );
+                         int i_stride, int i_width, int i_height, dctcoef *buf );
 
     /* prefetch the next few macroblocks of fenc or fdec */
     void (*prefetch_fenc)( pixel *pix_y, int stride_y,
diff --git a/common/pixel.c b/common/pixel.c
index 8441c7ae..069589f6 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -177,7 +177,7 @@ static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride
         pix2 += i_stride2;
     }
     sum = abs(sum);
-    var = sqr - (sum * sum >> 6);
+    var = sqr - ((uint64_t)sum * sum >> 6);
     *ssd = sqr;
     return var;
 }
@@ -406,12 +406,14 @@ SAD_X( 8x4 )
 SAD_X( 4x8 )
 SAD_X( 4x4 )
 
+#if !X264_HIGH_BIT_DEPTH
 #if ARCH_UltraSparc
 SAD_X( 16x16_vis )
 SAD_X( 16x8_vis )
 SAD_X( 8x16_vis )
 SAD_X( 8x8_vis )
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 
 /****************************************************************************
  * pixel_satd_x4
@@ -444,6 +446,7 @@ SATD_X_DECL6( cpu )\
 SATD_X( 4x4, cpu )
 
 SATD_X_DECL7()
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
 SATD_X_DECL7( _mmxext )
 SATD_X_DECL6( _sse2 )
@@ -454,6 +457,7 @@ SATD_X_DECL7( _sse4 )
 #if HAVE_ARMV6
 SATD_X_DECL7( _neon )
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 
 #define INTRA_MBCMP_8x8( mbcmp )\
 void x264_intra_##mbcmp##_x3_8x8( pixel *fenc, pixel edge[33], int res[3] )\
@@ -520,8 +524,8 @@ static void ssim_4x4x2_core( const pixel *pix1, int stride1,
 
 static float ssim_end1( int s1, int s2, int ss, int s12 )
 {
-    static const int ssim_c1 = (int)(.01*.01*255*255*64 + .5);
-    static const int ssim_c2 = (int)(.03*.03*255*255*64*63 + .5);
+    static const int ssim_c1 = (int)(.01*.01*PIXEL_MAX*PIXEL_MAX*64 + .5);
+    static const int ssim_c2 = (int)(.03*.03*PIXEL_MAX*PIXEL_MAX*64*63 + .5);
     int vars = ss*64 - s1*s1 - s2*s2;
     int covar = s12*64 - s1*s2;
     return (float)(2*s1*s2 + ssim_c1) * (float)(2*covar + ssim_c2)
@@ -678,6 +682,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
     pixf->intra_sad_x3_16x16  = x264_intra_sad_x3_16x16;
     pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;
 
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
     {
@@ -903,17 +908,20 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
         }
     }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 #if HAVE_ALTIVEC
     if( cpu&X264_CPU_ALTIVEC )
     {
         x264_pixel_altivec_init( pixf );
     }
 #endif
+#if !X264_HIGH_BIT_DEPTH
 #if ARCH_UltraSparc
     INIT4( sad, _vis );
     INIT4( sad_x3, _vis );
     INIT4( sad_x4, _vis );
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 
     pixf->ads[PIXEL_8x16] =
     pixf->ads[PIXEL_8x4] =
diff --git a/common/ppc/dct.c b/common/ppc/dct.c
index eb223ae2..85d5ce7f 100644
--- a/common/ppc/dct.c
+++ b/common/ppc/dct.c
@@ -24,6 +24,7 @@
 #include "common/common.h"
 #include "ppccommon.h"
 
+#if !X264_HIGH_BIT_DEPTH
 #define VEC_DCT(a0,a1,a2,a3,b0,b1,b2,b3) \
     b1 = vec_add( a0, a3 );              \
     b3 = vec_add( a1, a2 );              \
@@ -482,4 +483,5 @@ void x264_zigzag_scan_4x4_field_altivec( int16_t level[16], int16_t dct[4][4] )
     vec_st( tmp0v, 0x00, level );
     vec_st( tmp1v, 0x10, level );
 }
+#endif // !X264_HIGH_BIT_DEPTH
 
diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c
index 0c8d2d43..986710d9 100644
--- a/common/ppc/deblock.c
+++ b/common/ppc/deblock.c
@@ -21,6 +21,7 @@
 #include "common/common.h"
 #include "ppccommon.h"
 
+#if !X264_HIGH_BIT_DEPTH
 #define transpose4x16(r0, r1, r2, r3)        \
 {                                            \
     register vec_u8_t r4;                    \
@@ -292,3 +293,4 @@ void x264_deblock_h_luma_altivec( uint8_t *pix, int stride, int alpha, int beta,
     transpose4x16(line1, line2, line3, line4);
     write16x4(pix-2, stride, line1, line2, line3, line4);
 }
+#endif // !X264_HIGH_BIT_DEPTH
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 7ad80505..744a8043 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -33,6 +33,7 @@
 #include "mc.h"
 #include "ppccommon.h"
 
+#if !X264_HIGH_BIT_DEPTH
 typedef void (*pf_mc_t)( uint8_t *src, int i_src,
                          uint8_t *dst, int i_dst, int i_height );
 
@@ -792,9 +793,11 @@ static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_
         dstc += dst_stride;
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH
 
 void x264_mc_altivec_init( x264_mc_functions_t *pf )
 {
+#if !X264_HIGH_BIT_DEPTH
     pf->mc_luma   = mc_luma_altivec;
     pf->get_ref   = get_ref_altivec;
     pf->mc_chroma = mc_chroma_altivec;
@@ -804,4 +807,5 @@ void x264_mc_altivec_init( x264_mc_functions_t *pf )
 
     pf->hpel_filter = x264_hpel_filter_altivec;
     pf->frame_init_lowres_core = frame_init_lowres_core_altivec;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index 3f996065..bd5f547f 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -24,6 +24,7 @@
 #include "common/common.h"
 #include "ppccommon.h"
 
+#if !X264_HIGH_BIT_DEPTH
 /***********************************************************************
  * SAD routines
  **********************************************************************/
@@ -1979,12 +1980,14 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
     sums[0][3] = temp[0];
     sums[1][3] = temp[1];
 }
+#endif // !X264_HIGH_BIT_DEPTH
 
 /****************************************************************************
  * x264_pixel_init:
  ****************************************************************************/
 void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
 {
+#if !X264_HIGH_BIT_DEPTH
     pixf->sad[PIXEL_16x16]  = pixel_sad_16x16_altivec;
     pixf->sad[PIXEL_8x16]   = pixel_sad_8x16_altivec;
     pixf->sad[PIXEL_16x8]   = pixel_sad_16x8_altivec;
@@ -2023,4 +2026,5 @@ void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
     pixf->hadamard_ac[PIXEL_8x8]   = x264_pixel_hadamard_ac_8x8_altivec;
 
     pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/ppc/predict.c b/common/ppc/predict.c
index 3fb1a2b7..c71dbb56 100644
--- a/common/ppc/predict.c
+++ b/common/ppc/predict.c
@@ -23,6 +23,7 @@
 #include "pixel.h"
 #include "ppccommon.h"
 
+#if !X264_HIGH_BIT_DEPTH
 static void predict_8x8c_p_altivec( uint8_t *src )
 {
     int H = 0, V = 0;
@@ -194,6 +195,7 @@ static void predict_16x16_v_altivec( uint8_t *src )
         src += FDEC_STRIDE;
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH
 
 
 /****************************************************************************
@@ -201,6 +203,7 @@ static void predict_16x16_v_altivec( uint8_t *src )
  ****************************************************************************/
 void x264_predict_16x16_init_altivec( x264_predict_t pf[7] )
 {
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_16x16_V ]      = predict_16x16_v_altivec;
     pf[I_PRED_16x16_H ]      = predict_16x16_h_altivec;
     pf[I_PRED_16x16_DC]      = predict_16x16_dc_altivec;
@@ -208,9 +211,12 @@ void x264_predict_16x16_init_altivec( x264_predict_t pf[7] )
     pf[I_PRED_16x16_DC_LEFT] = predict_16x16_dc_left_altivec;
     pf[I_PRED_16x16_DC_TOP ] = predict_16x16_dc_top_altivec;
     pf[I_PRED_16x16_DC_128 ] = predict_16x16_dc_128_altivec;
+#endif // !X264_HIGH_BIT_DEPTH
 }
 
 void x264_predict_8x8c_init_altivec( x264_predict_t pf[7] )
 {
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_CHROMA_P]       = predict_8x8c_p_altivec;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/ppc/quant.c b/common/ppc/quant.c
index 6f41a06f..ffd6a1ba 100644
--- a/common/ppc/quant.c
+++ b/common/ppc/quant.c
@@ -22,6 +22,7 @@
 #include "ppccommon.h"
 #include "quant.h"
 
+#if !X264_HIGH_BIT_DEPTH
 // quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
 #define QUANT_16_U( idx0, idx1 )                                    \
 {                                                                   \
@@ -360,4 +361,5 @@ void x264_dequant_8x8_altivec( int16_t dct[8][8], int dequant_mf[6][8][8], int i
             DEQUANT_SHR();
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH
 
diff --git a/common/predict.c b/common/predict.c
index 782dfa32..fa71b6e6 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -53,40 +53,40 @@
 
 void x264_predict_16x16_dc_c( pixel *src )
 {
-    pixel4 dc = 0;
+    int dc = 0;
 
     for( int i = 0; i < 16; i++ )
     {
         dc += src[-1 + i * FDEC_STRIDE];
         dc += src[i - FDEC_STRIDE];
     }
-    dc = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 );
+    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 16 ) >> 5 );
 
-    PREDICT_16x16_DC( dc );
+    PREDICT_16x16_DC( dcsplat );
 }
 static void x264_predict_16x16_dc_left_c( pixel *src )
 {
-    pixel4 dc = 0;
+    int dc = 0;
 
     for( int i = 0; i < 16; i++ )
         dc += src[-1 + i * FDEC_STRIDE];
-    dc = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
+    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
 
-    PREDICT_16x16_DC( dc );
+    PREDICT_16x16_DC( dcsplat );
 }
 static void x264_predict_16x16_dc_top_c( pixel *src )
 {
-    pixel4 dc = 0;
+    int dc = 0;
 
     for( int i = 0; i < 16; i++ )
         dc += src[i - FDEC_STRIDE];
-    dc = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
+    pixel4 dcsplat = PIXEL_SPLAT_X4( ( dc + 8 ) >> 4 );
 
-    PREDICT_16x16_DC( dc );
+    PREDICT_16x16_DC( dcsplat );
 }
 static void x264_predict_16x16_dc_128_c( pixel *src )
 {
-    PREDICT_16x16_DC( PIXEL_SPLAT_X4( 0x80 ) );
+    PREDICT_16x16_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
 }
 void x264_predict_16x16_h_c( pixel *src )
 {
@@ -155,53 +155,53 @@ static void x264_predict_8x8c_dc_128_c( pixel *src )
 {
     for( int y = 0; y < 8; y++ )
     {
-        MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 0x80 );
-        MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 0x80 );
+        MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
+        MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
         src += FDEC_STRIDE;
     }
 }
 static void x264_predict_8x8c_dc_left_c( pixel *src )
 {
-    pixel4 dc0 = 0, dc1 = 0;
+    int dc0 = 0, dc1 = 0;
 
     for( int y = 0; y < 4; y++ )
     {
         dc0 += src[y * FDEC_STRIDE     - 1];
         dc1 += src[(y+4) * FDEC_STRIDE - 1];
     }
-    dc0 = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
-    dc1 = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
+    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
+    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
 
     for( int y = 0; y < 4; y++ )
     {
-        MPIXEL_X4( src+0 ) = dc0;
-        MPIXEL_X4( src+4 ) = dc0;
+        MPIXEL_X4( src+0 ) = dc0splat;
+        MPIXEL_X4( src+4 ) = dc0splat;
         src += FDEC_STRIDE;
     }
     for( int y = 0; y < 4; y++ )
     {
-        MPIXEL_X4( src+0 ) = dc1;
-        MPIXEL_X4( src+4 ) = dc1;
+        MPIXEL_X4( src+0 ) = dc1splat;
+        MPIXEL_X4( src+4 ) = dc1splat;
         src += FDEC_STRIDE;
     }
 
 }
 static void x264_predict_8x8c_dc_top_c( pixel *src )
 {
-    pixel4 dc0 = 0, dc1 = 0;
+    int dc0 = 0, dc1 = 0;
 
     for( int x = 0; x < 4; x++ )
     {
         dc0 += src[x     - FDEC_STRIDE];
         dc1 += src[x + 4 - FDEC_STRIDE];
     }
-    dc0 = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
-    dc1 = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
+    pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
+    pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
 
     for( int y = 0; y < 8; y++ )
     {
-        MPIXEL_X4( src+0 ) = dc0;
-        MPIXEL_X4( src+4 ) = dc1;
+        MPIXEL_X4( src+0 ) = dc0splat;
+        MPIXEL_X4( src+4 ) = dc1splat;
         src += FDEC_STRIDE;
     }
 }
@@ -306,7 +306,7 @@ static void x264_predict_8x8c_p_c( pixel *src )
 
 static void x264_predict_4x4_dc_128_c( pixel *src )
 {
-    PREDICT_4x4_DC( PIXEL_SPLAT_X4( 0x80 ) );
+    PREDICT_4x4_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
 }
 static void x264_predict_4x4_dc_left_c( pixel *src )
 {
@@ -491,7 +491,8 @@ static void x264_predict_8x8_filter_c( pixel *src, pixel edge[33], int i_neighbo
             }
             else
             {
-                M64( edge+24 ) = SRC(7,-1) * 0x0101010101010101ULL;
+                MPIXEL_X4( edge+24 ) = PIXEL_SPLAT_X4( SRC(7,-1) );
+                MPIXEL_X4( edge+28 ) = PIXEL_SPLAT_X4( SRC(7,-1) );
                 edge[32] = SRC(7,-1);
             }
         }
@@ -523,7 +524,7 @@ static void x264_predict_8x8_filter_c( pixel *src, pixel edge[33], int i_neighbo
 
 static void x264_predict_8x8_dc_128_c( pixel *src, pixel edge[33] )
 {
-    PREDICT_8x8_DC( PIXEL_SPLAT_X4( 0x80 ) );
+    PREDICT_8x8_DC( PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) ) );
 }
 static void x264_predict_8x8_dc_left_c( pixel *src, pixel edge[33] )
 {
@@ -554,9 +555,13 @@ void x264_predict_8x8_h_c( pixel *src, pixel edge[33] )
 }
 void x264_predict_8x8_v_c( pixel *src, pixel edge[33] )
 {
-    uint64_t top = M64( edge+16 );
+    pixel4 top[2] = { MPIXEL_X4( edge+16 ),
+                      MPIXEL_X4( edge+20 ) };
     for( int y = 0; y < 8; y++ )
-        M64( src+y*FDEC_STRIDE ) = top;
+    {
+        MPIXEL_X4( src+y*FDEC_STRIDE+0 ) = top[0];
+        MPIXEL_X4( src+y*FDEC_STRIDE+4 ) = top[1];
+    }
 }
 static void x264_predict_8x8_ddl_c( pixel *src, pixel edge[33] )
 {
diff --git a/common/quant.c b/common/quant.c
index ece52f9d..a7b72cfb 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -142,7 +142,7 @@ static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, uint16_t *offset, int
     for( int i = 1; i < size; i++ )
     {
         int level = dct[i];
-        int sign = level>>15;
+        int sign = level>>31;
         level = (level+sign)^sign;
         sum[i] += level;
         level -= offset[i];
@@ -177,10 +177,7 @@ static int ALWAYS_INLINE x264_decimate_score_internal( dctcoef *dct, int i_max )
     int i_score = 0;
     int idx = i_max - 1;
 
-    /* Yes, dct[idx-1] is guaranteed to be 32-bit aligned.  idx>=0 instead of 1 works correctly for the same reason */
-    while( idx >= 0 && MDCT_X2( &dct[idx-1] ) == 0 )
-        idx -= 2;
-    if( idx >= 0 && dct[idx] == 0 )
+    while( idx >= 0 && dct[idx] == 0 )
         idx--;
     while( idx >= 0 )
     {
@@ -216,10 +213,7 @@ static int x264_decimate_score64( dctcoef *dct )
 
 static int ALWAYS_INLINE x264_coeff_last_internal( dctcoef *l, int i_count )
 {
-    int i_last;
-    for( i_last = i_count-1; i_last >= 3; i_last -= 4 )
-        if( M64( l+i_last-3 ) )
-            break;
+    int i_last = i_count-1;
     while( i_last >= 0 && l[i_last] == 0 )
         i_last--;
     return i_last;
@@ -287,6 +281,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
     pf->coeff_level_run[  DCT_LUMA_AC] = x264_coeff_level_run15;
     pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
 
+#if !X264_HIGH_BIT_DEPTH
 #if HAVE_MMX
     if( cpu&X264_CPU_MMX )
     {
@@ -425,6 +420,7 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
         pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
     }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
     pf->coeff_last[  DCT_LUMA_DC] = pf->coeff_last[DCT_LUMA_4x4];
     pf->coeff_last[DCT_CHROMA_AC] = pf->coeff_last[ DCT_LUMA_AC];
     pf->coeff_level_run[  DCT_LUMA_DC] = pf->coeff_level_run[DCT_LUMA_4x4];
diff --git a/common/set.c b/common/set.c
index 16cff8ef..86f38542 100644
--- a/common/set.c
+++ b/common/set.c
@@ -78,6 +78,7 @@ int x264_cqm_init( x264_t *h )
                         32 - 11, 32 - 21 };
     int max_qp_err = -1;
     int max_chroma_qp_err = -1;
+    int min_qp_err = QP_MAX+1;
 
     for( int i = 0; i < 6; i++ )
     {
@@ -94,9 +95,9 @@ int x264_cqm_init( x264_t *h )
         }
         else
         {
-            CHECKED_MALLOC( h->  quant4_mf[i], 52*size*sizeof(uint16_t) );
+            CHECKED_MALLOC( h->  quant4_mf[i], (QP_MAX+1)*size*sizeof(uint16_t) );
             CHECKED_MALLOC( h->dequant4_mf[i],  6*size*sizeof(int) );
-            CHECKED_MALLOC( h->unquant4_mf[i], 52*size*sizeof(int) );
+            CHECKED_MALLOC( h->unquant4_mf[i], (QP_MAX+1)*size*sizeof(int) );
         }
 
         for( j = (i<4 ? 0 : 4); j < i; j++ )
@@ -106,7 +107,7 @@ int x264_cqm_init( x264_t *h )
         if( j < i )
             h->quant4_bias[i] = h->quant4_bias[j];
         else
-            CHECKED_MALLOC( h->quant4_bias[i], 52*size*sizeof(uint16_t) );
+            CHECKED_MALLOC( h->quant4_bias[i], (QP_MAX+1)*size*sizeof(uint16_t) );
     }
 
     for( int q = 0; q < 6; q++ )
@@ -140,7 +141,7 @@ int x264_cqm_init( x264_t *h )
                      quant8_mf[i_list][q][i] = DIV(def_quant8[q][i] * 16, h->pps->scaling_list[4+i_list][i]);
             }
     }
-    for( int q = 0; q < 52; q++ )
+    for( int q = 0; q < QP_MAX+1; q++ )
     {
         int j;
         for( int i_list = 0; i_list < 4; i_list++ )
@@ -148,6 +149,11 @@ int x264_cqm_init( x264_t *h )
             {
                 h->unquant4_mf[i_list][q][i] = (1ULL << (q/6 + 15 + 8)) / quant4_mf[i_list][q%6][i];
                 h->quant4_mf[i_list][q][i] = j = SHIFT(quant4_mf[i_list][q%6][i], q/6 - 1);
+                if( !j )
+                {
+                    min_qp_err = X264_MIN( min_qp_err, q );
+                    continue;
+                }
                 // round to nearest, unless that would cause the deadzone to be negative
                 h->quant4_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                 if( j > 0xffff && q > max_qp_err && (i_list == CQM_4IY || i_list == CQM_4PY) )
@@ -161,6 +167,11 @@ int x264_cqm_init( x264_t *h )
                 {
                     h->unquant8_mf[i_list][q][i] = (1ULL << (q/6 + 16 + 8)) / quant8_mf[i_list][q%6][i];
                     h->quant8_mf[i_list][q][i] = j = SHIFT(quant8_mf[i_list][q%6][i], q/6);
+                    if( !j )
+                    {
+                        min_qp_err = X264_MIN( min_qp_err, q );
+                        continue;
+                    }
                     h->quant8_bias[i_list][q][i] = X264_MIN( DIV(deadzone[i_list]<<10, j), (1<<15)/j );
                     if( j > 0xffff && q > max_qp_err )
                         max_qp_err = q;
@@ -179,6 +190,12 @@ int x264_cqm_init( x264_t *h )
         x264_log( h, X264_LOG_ERROR, "but min chroma QP is implied to be %d.\n", h->chroma_qp_table[h->param.rc.i_qp_min] );
         return -1;
     }
+    if( !h->mb.b_lossless && min_qp_err <= h->param.rc.i_qp_max )
+    {
+        x264_log( h, X264_LOG_ERROR, "Quantization underflow.  Your CQM is incompatible with QP > %d,\n", min_qp_err-1 );
+        x264_log( h, X264_LOG_ERROR, "but max QP is implied to be %d.\n", h->param.rc.i_qp_max );
+        return -1;
+    }
     return 0;
 fail:
     x264_cqm_delete( h );
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 4ddf2e5b..8a12f833 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -125,6 +125,7 @@ PIXEL_AVG_WALL(sse2)
 PIXEL_AVG_WALL(sse2_misalign)
 PIXEL_AVG_WALL(cache64_ssse3)
 
+#if !X264_HIGH_BIT_DEPTH
 #define PIXEL_AVG_WTAB(instr, name1, name2, name3, name4, name5)\
 static void (* const x264_pixel_avg_wtab_##instr[6])( uint8_t *, int, uint8_t *, int, uint8_t *, int ) =\
 {\
@@ -355,24 +356,28 @@ static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i
         x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+15)&~15, h-1 );
     }
 }
+#endif // !X264_HIGH_BIT_DEPTH
 
 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
     if( !(cpu&X264_CPU_MMX) )
         return;
 
+    pf->memcpy_aligned = x264_memcpy_aligned_mmx;
+    pf->memzero_aligned = x264_memzero_aligned_mmx;
+#if !X264_HIGH_BIT_DEPTH
     pf->copy_16x16_unaligned = x264_mc_copy_w16_mmx;
     pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
     pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_mmx;
     pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_mmx;
-    pf->memcpy_aligned = x264_memcpy_aligned_mmx;
-    pf->memzero_aligned = x264_memzero_aligned_mmx;
     pf->integral_init4v = x264_integral_init4v_mmx;
     pf->integral_init8v = x264_integral_init8v_mmx;
+#endif // !X264_HIGH_BIT_DEPTH
 
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
 
+#if !X264_HIGH_BIT_DEPTH
     pf->mc_luma = mc_luma_mmxext;
     pf->get_ref = get_ref_mmxext;
     pf->mc_chroma = x264_mc_chroma_mmxext;
@@ -412,12 +417,14 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_cache32_mmxext;
     }
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 
     if( !(cpu&X264_CPU_SSE2) )
         return;
 
     pf->memcpy_aligned = x264_memcpy_aligned_sse2;
     pf->memzero_aligned = x264_memzero_aligned_sse2;
+#if !X264_HIGH_BIT_DEPTH
     pf->integral_init4v = x264_integral_init4v_sse2;
     pf->integral_init8v = x264_integral_init8v_sse2;
     pf->hpel_filter = x264_hpel_filter_sse2_amd;
@@ -492,4 +499,5 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 
     pf->integral_init4h = x264_integral_init4h_sse4;
     pf->integral_init8h = x264_integral_init8h_sse4;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index e771431e..4004265f 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -75,6 +75,7 @@
  void x264_predict_16x16_v_sse2( uint8_t *src );
  void x264_predict_16x16_p_core_sse2( uint8_t *src, int i00, int b, int c );
 
+#if !X264_HIGH_BIT_DEPTH
 ALIGNED_8( static const int8_t pb_12345678[8] ) = {1,2,3,4,5,6,7,8};
 ALIGNED_8( static const int8_t pb_m87654321[8] ) = {-8,-7,-6,-5,-4,-3,-2,-1};
 ALIGNED_8( static const int8_t pb_m32101234[8] ) = {-3,-2,-1,0,1,2,3,4};
@@ -364,6 +365,7 @@ INTRA_SA8D_X3(ssse3)
 #else
 INTRA_SA8D_X3(mmxext)
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 
 /****************************************************************************
  * Exported functions:
@@ -372,6 +374,7 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
 {
     if( !(cpu&X264_CPU_MMX) )
         return;
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_16x16_V]       = x264_predict_16x16_v_mmx;
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
@@ -397,12 +400,14 @@ void x264_predict_16x16_init_mmx( int cpu, x264_predict_t pf[7] )
 #ifdef __GNUC__
     pf[I_PRED_16x16_P]       = x264_predict_16x16_p_ssse3;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }
 
 void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
 {
     if( !(cpu&X264_CPU_MMX) )
         return;
+#if !X264_HIGH_BIT_DEPTH
 #if ARCH_X86_64
     pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left;
 #endif
@@ -424,12 +429,14 @@ void x264_predict_8x8c_init_mmx( int cpu, x264_predict_t pf[7] )
 #ifdef __GNUC__
     pf[I_PRED_CHROMA_P]       = x264_predict_8x8c_p_ssse3;
 #endif
+#endif // !X264_HIGH_BIT_DEPTH
 }
 
 void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_8x8_filter )
 {
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_8x8_V]      = x264_predict_8x8_v_mmxext;
     pf[I_PRED_8x8_H]      = x264_predict_8x8_h_mmxext;
     pf[I_PRED_8x8_DC]     = x264_predict_8x8_dc_mmxext;
@@ -456,12 +463,14 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
     pf[I_PRED_8x8_HD]   = x264_predict_8x8_hd_ssse3;
     pf[I_PRED_8x8_HU]   = x264_predict_8x8_hu_ssse3;
     *predict_8x8_filter = x264_predict_8x8_filter_ssse3;
+#endif // !X264_HIGH_BIT_DEPTH
 }
 
 void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
 {
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
+#if !X264_HIGH_BIT_DEPTH
     pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_mmxext;
     pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_mmxext;
     pf[I_PRED_4x4_VL]  = x264_predict_4x4_vl_mmxext;
@@ -474,4 +483,5 @@ void x264_predict_4x4_init_mmx( int cpu, x264_predict_t pf[12] )
     pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_ssse3;
     pf[I_PRED_4x4_VR]  = x264_predict_4x4_vr_ssse3;
     pf[I_PRED_4x4_HD]  = x264_predict_4x4_hd_ssse3;
+#endif // !X264_HIGH_BIT_DEPTH
 }
diff --git a/configure b/configure
index 24d15adf..43fbe393 100755
--- a/configure
+++ b/configure
@@ -18,6 +18,7 @@ echo "  --enable-gprof           adds -pg, doesn't strip"
 echo "  --enable-visualize       enables visualization (X11 only)"
 echo "  --enable-pic             build position-independent code"
 echo "  --enable-shared          build libx264.so"
+echo "  --bit-depth=BIT_DEPTH    sets output bit depth (8-10), default 8"
 echo "  --extra-asflags=EASFLAGS add EASFLAGS to ASFLAGS"
 echo "  --extra-cflags=ECFLAGS   add ECFLAGS to CFLAGS"
 echo "  --extra-ldflags=ELDFLAGS add ELDFLAGS to LDFLAGS"
@@ -124,6 +125,7 @@ gprof="no"
 pic="no"
 vis="no"
 shared="no"
+bit_depth="8"
 
 CFLAGS="$CFLAGS -Wall -I."
 LDFLAGS="$LDFLAGS"
@@ -208,6 +210,14 @@ for opt do
             CFLAGS="$CFLAGS --sysroot=${opt#--sysroot=}"
             LDFLAGS="$LDFLAGS --sysroot=${opt#--sysroot=}"
             ;;
+        --bit-depth=*)
+            bit_depth="${opt#--bit-depth=}"
+            if ! [ "$bit_depth" -ge "8" -a "$bit_depth" -le "10" ] 2>/dev/null; then # a non-integer makes [ fail, which is rejected too
+                echo "Supplied bit depth must be in range [8,10]."
+                exit 1
+            fi
+            bit_depth=`expr $bit_depth + 0` # normalize the validated integer (e.g. strip leading zeros)
+            ;;
         *)
             echo "Unknown option $opt, ignored"
             ;;
@@ -644,6 +654,12 @@ if cc_check '' -Wshadow ; then
     CFLAGS="-Wshadow $CFLAGS"
 fi
 
+if [ "$bit_depth" -gt "8" ]; then
+    define X264_HIGH_BIT_DEPTH
+fi
+
+define BIT_DEPTH $bit_depth
+
 rm -f conftest*
 
 # generate config files
@@ -724,6 +740,7 @@ gprof:      $gprof
 PIC:        $pic
 shared:     $shared
 visualize:  $vis
+bit depth:  $bit_depth
 EOF
 
 echo >> config.log
diff --git a/encoder/analyse.c b/encoder/analyse.c
index 48a6f394..44543731 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -134,25 +134,27 @@ typedef struct
 } x264_mb_analysis_t;
 
 /* lambda = pow(2,qp/6-2) */
-const uint8_t x264_lambda_tab[52] = {
-   1, 1, 1, 1, 1, 1, 1, 1,  /*  0-7 */
-   1, 1, 1, 1,              /*  8-11 */
-   1, 1, 1, 1, 2, 2, 2, 2,  /* 12-19 */
-   3, 3, 3, 4, 4, 4, 5, 6,  /* 20-27 */
-   6, 7, 8, 9,10,11,13,14,  /* 28-35 */
-  16,18,20,23,25,29,32,36,  /* 36-43 */
-  40,45,51,57,64,72,81,91   /* 44-51 */
+const uint16_t x264_lambda_tab[QP_MAX_MAX+1] = {
+   1,   1,   1,   1,   1,   1,   1,   1, /*  0- 7 */
+   1,   1,   1,   1,   1,   1,   1,   1, /*  8-15 */
+   2,   2,   2,   2,   3,   3,   3,   4, /* 16-23 */
+   4,   4,   5,   6,   6,   7,   8,   9, /* 24-31 */
+  10,  11,  13,  14,  16,  18,  20,  23, /* 32-39 */
+  25,  29,  32,  36,  40,  45,  51,  57, /* 40-47 */
+  64,  72,  81,  91, 102, 114, 128, 144, /* 48-55 */
+ 161, 181, 203, 228, 256, 287, 323, 362, /* 56-63 */
 };
 
 /* lambda2 = pow(lambda,2) * .9 * 256 */
-const int x264_lambda2_tab[52] = {
-    14,      18,      22,      28,     36,     45,     57,     72, /*  0 -  7 */
-    91,     115,     145,     182,    230,    290,    365,    460, /*  8 - 15 */
-   580,     731,     921,    1161,   1462,   1843,   2322,   2925, /* 16 - 23 */
-  3686,    4644,    5851,    7372,   9289,  11703,  14745,  18578, /* 24 - 31 */
- 23407,   29491,   37156,   46814,  58982,  74313,  93628, 117964, /* 32 - 39 */
-148626,  187257,  235929,  297252, 374514, 471859, 594505, 749029, /* 40 - 47 */
-943718, 1189010, 1498059, 1887436                                  /* 48 - 51 */
+const int x264_lambda2_tab[QP_MAX_MAX+1] = {
+     14,     18,     22,      28,      36,      45,      57,      72, /*  0- 7 */
+     91,    115,    145,     182,     230,     290,     365,     460, /*  8-15 */
+    580,    731,    921,    1161,    1462,    1843,    2322,    2925, /* 16-23 */
+   3686,   4644,   5851,    7372,    9289,   11703,   14745,   18578, /* 24-31 */
+  23407,  29491,  37156,   46814,   58982,   74313,   93628,  117964, /* 32-39 */
+ 148626, 187257, 235929,  297252,  374514,  471859,  594505,  749029, /* 40-47 */
+ 943718,1189010,1498059, 1887436, 2378021, 2996119, 3774873, 4756042, /* 48-55 */
+5992238,7549747,9512085,11984476,15099494,19024170,23968953,30198988, /* 56-63 */
 };
 
 const uint8_t x264_exp2_lut[64] = {
@@ -188,27 +190,31 @@ const float x264_log2_lz_lut[32] = {
 
 // should the intra and inter lambdas be different?
 // I'm just matching the behaviour of deadzone quant.
-static const int x264_trellis_lambda2_tab[2][52] = {
+static const int x264_trellis_lambda2_tab[2][QP_MAX_MAX+1] = {
     // inter lambda = .85 * .85 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {    46,      58,      73,      92,     117,     147,
-        185,     233,     294,     370,     466,     587,
-        740,     932,    1174,    1480,    1864,    2349,
-       2959,    3728,    4697,    5918,    7457,    9395,
-      11837,   14914,   18790,   23674,   29828,   37581,
-      47349,   59656,   75163,   94699,  119313,  150326,
-     189399,  238627,  300652,  378798,  477255,  601304,
-     757596,  954511, 1202608, 1515192, 1909022, 2405217,
-    3030384, 3818045, 4810435, 6060769 },
+    {      46,      58,      73,      92,     117,     147,
+          185,     233,     294,     370,     466,     587,
+          740,     932,    1174,    1480,    1864,    2349,
+         2959,    3728,    4697,    5918,    7457,    9395,
+        11837,   14914,   18790,   23674,   29828,   37581,
+        47349,   59656,   75163,   94699,  119313,  150326,
+       189399,  238627,  300652,  378798,  477255,  601304,
+       757596,  954511, 1202608, 1515192, 1909022, 2405217,
+      3030384, 3818045, 4810435, 6060769, 7636091, 9620872,
+     12121539,15272182,19241743,24243077,30544363,38483486,
+     48486154,61088726,76966972,96972308 },
     // intra lambda = .65 * .65 * 2**(qp/3. + 10 - LAMBDA_BITS)
-    {    27,      34,      43,      54,      68,      86,
-        108,     136,     172,     216,     273,     343,
-        433,     545,     687,     865,    1090,    1374,
-       1731,    2180,    2747,    3461,    4361,    5494,
-       6922,    8721,   10988,   13844,   17442,   21976,
-      27688,   34885,   43953,   55377,   69771,   87906,
-     110755,  139543,  175813,  221511,  279087,  351627,
-     443023,  558174,  703255,  886046, 1116348, 1406511,
-    1772093, 2232697, 2813022, 3544186 }
+    {      27,      34,      43,      54,      68,      86,
+          108,     136,     172,     216,     273,     343,
+          433,     545,     687,     865,    1090,    1374,
+         1731,    2180,    2747,    3461,    4361,    5494,
+         6922,    8721,   10988,   13844,   17442,   21976,
+        27688,   34885,   43953,   55377,   69771,   87906,
+       110755,  139543,  175813,  221511,  279087,  351627,
+       443023,  558174,  703255,  886046, 1116348, 1406511,
+      1772093, 2232697, 2813022, 3544186, 4465396, 5626046,
+      7088374, 8930791,11252092,14176748,17861583,22504184,
+     28353495,35723165,45008368,56706990 }
 };
 
 static const uint16_t x264_chroma_lambda2_offset_tab[] = {
@@ -237,7 +243,7 @@ static const uint8_t i_sub_mb_p_cost_table[4] = {
 
 static void x264_analyse_update_cache( x264_t *h, x264_mb_analysis_t *a );
 
-static uint16_t x264_cost_ref[92][3][33];
+static uint16_t x264_cost_ref[LAMBDA_MAX+1][3][33];
 static UNUSED x264_pthread_mutex_t cost_ref_mutex = X264_PTHREAD_MUTEX_INITIALIZER;
 
 int x264_analyse_init_costs( x264_t *h, int qp )
@@ -275,7 +281,7 @@ fail:
 
 void x264_analyse_free_costs( x264_t *h )
 {
-    for( int i = 0; i < 92; i++ )
+    for( int i = 0; i < LAMBDA_MAX+1; i++ )
     {
         if( h->cost_mv[i] )
             x264_free( h->cost_mv[i] - 2*4*2048 );
diff --git a/encoder/cabac.c b/encoder/cabac.c
index b99a32d1..be18f534 100644
--- a/encoder/cabac.c
+++ b/encoder/cabac.c
@@ -262,9 +262,9 @@ static void x264_cabac_mb_qp_delta( x264_t *h, x264_cabac_t *cb )
     if( i_dqp != 0 )
     {
         int val = i_dqp <= 0 ? (-2*i_dqp) : (2*i_dqp - 1);
-        /* dqp is interpreted modulo 52 */
-        if( val >= 51 && val != 52 )
-            val = 103 - val;
+        /* dqp is interpreted modulo (QP_MAX+1) */
+        if( val >= QP_MAX && val != QP_MAX+1 )
+            val = 2*QP_MAX+1 - val;
         do
         {
             x264_cabac_encode_decision( cb, 60 + ctx, 1 );
@@ -767,15 +767,18 @@ void x264_macroblock_write_cabac( x264_t *h, x264_cabac_t *cb )
         i_mb_pos_tex = x264_cabac_pos( cb );
         h->stat.frame.i_mv_bits += i_mb_pos_tex - i_mb_pos_start;
 
-        memcpy( cb->p, h->mb.pic.p_fenc[0], 256 );
-        cb->p += 256;
-        for( int i = 0; i < 8; i++ )
-            memcpy( cb->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
-        cb->p += 64;
-        for( int i = 0; i < 8; i++ )
-            memcpy( cb->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
-        cb->p += 64;
+        bs_t s;
+        bs_init( &s, cb->p, cb->p_end - cb->p );
 
+        for( int i = 0; i < 256; i++ ) /* plane 0: 256 contiguous samples, BIT_DEPTH bits each */
+            bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[0][i] );
+        for( int ch = 1; ch < 3; ch++ ) /* the two 8x8 planes are p_fenc[1] and p_fenc[2]; p_fenc[0] was written above */
+            for( int i = 0; i < 8; i++ )
+                for( int j = 0; j < 8; j++ )
+                    bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
+
+        bs_flush( &s );
+        cb->p = s.p;
         x264_cabac_encode_init_core( cb );
 
         h->stat.frame.i_tex_bits += x264_cabac_pos( cb ) - i_mb_pos_tex;
diff --git a/encoder/cavlc.c b/encoder/cavlc.c
index b2544652..0b58ada6 100644
--- a/encoder/cavlc.c
+++ b/encoder/cavlc.c
@@ -66,7 +66,7 @@ static inline int block_residual_write_cavlc_escape( x264_t *h, int i_suffix_len
     bs_t *s = &h->out.bs;
     static const uint16_t next_suffix[7] = { 0, 3, 6, 12, 24, 48, 0xffff };
     int i_level_prefix = 15;
-    int mask = level >> 15;
+    int mask = level >> 31;
     int abs_level = (level^mask)-mask;
     int i_level_code = abs_level*2-mask-2;
     if( ( i_level_code >> i_suffix_length ) < 15 )
@@ -219,10 +219,10 @@ static void cavlc_qp_delta( x264_t *h )
 
     if( i_dqp )
     {
-        if( i_dqp < -26 )
-            i_dqp += 52;
-        else if( i_dqp > 25 )
-            i_dqp -= 52;
+        if( i_dqp < -(QP_MAX+1)/2 )
+            i_dqp += QP_MAX+1;
+        else if( i_dqp > QP_MAX/2 )
+            i_dqp -= QP_MAX+1;
     }
     bs_write_se( s, i_dqp );
 }
@@ -309,14 +309,12 @@ void x264_macroblock_write_cavlc( x264_t *h )
 
         bs_align_0( s );
 
-        memcpy( s->p, h->mb.pic.p_fenc[0], 256 );
-        s->p += 256;
-        for( int i = 0; i < 8; i++ )
-            memcpy( s->p + i*8, h->mb.pic.p_fenc[1] + i*FENC_STRIDE, 8 );
-        s->p += 64;
-        for( int i = 0; i < 8; i++ )
-            memcpy( s->p + i*8, h->mb.pic.p_fenc[2] + i*FENC_STRIDE, 8 );
-        s->p += 64;
+        for( int i = 0; i < 256; i++ ) /* plane 0: 256 contiguous samples, BIT_DEPTH bits each */
+            bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[0][i] );
+        for( int ch = 1; ch < 3; ch++ ) /* the two 8x8 planes are p_fenc[1] and p_fenc[2]; p_fenc[0] was written above */
+            for( int i = 0; i < 8; i++ )
+                for( int j = 0; j < 8; j++ )
+                    bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
 
         bs_init( s, s->p, s->p_end - s->p );
         s->p_start = p_start;
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 6a2aacb7..a2369bd5 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -51,7 +51,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
  ****************************************************************************/
 static float x264_psnr( int64_t i_sqe, int64_t i_size )
 {
-    double f_mse = (double)i_sqe / ((double)65025.0 * (double)i_size);
+    double f_mse = (double)i_sqe / (PIXEL_MAX*PIXEL_MAX * (double)i_size);
     if( f_mse <= 0.0000000001 ) /* Max 100dB */
         return 100;
 
@@ -68,11 +68,13 @@ static void x264_frame_dump( x264_t *h )
     FILE *f = fopen( h->param.psz_dump_yuv, "r+b" );
     if( !f )
         return;
+    int bytes_per_pixel = (BIT_DEPTH+7)/8;
     /* Write the frame in display order */
-    fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2, SEEK_SET );
+    fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2 * bytes_per_pixel, SEEK_SET );
     for( int i = 0; i < h->fdec->i_plane; i++ )
         for( int y = 0; y < h->param.i_height >> !!i; y++ )
-            fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]], 1, h->param.i_width >> !!i, f );
+            for( int j = 0; j < h->param.i_width >> !!i; j++ )
+                fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]]+j, bytes_per_pixel, 1, f );
     fclose( f );
 }
 
@@ -469,8 +471,8 @@ static int x264_validate_parameters( x264_t *h )
         x264_log( h, X264_LOG_ERROR, "no ratecontrol method specified\n" );
         return -1;
     }
-    h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, 0, 51 );
-    h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, 51 );
+    h->param.rc.f_rf_constant = x264_clip3f( h->param.rc.f_rf_constant, 0, QP_MAX );
+    h->param.rc.i_qp_constant = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX );
     if( h->param.rc.i_rc_method == X264_RC_CRF )
     {
         h->param.rc.i_qp_constant = h->param.rc.f_rf_constant;
@@ -502,12 +504,12 @@ static int x264_validate_parameters( x264_t *h )
         float qp_p = h->param.rc.i_qp_constant;
         float qp_i = qp_p - 6*log2f( h->param.rc.f_ip_factor );
         float qp_b = qp_p + 6*log2f( h->param.rc.f_pb_factor );
-        h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, 51 );
-        h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, 51 );
+        h->param.rc.i_qp_min = x264_clip3( (int)(X264_MIN3( qp_p, qp_i, qp_b )), 0, QP_MAX );
+        h->param.rc.i_qp_max = x264_clip3( (int)(X264_MAX3( qp_p, qp_i, qp_b ) + .999), 0, QP_MAX );
         h->param.rc.i_aq_mode = 0;
         h->param.rc.b_mb_tree = 0;
     }
-    h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, 51 );
+    h->param.rc.i_qp_max = x264_clip3( h->param.rc.i_qp_max, 0, QP_MAX );
     h->param.rc.i_qp_min = x264_clip3( h->param.rc.i_qp_min, 0, h->param.rc.i_qp_max );
     if( h->param.rc.i_vbv_buffer_size )
     {
@@ -1054,8 +1056,9 @@ x264_t *x264_encoder_open( x264_param_t *param )
     if( x264_analyse_init_costs( h, X264_LOOKAHEAD_QP ) )
         goto fail;
 
+    static const uint16_t cost_mv_correct[7] = { 24, 47, 95, 189, 379, 757, 1515 };
     /* Checks for known miscompilation issues. */
-    if( h->cost_mv[1][2013] != 24 )
+    if( h->cost_mv[x264_lambda_tab[X264_LOOKAHEAD_QP]][2013] != cost_mv_correct[BIT_DEPTH-8] )
     {
         x264_log( h, X264_LOG_ERROR, "MV cost test failed: x264 has been miscompiled!\n" );
         goto fail;
@@ -1147,11 +1150,22 @@ x264_t *x264_encoder_open( x264_param_t *param )
         fclose( f );
     }
 
-    x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n",
-        h->sps->i_profile_idc == PROFILE_BASELINE ? "Baseline" :
-        h->sps->i_profile_idc == PROFILE_MAIN ? "Main" :
-        h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
-        "High 4:4:4 Predictive", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
+    const char *profile = h->sps->i_profile_idc == PROFILE_BASELINE ? "Baseline" :
+                          h->sps->i_profile_idc == PROFILE_MAIN ? "Main" :
+                          h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
+                          h->sps->i_profile_idc == PROFILE_HIGH10 ? "High 10" :
+                          "High 4:4:4 Predictive";
+
+    if( h->sps->i_profile_idc < PROFILE_HIGH10 )
+    {
+        x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d\n",
+            profile, h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
+    }
+    else
+    {
+        x264_log( h, X264_LOG_INFO, "profile %s, level %d.%d, bit depth %d\n",
+            profile, h->sps->i_level_idc/10, h->sps->i_level_idc%10, BIT_DEPTH );
+    }
 
     return h;
 fail:
@@ -1836,7 +1850,7 @@ static int x264_slice_write( x264_t *h )
         bs_align_1( &h->out.bs );
 
         /* init cabac */
-        x264_cabac_context_init( &h->cabac, h->sh.i_type, h->sh.i_qp, h->sh.i_cabac_init_idc );
+        x264_cabac_context_init( &h->cabac, h->sh.i_type, x264_clip3( h->sh.i_qp-QP_BD_OFFSET, 0, 51 ), h->sh.i_cabac_init_idc );
         x264_cabac_encode_init ( &h->cabac, h->out.bs.p, h->out.bs.p_end );
     }
     h->mb.i_last_qp = h->sh.i_qp;
@@ -2705,6 +2719,7 @@ static int x264_encoder_frame_end( x264_t *h, x264_t *thread_current,
     for( int i = 0; i < 3; i++ )
     {
         pic_out->img.i_stride[i] = h->fdec->i_stride[i];
+        // FIXME This breaks the API when pixel != uint8_t.
         pic_out->img.plane[i] = h->fdec->plane[i];
     }
 
diff --git a/encoder/macroblock.h b/encoder/macroblock.h
index b1b02fa5..7c833448 100644
--- a/encoder/macroblock.h
+++ b/encoder/macroblock.h
@@ -26,8 +26,8 @@
 
 #include "common/macroblock.h"
 
-extern const int x264_lambda2_tab[52];
-extern const uint8_t x264_lambda_tab[52];
+extern const int x264_lambda2_tab[QP_MAX_MAX+1];
+extern const uint16_t x264_lambda_tab[QP_MAX_MAX+1];
 
 void x264_rdo_init( void );
 
diff --git a/encoder/me.h b/encoder/me.h
index 912b05d1..b125f3d0 100644
--- a/encoder/me.h
+++ b/encoder/me.h
@@ -68,7 +68,7 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei
 void x264_me_refine_bidir_satd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight );
 uint64_t x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel );
 
-extern uint16_t *x264_cost_mv_fpel[92][4];
+extern uint16_t *x264_cost_mv_fpel[LAMBDA_MAX+1][4];
 
 #define COPY1_IF_LT(x,y)\
 if((y)<(x))\
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 7f5ba962..a2c58252 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -219,7 +219,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
     uint32_t ssd = res >> 32;
     frame->i_pixel_sum[i] += sum;
     frame->i_pixel_ssd[i] += ssd;
-    return ssd - (sum * sum >> shift);
+    return ssd - ((uint64_t)sum * sum >> shift);
 }
 
 // Find the total AC energy of the block in all planes.
@@ -287,6 +287,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
     {
         if( h->param.rc.i_aq_mode == X264_AQ_AUTOVARIANCE )
         {
+            float bit_depth_correction = powf(1 << (BIT_DEPTH-8), 0.5f);
             float avg_adj_pow2 = 0.f;
             for( int mb_y = 0; mb_y < h->mb.i_mb_height; mb_y++ )
                 for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x++ )
@@ -299,8 +300,8 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
                 }
             avg_adj /= h->mb.i_mb_count;
             avg_adj_pow2 /= h->mb.i_mb_count;
-            strength = h->param.rc.f_aq_strength * avg_adj;
-            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - 14.f) / avg_adj;
+            strength = h->param.rc.f_aq_strength * avg_adj / bit_depth_correction;
+            avg_adj = avg_adj - 0.5f * (avg_adj_pow2 - (14.f * bit_depth_correction)) / avg_adj;
         }
         else
             strength = h->param.rc.f_aq_strength * 1.0397f;
@@ -318,7 +319,7 @@ void x264_adaptive_quant_frame( x264_t *h, x264_frame_t *frame, float *quant_off
                 else
                 {
                     uint32_t energy = x264_ac_energy_mb( h, mb_x, mb_y, frame );
-                    qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - 14.427f);
+                    qp_adj = strength * (x264_log2( X264_MAX(energy, 1) ) - (14.427f + 2*(BIT_DEPTH-8)));
                 }
                 if( quant_offsets )
                     qp_adj += quant_offsets[mb_xy];
@@ -620,8 +621,8 @@ int x264_ratecontrol_new( x264_t *h )
     rc->ip_offset = 6.0 * log2f( h->param.rc.f_ip_factor );
     rc->pb_offset = 6.0 * log2f( h->param.rc.f_pb_factor );
     rc->qp_constant[SLICE_TYPE_P] = h->param.rc.i_qp_constant;
-    rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, 51 );
-    rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, 51 );
+    rc->qp_constant[SLICE_TYPE_I] = x264_clip3( h->param.rc.i_qp_constant - rc->ip_offset + 0.5, 0, QP_MAX );
+    rc->qp_constant[SLICE_TYPE_B] = x264_clip3( h->param.rc.i_qp_constant + rc->pb_offset + 0.5, 0, QP_MAX );
     h->mb.ip_offset = rc->ip_offset + 0.5;
 
     rc->lstep = pow( 2, h->param.rc.i_qp_step / 6.0 );
@@ -1180,18 +1181,24 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
         if( l->level_idc == 41 && h->param.i_nal_hrd )
             mincr = 4;
 
-        /* The spec has a bizarre special case for the first frame. */
-        if( h->i_frame == 0 )
-        {
-            //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR
-            double fr = 1. / 172;
-            int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height;
-            rc->frame_size_maximum = 384 * 8 * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr;
-        }
+        /* High 10 doesn't require minCR, so just set the maximum to a large value. */
+        if( h->sps->i_profile_idc == PROFILE_HIGH10 )
+            rc->frame_size_maximum = 1e9;
         else
         {
-            //384 * MaxMBPS * ( tr( n ) - tr( n - 1 ) ) / MinCR
-            rc->frame_size_maximum = 384 * 8 * ((double)h->fenc->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale) * l->mbps / mincr;
+            /* The spec has a bizarre special case for the first frame. */
+            if( h->i_frame == 0 )
+            {
+                //384 * ( Max( PicSizeInMbs, fR * MaxMBPS ) + MaxMBPS * ( tr( 0 ) - tr,n( 0 ) ) ) / MinCR
+                double fr = 1. / 172;
+                int pic_size_in_mbs = h->mb.i_mb_width * h->mb.i_mb_height;
+                rc->frame_size_maximum = 384 * BIT_DEPTH * X264_MAX( pic_size_in_mbs, fr*l->mbps ) / mincr;
+            }
+            else
+            {
+                //384 * MaxMBPS * ( tr( n ) - tr( n - 1 ) ) / MinCR
+                rc->frame_size_maximum = 384 * BIT_DEPTH * ((double)h->fenc->i_cpb_duration * h->sps->vui.i_num_units_in_tick / h->sps->vui.i_time_scale) * l->mbps / mincr;
+            }
         }
     }
 
@@ -1231,7 +1238,7 @@ void x264_ratecontrol_start( x264_t *h, int i_force_qp, int overhead )
 
     rc->qpa_rc =
     rc->qpa_aq = 0;
-    rc->qp = x264_clip3( (int)(q + 0.5), 0, 51 );
+    rc->qp = x264_clip3( (int)(q + 0.5), 0, QP_MAX );
     h->fdec->f_qp_avg_rc =
     h->fdec->f_qp_avg_aq =
     rc->qpm = q;
@@ -1416,9 +1423,9 @@ int x264_ratecontrol_slice_type( x264_t *h, int frame_num )
              * So just calculate the average QP used so far. */
             h->param.rc.i_qp_constant = (h->stat.i_frame_count[SLICE_TYPE_P] == 0) ? 24
                                       : 1 + h->stat.f_frame_qp[SLICE_TYPE_P] / h->stat.i_frame_count[SLICE_TYPE_P];
-            rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, 51 );
-            rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, 51 );
-            rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, 51 );
+            rc->qp_constant[SLICE_TYPE_P] = x264_clip3( h->param.rc.i_qp_constant, 0, QP_MAX );
+            rc->qp_constant[SLICE_TYPE_I] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) / fabs( h->param.rc.f_ip_factor )) + 0.5 ), 0, QP_MAX );
+            rc->qp_constant[SLICE_TYPE_B] = x264_clip3( (int)( qscale2qp( qp2qscale( h->param.rc.i_qp_constant ) * fabs( h->param.rc.f_pb_factor )) + 0.5 ), 0, QP_MAX );
 
             x264_log(h, X264_LOG_ERROR, "2nd pass has more frames than 1st pass (%d)\n", rc->num_entries);
             x264_log(h, X264_LOG_ERROR, "continuing anyway, at constant QP=%d\n", h->param.rc.i_qp_constant);
@@ -2652,7 +2659,7 @@ static int init_pass2( x264_t *h )
         }
         else if( expected_bits > all_available_bits && avgq > h->param.rc.i_qp_max - 2 )
         {
-            if( h->param.rc.i_qp_max < 51 )
+            if( h->param.rc.i_qp_max < QP_MAX )
                 x264_log( h, X264_LOG_WARNING, "try increasing target bitrate or increasing qp_max (currently %d)\n", h->param.rc.i_qp_max );
             else
                 x264_log( h, X264_LOG_WARNING, "try increasing target bitrate\n");
diff --git a/encoder/rdo.c b/encoder/rdo.c
index 5dddd03f..863add79 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -443,10 +443,7 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct,
         /* We only need to zero an empty 4x4 block. 8x8 can be
            implicitly emptied via zero nnz, as can dc. */
         if( i_coefs == 16 && !dc )
-        {
-            M128( &dct[0] ) = M128_ZERO;
-            M128( &dct[8] ) = M128_ZERO;
-        }
+            memset( dct, 0, 16 * sizeof(dctcoef) );
         return 0;
     }
 
@@ -613,10 +610,7 @@ static ALWAYS_INLINE int quant_trellis_cabac( x264_t *h, dctcoef *dct,
     if( bnode == &nodes_cur[0] )
     {
         if( i_coefs == 16 && !dc )
-        {
-            M128( &dct[0] ) = M128_ZERO;
-            M128( &dct[8] ) = M128_ZERO;
-        }
+            memset( dct, 0, 16 * sizeof(dctcoef) );
         return 0;
     }
 
diff --git a/encoder/set.c b/encoder/set.c
index 9e6e736b..a520b8a6 100644
--- a/encoder/set.c
+++ b/encoder/set.c
@@ -104,6 +104,8 @@ void x264_sps_init( x264_sps_t *sps, int i_id, x264_param_t *param )
     sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
     if( sps->b_qpprime_y_zero_transform_bypass )
         sps->i_profile_idc  = PROFILE_HIGH444_PREDICTIVE;
+    else if( BIT_DEPTH > 8 )
+        sps->i_profile_idc  = PROFILE_HIGH10;
     else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
         sps->i_profile_idc  = PROFILE_HIGH;
     else if( param->b_cabac || param->i_bframe > 0 || param->b_interlaced || param->b_fake_interlaced || param->analyse.i_weighted_pred > 0 )
@@ -260,8 +262,8 @@ void x264_sps_write( bs_t *s, x264_sps_t *sps )
     if( sps->i_profile_idc >= PROFILE_HIGH )
     {
         bs_write_ue( s, 1 ); // chroma_format_idc = 4:2:0
-        bs_write_ue( s, 0 ); // bit_depth_luma_minus8
-        bs_write_ue( s, 0 ); // bit_depth_chroma_minus8
+        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_luma_minus8
+        bs_write_ue( s, BIT_DEPTH-8 ); // bit_depth_chroma_minus8
         bs_write( s, 1, sps->b_qpprime_y_zero_transform_bypass );
         bs_write( s, 1, 0 ); // seq_scaling_matrix_present_flag
     }
@@ -488,7 +490,7 @@ void x264_pps_write( bs_t *s, x264_pps_t *pps )
     bs_write( s, 1, pps->b_weighted_pred );
     bs_write( s, 2, pps->b_weighted_bipred );
 
-    bs_write_se( s, pps->i_pic_init_qp - 26 );
+    bs_write_se( s, pps->i_pic_init_qp - 26 - QP_BD_OFFSET );
     bs_write_se( s, pps->i_pic_init_qs - 26 );
     bs_write_se( s, pps->i_chroma_qp_index_offset );
 
@@ -668,7 +670,8 @@ int x264_validate_levels( x264_t *h, int verbose )
     int ret = 0;
     int mbs = h->sps->i_mb_width * h->sps->i_mb_height;
     int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering;
-    int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;
+    int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 :
+                     h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;
 
     const x264_level_t *l = x264_levels;
     while( l->level_idc != 0 && l->level_idc != h->param.i_level_idc )
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 84a82de1..c7a891da 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -303,7 +303,7 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
                                   (mv1)[0], (mv1)[1], 8, 8, w ); \
             h->mc.avg[PIXEL_8x8]( pix1, 16, src1, stride1, src2, stride2, i_bipred_weight ); \
         } \
-        i_cost = penalty + h->pixf.mbcmp[PIXEL_8x8]( \
+        i_cost = penalty * a->i_lambda + h->pixf.mbcmp[PIXEL_8x8]( \
                            m[0].p_fenc[0], FENC_STRIDE, pix1, 16 ); \
         COPY2_IF_LT( i_bcost, i_cost, list_used, 3 ); \
     }
@@ -393,9 +393,9 @@ static void x264_slicetype_mb_cost( x264_t *h, x264_mb_analysis_t *a,
             }
 
             x264_me_search( h, &m[l], mvc, i_mvc );
-            m[l].cost -= 2; // remove mvcost from skip mbs
+            m[l].cost -= 2 * a->i_lambda; // remove mvcost from skip mbs
             if( M32( m[l].mv ) )
-                m[l].cost += 5;
+                m[l].cost += 5 * a->i_lambda;
 
 skip_motionest:
             CP32( fenc_mvs[l], m[l].mv );
@@ -418,7 +418,7 @@ lowres_intra_mb:
         ALIGNED_ARRAY_16( pixel, edge,[33] );
         pixel *pix = &pix1[8+FDEC_STRIDE - 1];
         pixel *src = &fenc->lowres[0][i_pel_offset - 1];
-        const int intra_penalty = 5;
+        const int intra_penalty = 5 * a->i_lambda;
         int satds[3];
 
         memcpy( pix-FDEC_STRIDE, src-i_stride, 17 * sizeof(pixel) );
@@ -496,7 +496,7 @@ lowres_intra_mb:
         }
     }
 
-    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = i_bcost + (list_used << LOWRES_COST_SHIFT);
+    fenc->lowres_costs[b-p0][p1-b][i_mb_xy] = X264_MIN( i_bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
 }
 #undef TRY_BIDIR
 
diff --git a/tools/checkasm.c b/tools/checkasm.c
index ddbf8bfd..6a6aeec2 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -40,8 +40,10 @@
 uint8_t *buf1, *buf2;
 /* buf3, buf4: used to store output */
 uint8_t *buf3, *buf4;
-/* pbuf*: point to the same memory as above, just for type convenience */
-pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4;
+/* pbuf1, pbuf2: initialised to random pixel data; tests shouldn't write into them. */
+pixel *pbuf1, *pbuf2;
+/* pbuf3, pbuf4: point to buf3, buf4, just for type convenience */
+pixel *pbuf3, *pbuf4;
 
 int quiet = 0;
 
@@ -256,11 +258,15 @@ static int check_pixel( int cpu_ref, int cpu_new )
         int z = i|(i>>4);
         z ^= z>>2;
         z ^= z>>1;
-        buf3[i] = ~(buf4[i] = -(z&1));
+        pbuf4[i] = -(z&1) & PIXEL_MAX;
+        pbuf3[i] = ~pbuf4[i] & PIXEL_MAX;
     }
     // random pattern made of maxed pixel differences, in case an intermediate value overflows
     for( int i = 256; i < 0x1000; i++ )
-        buf3[i] = ~(buf4[i] = -(buf1[i&~0x88]&1));
+    {
+        pbuf4[i] = -(pbuf1[i&~0x88]&1) & PIXEL_MAX;
+        pbuf3[i] = ~(pbuf4[i]) & PIXEL_MAX;
+    }
 
 #define TEST_PIXEL( name, align ) \
     ok = 1, used_asm = 0; \
@@ -535,22 +541,22 @@ static int check_dct( int cpu_ref, int cpu_new )
         used_asm = 1; \
         call_c( dct_c.name, t1, pbuf1, pbuf2 ); \
         call_a( dct_asm.name, t2, pbuf1, pbuf2 ); \
-        if( memcmp( t1, t2, size ) ) \
+        if( memcmp( t1, t2, size*sizeof(dctcoef) ) ) \
         { \
             ok = 0; \
             fprintf( stderr, #name " [FAILED]\n" ); \
         } \
     }
     ok = 1; used_asm = 0;
-    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16*2 );
-    TEST_DCT( sub8x8_dct, dct1, dct2, 16*2*4 );
-    TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4*2 );
-    TEST_DCT( sub16x16_dct, dct1, dct2, 16*2*16 );
+    TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 );
+    TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 );
+    TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 );
+    TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 );
     report( "sub_dct4 :" );
 
     ok = 1; used_asm = 0;
-    TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64*2 );
-    TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*2*4 );
+    TEST_DCT( sub8x8_dct8, (void*)dct1[0], (void*)dct2[0], 64 );
+    TEST_DCT( sub16x16_dct8, (void*)dct1, (void*)dct2, 64*4 );
     report( "sub_dct8 :" );
 #undef TEST_DCT
 
@@ -574,13 +580,13 @@ static int check_dct( int cpu_ref, int cpu_new )
     { \
         set_func_name( #name ); \
         used_asm = 1; \
-        memcpy( buf3, buf1, 32*32 * sizeof(pixel) ); \
-        memcpy( buf4, buf1, 32*32 * sizeof(pixel) ); \
-        memcpy( dct1, src, 512 * sizeof(pixel) ); \
-        memcpy( dct2, src, 512 * sizeof(pixel) ); \
+        memcpy( pbuf3, pbuf1, 32*32 * sizeof(pixel) ); \
+        memcpy( pbuf4, pbuf1, 32*32 * sizeof(pixel) ); \
+        memcpy( dct1, src, 256 * sizeof(dctcoef) ); \
+        memcpy( dct2, src, 256 * sizeof(dctcoef) ); \
         call_c1( dct_c.name, pbuf3, (void*)dct1 ); \
         call_a1( dct_asm.name, pbuf4, (void*)dct2 ); \
-        if( memcmp( buf3, buf4, 32*32 * sizeof(pixel) ) ) \
+        if( memcmp( pbuf3, pbuf4, 32*32 * sizeof(pixel) ) ) \
         { \
             ok = 0; \
             fprintf( stderr, #name " [FAILED]\n" ); \
@@ -615,10 +621,10 @@ static int check_dct( int cpu_ref, int cpu_new )
                 dct1[0][j] = !i ? (j^j>>1^j>>2^j>>3)&1 ? 4080 : -4080 /* max dc */\
                            : i<8 ? (*p++)&1 ? 4080 : -4080 /* max elements */\
                            : ((*p++)&0x1fff)-0x1000; /* general case */\
-            memcpy( dct2, dct1, 32 );\
+            memcpy( dct2, dct1, 16 * sizeof(dctcoef) );\
             call_c1( dct_c.name, dct1[0] );\
             call_a1( dct_asm.name, dct2[0] );\
-            if( memcmp( dct1, dct2, 32 ) )\
+            if( memcmp( dct1, dct2, 16 * sizeof(dctcoef) ) )\
                 ok = 0;\
         }\
         call_c2( dct_c.name, dct1[0] );\
@@ -658,11 +664,11 @@ static int check_dct( int cpu_ref, int cpu_new )
         int nz_a, nz_c; \
         set_func_name( "zigzag_"#name"_%s", interlace?"field":"frame" ); \
         used_asm = 1; \
-        memcpy( buf3, buf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
-        memcpy( buf4, buf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
+        memcpy( pbuf3, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
+        memcpy( pbuf4, pbuf1, 16*FDEC_STRIDE * sizeof(pixel) ); \
         nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3 ); \
         nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4 ); \
-        if( memcmp( t1, t2, size*sizeof(dctcoef) )|| memcmp( buf3, buf4, 16*FDEC_STRIDE ) || nz_c != nz_a ) \
+        if( memcmp( t1, t2, size*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE*sizeof(pixel) ) || nz_c != nz_a ) \
         { \
             ok = 0; \
             fprintf( stderr, #name " [FAILED]\n" ); \
@@ -680,8 +686,8 @@ static int check_dct( int cpu_ref, int cpu_new )
         used_asm = 1; \
         for( int i = 0; i < 2; i++ ) \
         { \
-            memcpy( buf3, buf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
-            memcpy( buf4, buf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
+            memcpy( pbuf3, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
+            memcpy( pbuf4, pbuf2, 16*FDEC_STRIDE * sizeof(pixel) ); \
             for( int j = 0; j < 4; j++ ) \
             { \
                 memcpy( pbuf3 + j*FDEC_STRIDE, (i?pbuf1:pbuf2) + j*FENC_STRIDE, 4 * sizeof(pixel) ); \
@@ -689,7 +695,7 @@ static int check_dct( int cpu_ref, int cpu_new )
             } \
             nz_c = call_c1( zigzag_c.name, t1, pbuf2, pbuf3, &dc_c ); \
             nz_a = call_a1( zigzag_asm.name, t2, pbuf2, pbuf4, &dc_a ); \
-            if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( buf3, buf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
+            if( memcmp( t1+1, t2+1, 15*sizeof(dctcoef) ) || memcmp( pbuf3, pbuf4, 16*FDEC_STRIDE * sizeof(pixel) ) || nz_c != nz_a || dc_c != dc_a ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name " [FAILED]\n" ); \
@@ -779,11 +785,11 @@ static int check_mc( int cpu_ref, int cpu_new )
             const x264_weight_t *weight = weight_none; \
             set_func_name( "mc_luma_%dx%d", w, h ); \
             used_asm = 1; \
-            memset( buf3, 0xCD, 1024 ); \
-            memset( buf4, 0xCD, 1024 ); \
+            for( int i = 0; i < 1024; i++ ) \
+                pbuf3[i] = pbuf4[i] = 0xCD; \
             call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
             call_a( mc_a.mc_luma, dst2, 32, src2, 64, dx, dy, w, h, weight ); \
-            if( memcmp( buf3, buf4, 1024 ) ) \
+            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
             { \
                 fprintf( stderr, "mc_luma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
                 ok = 0; \
@@ -796,8 +802,8 @@ static int check_mc( int cpu_ref, int cpu_new )
             const x264_weight_t *weight = weight_none; \
             set_func_name( "get_ref_%dx%d", w, h ); \
             used_asm = 1; \
-            memset( buf3, 0xCD, 1024 ); \
-            memset( buf4, 0xCD, 1024 ); \
+            for( int i = 0; i < 1024; i++ ) \
+                pbuf3[i] = pbuf4[i] = 0xCD; \
             call_c( mc_c.mc_luma, dst1, 32, src2, 64, dx, dy, w, h, weight ); \
             ref = (pixel*)call_a( mc_a.get_ref, ref, &ref_stride, src2, 64, dx, dy, w, h, weight ); \
             for( int i = 0; i < h; i++ ) \
@@ -814,15 +820,15 @@ static int check_mc( int cpu_ref, int cpu_new )
         { \
             set_func_name( "mc_chroma_%dx%d", w, h ); \
             used_asm = 1; \
-            memset( buf3, 0xCD, 1024 ); \
-            memset( buf4, 0xCD, 1024 ); \
+            for( int i = 0; i < 1024; i++ ) \
+                pbuf3[i] = pbuf4[i] = 0xCD; \
             call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
             call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
             /* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
             for( int j = 0; j < h; j++ ) \
                 for( int i = w; i < 4; i++ ) \
                     dst2[i+j*16] = dst1[i+j*16]; \
-            if( memcmp( buf3, buf4, 1024 ) ) \
+            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
             { \
                 fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d]     [FAILED]\n", dx, dy, w, h ); \
                 ok = 0; \
@@ -867,15 +873,15 @@ static int check_mc( int cpu_ref, int cpu_new )
     ok = 1, used_asm = 0; \
     for( int i = 0; i < 10; i++ ) \
     { \
-        memcpy( buf3, pbuf1+320, 320 * sizeof(pixel) ); \
-        memcpy( buf4, pbuf1+320, 320 * sizeof(pixel) ); \
+        memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \
+        memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \
         if( mc_a.name[i] != mc_ref.name[i] ) \
         { \
             set_func_name( "%s_%s", #name, pixel_names[i] ); \
             used_asm = 1; \
             call_c1( mc_c.name[i], pbuf3, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
             call_a1( mc_a.name[i], pbuf4, 16, pbuf2+1, 16, pbuf1+18, 16, weight ); \
-            if( memcmp( buf3, buf4, 320 * sizeof(pixel) ) ) \
+            if( memcmp( pbuf3, pbuf4, 320 * sizeof(pixel) ) ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name "[%d]: [FAILED]\n", i ); \
@@ -971,8 +977,8 @@ static int check_mc( int cpu_ref, int cpu_new )
         void *tmp = pbuf3+49*64;
         set_func_name( "hpel_filter" );
         ok = 1; used_asm = 1;
-        memset( buf3, 0, 4096 * sizeof(pixel) );
-        memset( buf4, 0, 4096 * sizeof(pixel) );
+        memset( pbuf3, 0, 4096 * sizeof(pixel) );
+        memset( pbuf4, 0, 4096 * sizeof(pixel) );
         call_c( mc_c.hpel_filter, dstc[0], dstc[1], dstc[2], srchpel, 64, 48, 10, tmp );
         call_a( mc_a.hpel_filter, dsta[0], dsta[1], dsta[2], srchpel, 64, 48, 10, tmp );
         for( int i = 0; i < 3; i++ )
@@ -1030,13 +1036,13 @@ static int check_mc( int cpu_ref, int cpu_new )
         int stride = 80;\
         set_func_name( #name );\
         used_asm = 1;\
-        memcpy( buf3, buf1, size*2*stride * sizeof(pixel) );\
-        memcpy( buf4, buf1, size*2*stride * sizeof(pixel) );\
-        uint16_t *sum = (uint16_t*)buf3;\
+        memcpy( pbuf3, pbuf1, size*2*stride * sizeof(pixel) );\
+        memcpy( pbuf4, pbuf1, size*2*stride * sizeof(pixel) );\
+        uint16_t *sum = (uint16_t*)pbuf3;\
         call_c1( mc_c.name, __VA_ARGS__ );\
-        sum = (uint16_t*)buf4;\
+        sum = (uint16_t*)pbuf4;\
         call_a1( mc_a.name, __VA_ARGS__ );\
-        if( memcmp( buf3, buf4, (stride-8)*2 * sizeof(pixel) )\
+        if( memcmp( pbuf3, pbuf4, (stride-8)*2 * sizeof(pixel) )\
             || (size>9 && memcmp( pbuf3+18*stride, pbuf4+18*stride, (stride-8)*2 * sizeof(pixel) )))\
             ok = 0;\
         call_c2( mc_c.name, __VA_ARGS__ );\
@@ -1096,11 +1102,11 @@ static int check_deblock( int cpu_ref, int cpu_new )
     /* not exactly the real values of a,b,tc but close enough */
     for( int i = 35, a = 255, c = 250; i >= 0; i-- )
     {
-        alphas[i] = a;
-        betas[i] = (i+1)/2;
-        tcs[i][0] = tcs[i][3] = (c+6)/10;
-        tcs[i][1] = (c+7)/15;
-        tcs[i][2] = (c+9)/20;
+        alphas[i] = a << (BIT_DEPTH-8);
+        betas[i] = (i+1)/2 << (BIT_DEPTH-8);
+        tcs[i][0] = tcs[i][3] = (c+6)/10 << (BIT_DEPTH-8);
+        tcs[i][1] = (c+7)/15 << (BIT_DEPTH-8);
+        tcs[i][2] = (c+9)/20 << (BIT_DEPTH-8);
         a = a*9/10;
         c = c*9/10;
     }
@@ -1111,15 +1117,15 @@ static int check_deblock( int cpu_ref, int cpu_new )
         int off = 8*32 + (i&15)*4*!align; /* benchmark various alignments of h filter */ \
         for( int j = 0; j < 1024; j++ ) \
             /* two distributions of random to excersize different failure modes */ \
-            buf3[j] = rand() & (i&1 ? 0xf : 0xff ); \
-        memcpy( buf4, buf3, 1024 * sizeof(pixel) ); \
+            pbuf3[j] = rand() & (i&1 ? 0xf : PIXEL_MAX ); \
+        memcpy( pbuf4, pbuf3, 1024 * sizeof(pixel) ); \
         if( db_a.name != db_ref.name ) \
         { \
             set_func_name( #name ); \
             used_asm = 1; \
             call_c1( db_c.name, pbuf3+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
             call_a1( db_a.name, pbuf4+off, 32, alphas[i], betas[i], ##__VA_ARGS__ ); \
-            if( memcmp( buf3, buf4, 1024 * sizeof(pixel) ) ) \
+            if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
             { \
                 ok = 0; \
                 fprintf( stderr, #name "(a=%d, b=%d): [FAILED]\n", alphas[i], betas[i] ); \
@@ -1200,7 +1206,7 @@ static int check_quant( int cpu_ref, int cpu_new )
     h->pps = h->pps_array;
     x264_param_default( &h->param );
     h->chroma_qp_table = i_chroma_qp_table + 12;
-    h->param.rc.i_qp_min = 26;
+    h->param.rc.i_qp_min = 26 + QP_BD_OFFSET;
     h->param.analyse.b_transform_8x8 = 1;
 
     for( int i_cqm = 0; i_cqm < 4; i_cqm++ )
@@ -1219,9 +1225,10 @@ static int check_quant( int cpu_ref, int cpu_new )
         }
         else
         {
+            int max_scale = BIT_DEPTH < 10 ? 255 : 228;
             if( i_cqm == 2 )
                 for( int i = 0; i < 64; i++ )
-                    cqm_buf[i] = 10 + rand() % 246;
+                    cqm_buf[i] = 10 + rand() % (max_scale - 9);
             else
                 for( int i = 0; i < 64; i++ )
                     cqm_buf[i] = 1;
@@ -1260,7 +1267,7 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             set_func_name( #name ); \
             used_asms[0] = 1; \
-            for( int qp = 51; qp > 0; qp-- ) \
+            for( int qp = QP_MAX; qp > 0; qp-- ) \
             { \
                 for( int j = 0; j < 2; j++ ) \
                 { \
@@ -1269,7 +1276,7 @@ static int check_quant( int cpu_ref, int cpu_new )
                         dct1[i] = dct2[i] = j ? (rand() & 0x1fff) - 0xfff : 0; \
                     result_c = call_c1( qf_c.name, dct1, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
                     result_a = call_a1( qf_a.name, dct2, h->quant4_mf[CQM_4IY][qp][0], h->quant4_bias[CQM_4IY][qp][0] ); \
-                    if( memcmp( dct1, dct2, 16*2 ) || result_c != result_a ) \
+                    if( memcmp( dct1, dct2, 16*sizeof(dctcoef) ) || result_c != result_a ) \
                     { \
                         oks[0] = 0; \
                         fprintf( stderr, #name "(cqm=%d): [FAILED]\n", i_cqm ); \
@@ -1286,14 +1293,14 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             set_func_name( #qname ); \
             used_asms[0] = 1; \
-            for( int qp = 51; qp > 0; qp-- ) \
+            for( int qp = QP_MAX; qp > 0; qp-- ) \
             { \
                 for( int j = 0; j < 2; j++ ) \
                 { \
                     INIT_QUANT##w(j) \
                     int result_c = call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
                     int result_a = call_a1( qf_a.qname, dct2, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                    if( memcmp( dct1, dct2, w*w*2 ) || result_c != result_a ) \
+                    if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) || result_c != result_a ) \
                     { \
                         oks[0] = 0; \
                         fprintf( stderr, #qname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
@@ -1317,14 +1324,14 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
             used_asms[1] = 1; \
-            for( int qp = 51; qp > 0; qp-- ) \
+            for( int qp = QP_MAX; qp > 0; qp-- ) \
             { \
                 INIT_QUANT##w(1) \
                 call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
-                memcpy( dct2, dct1, w*w*2 ); \
+                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                 call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                 call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
-                if( memcmp( dct1, dct2, w*w*2 ) ) \
+                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
                 { \
                     oks[1] = 0; \
                     fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
@@ -1345,15 +1352,15 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             set_func_name( "%s_%s", #dqname, i_cqm?"cqm":"flat" ); \
             used_asms[1] = 1; \
-            for( int qp = 51; qp > 0; qp-- ) \
+            for( int qp = QP_MAX; qp > 0; qp-- ) \
             { \
                 for( int i = 0; i < 16; i++ ) \
                     dct1[i] = rand(); \
                 call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
-                memcpy( dct2, dct1, w*w*2 ); \
+                memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
                 call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
                 call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
-                if( memcmp( dct1, dct2, w*w*2 ) ) \
+                if( memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
                 { \
                     oks[1] = 0; \
                     fprintf( stderr, #dqname "(qp=%d, cqm=%d, block="#block"): [FAILED]\n", qp, i_cqm ); \
@@ -1381,12 +1388,12 @@ static int check_quant( int cpu_ref, int cpu_new )
         for( int size = 16; size <= 64; size += 48 )
         {
             set_func_name( "denoise_dct" );
-            memcpy( dct1, buf1, size*2 );
-            memcpy( dct2, buf1, size*2 );
+            memcpy( dct1, buf1, size*sizeof(dctcoef) );
+            memcpy( dct2, buf1, size*sizeof(dctcoef) );
             memcpy( buf3+256, buf3, 256 );
             call_c1( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
             call_a1( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
-            if( memcmp( dct1, dct2, size*2 ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
+            if( memcmp( dct1, dct2, size*sizeof(dctcoef) ) || memcmp( buf3+4, buf3+256+4, (size-1)*sizeof(uint32_t) ) )
                 ok = 0;
             call_c2( qf_c.denoise_dct, dct1, (uint32_t*)buf3, (uint16_t*)buf2, size );
             call_a2( qf_a.denoise_dct, dct2, (uint32_t*)(buf3+256), (uint16_t*)buf2, size );
@@ -1431,7 +1438,7 @@ static int check_quant( int cpu_ref, int cpu_new )
         { \
             int nnz = 0; \
             int max = rand() & (w*w-1); \
-            memset( dct1, 0, w*w*2 ); \
+            memset( dct1, 0, w*w*sizeof(dctcoef) ); \
             for( int idx = ac; idx < max; idx++ ) \
                 nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
             if( !nnz ) \
@@ -1464,7 +1471,7 @@ static int check_quant( int cpu_ref, int cpu_new )
             x264_run_level_t runlevel_c, runlevel_a; \
             int nnz = 0; \
             int max = rand() & (w*w-1); \
-            memset( dct1, 0, w*w*2 ); \
+            memset( dct1, 0, w*w*sizeof(dctcoef) ); \
             memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
             memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
             for( int idx = ac; idx < max; idx++ ) \
@@ -1474,7 +1481,7 @@ static int check_quant( int cpu_ref, int cpu_new )
             int result_c = call_c( qf_c.lastname, dct1+ac, &runlevel_c ); \
             int result_a = call_a( qf_a.lastname, dct1+ac, &runlevel_a ); \
             if( result_c != result_a || runlevel_c.last != runlevel_a.last || \
-                memcmp(runlevel_c.level, runlevel_a.level, sizeof(int16_t)*result_c) || \
+                memcmp(runlevel_c.level, runlevel_a.level, sizeof(dctcoef)*result_c) || \
                 memcmp(runlevel_c.run, runlevel_a.run, sizeof(uint8_t)*(result_c-1)) ) \
             { \
                 ok = 0; \
@@ -1529,11 +1536,11 @@ static int check_intra( int cpu_ref, int cpu_new )
     {\
         set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
         used_asm = 1;\
-        memcpy( buf3, buf1, 32*20 * sizeof(pixel) );\
-        memcpy( buf4, buf1, 32*20 * sizeof(pixel) );\
+        memcpy( pbuf3, pbuf1, 32*20 * sizeof(pixel) );\
+        memcpy( pbuf4, pbuf1, 32*20 * sizeof(pixel) );\
         call_c( ip_c.name[dir], pbuf3+48, ##__VA_ARGS__ );\
         call_a( ip_a.name[dir], pbuf4+48, ##__VA_ARGS__ );\
-        if( memcmp( buf3, buf4, 32*20 * sizeof(pixel) ) )\
+        if( memcmp( pbuf3, pbuf4, 32*20 * sizeof(pixel) ) )\
         {\
             fprintf( stderr, #name "[%d] :  [FAILED]\n", dir );\
             ok = 0;\
@@ -1544,7 +1551,7 @@ static int check_intra( int cpu_ref, int cpu_new )
             {\
                 printf( "%2x ", edge[14-j] );\
                 for( int k = 0; k < w; k++ )\
-                    printf( "%2x ", buf4[48+k+j*32] );\
+                    printf( "%2x ", pbuf4[48+k+j*32] );\
                 printf( "\n" );\
             }\
             printf( "\n" );\
@@ -1552,7 +1559,7 @@ static int check_intra( int cpu_ref, int cpu_new )
             {\
                 printf( "   " );\
                 for( int k = 0; k < w; k++ )\
-                    printf( "%2x ", buf3[48+k+j*32] );\
+                    printf( "%2x ", pbuf3[48+k+j*32] );\
                 printf( "\n" );\
             }\
         }\
@@ -1831,8 +1838,9 @@ int main(int argc, char *argv[])
     fprintf( stderr, "x264: using random seed %u\n", seed );
     srand( seed );
 
-    buf1 = x264_malloc( 0x3e00 + 16*BENCH_ALIGNS );
-    if( !buf1 )
+    buf1 = x264_malloc( 0x1e00 + 0x2000*sizeof(pixel) + 16*BENCH_ALIGNS );
+    pbuf1 = x264_malloc( 0x1e00*sizeof(pixel) + 16*BENCH_ALIGNS );
+    if( !buf1 || !pbuf1 )
     {
         fprintf( stderr, "malloc failed, unable to initiate tests!\n" );
         return -1;
@@ -1840,15 +1848,17 @@ int main(int argc, char *argv[])
 #define INIT_POINTER_OFFSETS\
     buf2 = buf1 + 0xf00;\
     buf3 = buf2 + 0xf00;\
-    buf4 = buf3 + 0x1000;\
-    pbuf1 = (pixel*)buf1;\
-    pbuf2 = (pixel*)buf2;\
+    buf4 = buf3 + 0x1000*sizeof(pixel);\
+    pbuf2 = pbuf1 + 0xf00;\
     pbuf3 = (pixel*)buf3;\
     pbuf4 = (pixel*)buf4;
     INIT_POINTER_OFFSETS;
     for( int i = 0; i < 0x1e00; i++ )
+    {
         buf1[i] = rand() & 0xFF;
-    memset( buf1+0x1e00, 0, 0x2000 );
+        pbuf1[i] = rand() & PIXEL_MAX;
+    }
+    memset( buf1+0x1e00, 0, 0x2000*sizeof(pixel) );
 
     /* 16-byte alignment is guaranteed whenever it's useful, but some functions also vary in speed depending on %64 */
     if( do_bench )
@@ -1857,6 +1867,7 @@ int main(int argc, char *argv[])
             INIT_POINTER_OFFSETS;
             ret |= x264_stack_pagealign( check_all_flags, i*16 );
             buf1 += 16;
+            pbuf1 += 16;
             quiet = 1;
             fprintf( stderr, "%d/%d\r", i+1, BENCH_ALIGNS );
         }
diff --git a/x264.c b/x264.c
index 0bede93b..e6d27d17 100644
--- a/x264.c
+++ b/x264.c
@@ -262,6 +262,7 @@ static void Help( x264_param_t *defaults, int longhelp )
         " .mkv -> Matroska\n"
         " .flv -> Flash Video\n"
         " .mp4 -> MP4 if compiled with GPAC support (%s)\n"
+        "Output bit depth: %d (configured at compile time)\n"
         "\n"
         "Options:\n"
         "\n"
@@ -286,10 +287,11 @@ static void Help( x264_param_t *defaults, int longhelp )
         "no",
 #endif
 #if HAVE_GPAC
-        "yes"
+        "yes",
 #else
-        "no"
+        "no",
 #endif
+        BIT_DEPTH
       );
     H0( "Example usage:\n" );
     H0( "\n" );
@@ -311,7 +313,7 @@ static void Help( x264_param_t *defaults, int longhelp )
     H0( "\n" );
     H0( "Presets:\n" );
     H0( "\n" );
-    H0( "      --profile               Force the limits of an H.264 profile [high]\n"
+    H0( "      --profile               Force the limits of an H.264 profile\n"
         "                                  Overrides all settings.\n" );
     H2( "                                  - baseline:\n"
         "                                    --no-8x8dct --bframes 0 --no-cabac\n"
@@ -322,8 +324,11 @@ static void Help( x264_param_t *defaults, int longhelp )
         "                                    --no-8x8dct --cqm flat\n"
         "                                    No lossless.\n"
         "                                  - high:\n"
-        "                                    No lossless.\n" );
-        else H0( "                                  - baseline,main,high\n" );
+        "                                    No lossless.\n"
+        "                                  - high10:\n"
+        "                                    No lossless.\n"
+        "                                    Support for bit depth 8-10.\n" );
+        else H0( "                                  - baseline,main,high,high10\n" );
     H0( "      --preset                Use a preset to select encoding settings [medium]\n"
         "                                  Overridden by user settings.\n" );
     H2( "                                  - ultrafast:\n"
@@ -453,9 +458,9 @@ static void Help( x264_param_t *defaults, int longhelp )
     H0( "\n" );
     H0( "Ratecontrol:\n" );
     H0( "\n" );
-    H1( "  -q, --qp <integer>          Force constant QP (0-51, 0=lossless)\n" );
+    H1( "  -q, --qp <integer>          Force constant QP (0-%d, 0=lossless)\n", QP_MAX );
     H0( "  -B, --bitrate <integer>     Set bitrate (kbit/s)\n" );
-    H0( "      --crf <float>           Quality-based VBR (0-51, 0=lossless) [%.1f]\n", defaults->rc.f_rf_constant );
+    H0( "      --crf <float>           Quality-based VBR (0-%d, 0=lossless) [%.1f]\n", QP_MAX, defaults->rc.f_rf_constant );
     H1( "      --rc-lookahead <integer> Number of frames for frametype lookahead [%d]\n", defaults->rc.i_lookahead );
     H0( "      --vbv-maxrate <integer> Max local bitrate (kbit/s) [%d]\n", defaults->rc.i_vbv_max_bitrate );
     H0( "      --vbv-bufsize <integer> Set size of the VBV buffer (kbit) [%d]\n", defaults->rc.i_vbv_buffer_size );
@@ -1040,6 +1045,7 @@ static int Parse( int argc, char **argv, x264_param_t *param, cli_opt_t *opt )
 #else
                 printf( "using a non-gcc compiler\n" );
 #endif
+                printf( "configuration: --bit-depth=%d\n", BIT_DEPTH );
                 exit(0);
             case OPT_FRAMES:
                 param->i_frame_total = X264_MAX( atoi( optarg ), 0 );
@@ -1318,7 +1324,7 @@ static void parse_qpfile( cli_opt_t *opt, x264_picture_t *pic, int i_frame )
         else if( type == 'B' ) pic->i_type = X264_TYPE_BREF;
         else if( type == 'b' ) pic->i_type = X264_TYPE_B;
         else ret = 0;
-        if( ret != 3 || qp < -1 || qp > 51 )
+        if( ret != 3 || qp < -1 || qp > QP_MAX )
         {
             x264_cli_log( "x264", X264_LOG_ERROR, "can't parse qpfile for frame %d\n", i_frame );
             fclose( opt->qpfile );
diff --git a/x264.h b/x264.h
index 097365a4..4d9b9ca6 100644
--- a/x264.h
+++ b/x264.h
@@ -344,7 +344,7 @@ typedef struct x264_param_t
     {
         int         i_rc_method;    /* X264_RC_* */
 
-        int         i_qp_constant;  /* 0-51 */
+        int         i_qp_constant;  /* 0 to (51 + 6*(BIT_DEPTH-8)) */
         int         i_qp_min;       /* min allowed QP value */
         int         i_qp_max;       /* max allowed QP value */
         int         i_qp_step;      /* max QP step between frames */
@@ -550,7 +550,7 @@ void    x264_param_apply_fastfirstpass( x264_param_t * );
 /* x264_param_apply_profile:
  *      Applies the restrictions of the given profile.
  *      Currently available profiles are, from most to least restrictive: */
-static const char * const x264_profile_names[] = { "baseline", "main", "high", 0 };
+static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", 0 };
 
 /*      (can be NULL, in which case the function will do nothing)
  *