D: Altivec optimizations
S: Brittany, France
+N: Henrik Gramner
+E: hengar-6 AT student DOT ltu DOT se
+D: 4:2:2 chroma subsampling, x86 asm
+S: Sweden
+
N: Fiona Glaser
E: fiona AT x264 DOT com
D: x86 asm, 1pass VBV, adaptive quantization, inline asm
uint8_t run[16];
} x264_run_level_t;
-extern const vlc_t x264_coeff0_token[5];
-extern const vlc_t x264_coeff_token[5][16][4];
+extern const vlc_t x264_coeff0_token[6];
+extern const vlc_t x264_coeff_token[6][16][4];
extern const vlc_t x264_total_zeros[15][16];
-extern const vlc_t x264_total_zeros_dc[3][4];
+extern const vlc_t x264_total_zeros_2x2_dc[3][4];
+extern const vlc_t x264_total_zeros_2x4_dc[7][8];
extern const vlc_t x264_run_before[7][16];
typedef struct
}
}
+static int profile_string_to_int( const char *str )
+{
+ if( !strcasecmp( str, "baseline" ) )
+ return PROFILE_BASELINE;
+ if( !strcasecmp( str, "main" ) )
+ return PROFILE_MAIN;
+ if( !strcasecmp( str, "high" ) )
+ return PROFILE_HIGH;
+ if( !strcasecmp( str, "high10" ) )
+ return PROFILE_HIGH10;
+ if( !strcasecmp( str, "high422" ) )
+ return PROFILE_HIGH422;
+ if( !strcasecmp( str, "high444" ) )
+ return PROFILE_HIGH444_PREDICTIVE;
+ return -1;
+}
+
int x264_param_apply_profile( x264_param_t *param, const char *profile )
{
if( !profile )
return 0;
-#if BIT_DEPTH > 8
- if( !strcasecmp( profile, "baseline" ) || !strcasecmp( profile, "main" ) ||
- !strcasecmp( profile, "high" ) )
+ int p = profile_string_to_int( profile );
+ if( p < 0 )
{
- x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d.\n", profile, BIT_DEPTH );
+ x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile );
+ return -1;
+ }
+ if( p < PROFILE_HIGH444_PREDICTIVE && ((param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) ||
+ (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0)) )
+ {
+ x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile );
+ return -1;
+ }
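+ /* Relies on the X264_CSP_* ordering: 4:2:0 formats < 4:2:2 < 4:4:4 (RGB variants included) */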
+ if( p < PROFILE_HIGH444_PREDICTIVE && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I444 )
+ {
+ x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:4:4\n", profile );
+ return -1;
+ }
+ if( p < PROFILE_HIGH422 && (param->i_csp & X264_CSP_MASK) >= X264_CSP_I422 )
+ {
+ x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support 4:2:2\n", profile );
+ return -1;
+ }
+ if( p < PROFILE_HIGH10 && BIT_DEPTH > 8 )
+ {
+ x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support a bit depth of %d\n", profile, BIT_DEPTH );
return -1;
}
-#endif
- if( !strcasecmp( profile, "baseline" ) )
+ if( p == PROFILE_BASELINE )
{
param->analyse.b_transform_8x8 = 0;
param->b_cabac = 0;
return -1;
}
}
- else if( !strcasecmp( profile, "main" ) )
+ else if( p == PROFILE_MAIN )
{
param->analyse.b_transform_8x8 = 0;
param->i_cqm_preset = X264_CQM_FLAT;
param->psz_cqm_file = NULL;
}
- else if( !strcasecmp( profile, "high" ) || !strcasecmp( profile, "high10" ) )
- {
- /* Default */
- }
- else
- {
- x264_log( NULL, X264_LOG_ERROR, "invalid profile: %s\n", profile );
- return -1;
- }
- if( (param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant <= 0) ||
- (param->rc.i_rc_method == X264_RC_CRF && (int)(param->rc.f_rf_constant + QP_BD_OFFSET) <= 0) )
- {
- x264_log( NULL, X264_LOG_ERROR, "%s profile doesn't support lossless\n", profile );
- return -1;
- }
return 0;
}
[X264_CSP_I420] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } },
[X264_CSP_YV12] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256/2, 256/2 } },
[X264_CSP_NV12] = { 2, { 256*1, 256*1 }, { 256*1, 256/2 }, },
+ [X264_CSP_I422] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
+ [X264_CSP_YV16] = { 3, { 256*1, 256/2, 256/2 }, { 256*1, 256*1, 256*1 } },
+ [X264_CSP_NV16] = { 2, { 256*1, 256*1 }, { 256*1, 256*1 }, },
[X264_CSP_I444] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_YV24] = { 3, { 256*1, 256*1, 256*1 }, { 256*1, 256*1, 256*1 } },
[X264_CSP_BGR] = { 1, { 256*3 }, { 256*1 }, },
#define IS_DISPOSABLE(type) ( type == X264_TYPE_B )
#define FIX8(f) ((int)(f*(1<<8)+.5))
#define ALIGN(x,a) (((x)+((a)-1))&~((a)-1))
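+/* CHROMA_SIZE maps a luma plane size to the size of one chroma plane under the
+ * current subsampling; FRAME_SIZE is the luma size plus both chroma planes. */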
+#define CHROMA_FORMAT h->sps->i_chroma_format_idc
+#define CHROMA_SIZE(s) ((s)>>(h->mb.chroma_h_shift+h->mb.chroma_v_shift))
+#define FRAME_SIZE(s) ((s)+2*CHROMA_SIZE(s))
#define CHECKED_MALLOC( var, size )\
do {\
#define X264_BFRAME_MAX 16
#define X264_REF_MAX 16
#define X264_THREAD_MAX 128
-#define X264_PCM_COST ((384<<CHROMA444)*BIT_DEPTH+16)
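+/* Raw bit cost of an I_PCM macroblock: every luma and chroma sample,
+ * plus a small fixed overhead for the mb header */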
+#define X264_PCM_COST (FRAME_SIZE(256*BIT_DEPTH)+16)
#define X264_LOOKAHEAD_MAX 250
#define QP_BD_OFFSET (6*(BIT_DEPTH-8))
#define QP_MAX_SPEC (51+QP_BD_OFFSET)
# define PARAM_INTERLACED 0
#endif
-#define CHROMA444 (h->sps->i_chroma_format_idc == 3)
+#define CHROMA444 (CHROMA_FORMAT == CHROMA_444)
/* Unions for type-punning.
* Mn: load or store n bits, aligned, native-endian
struct
{
ALIGNED_16( dctcoef luma16x16_dc[3][16] );
- ALIGNED_16( dctcoef chroma_dc[2][4] );
+ ALIGNED_16( dctcoef chroma_dc[2][8] );
// FIXME share memory?
ALIGNED_16( dctcoef luma8x8[12][64] );
ALIGNED_16( dctcoef luma4x4[16*3][16] );
int i_mb_height;
int i_mb_count; /* number of mbs in a frame */
+ /* Chroma subsampling */
+ int chroma_h_shift;
+ int chroma_v_shift;
+
/* Strides */
int i_mb_stride;
int i_b8_stride;
ALIGNED_16( uint32_t nr_residual_sum_buf[2][4][64] );
uint32_t nr_count_buf[2][4];
+ uint8_t luma2chroma_pixel[7]; /* Subsampled pixel size */
+
/* Buffers that are allocated per-thread even in sliced threads. */
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
pixel *intra_border_backup[5][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
/* CPU functions dependents */
x264_predict_t predict_16x16[4+3];
- x264_predict_t predict_8x8c[4+3];
x264_predict8x8_t predict_8x8[9+3];
x264_predict_t predict_4x4[9+3];
+ x264_predict_t predict_chroma[4+3];
+ x264_predict_t predict_8x8c[4+3];
+ x264_predict_t predict_8x16c[4+3];
x264_predict_8x8_filter_t predict_8x8_filter;
x264_pixel_function_t pixf;
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
+ * Henrik Gramner <hengar-6@student.ltu.se>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
}
}
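+/* 2x4 Hadamard transform of the DC coefficients of eight 4x4 blocks
+ * (one 8x16 4:2:2 chroma plane); the DCs are extracted and zeroed in place */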
+static void dct2x4dc( dctcoef dct[8], dctcoef dct4x4[8][16] )
+{
+ int a0 = dct4x4[0][0] + dct4x4[1][0];
+ int a1 = dct4x4[2][0] + dct4x4[3][0];
+ int a2 = dct4x4[4][0] + dct4x4[5][0];
+ int a3 = dct4x4[6][0] + dct4x4[7][0];
+ int a4 = dct4x4[0][0] - dct4x4[1][0];
+ int a5 = dct4x4[2][0] - dct4x4[3][0];
+ int a6 = dct4x4[4][0] - dct4x4[5][0];
+ int a7 = dct4x4[6][0] - dct4x4[7][0];
+ int b0 = a0 + a1;
+ int b1 = a2 + a3;
+ int b2 = a4 + a5;
+ int b3 = a6 + a7;
+ int b4 = a0 - a1;
+ int b5 = a2 - a3;
+ int b6 = a4 - a5;
+ int b7 = a6 - a7;
+ dct[0] = b0 + b1;
+ dct[1] = b2 + b3;
+ dct[2] = b0 - b1;
+ dct[3] = b2 - b3;
+ dct[4] = b4 - b5;
+ dct[5] = b6 - b7;
+ dct[6] = b4 + b5;
+ dct[7] = b6 + b7;
+ dct4x4[0][0] = 0;
+ dct4x4[1][0] = 0;
+ dct4x4[2][0] = 0;
+ dct4x4[3][0] = 0;
+ dct4x4[4][0] = 0;
+ dct4x4[5][0] = 0;
+ dct4x4[6][0] = 0;
+ dct4x4[7][0] = 0;
+}
+
static inline void pixel_sub_wxh( dctcoef *diff, int i_size,
pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
static int sub4x4_dct_dc( pixel *pix1, pixel *pix2 )
{
- dctcoef d[16];
int sum = 0;
-
- pixel_sub_wxh( d, 4, pix1, FENC_STRIDE, pix2, FDEC_STRIDE );
-
- sum += d[0] + d[1] + d[2] + d[3] + d[4] + d[5] + d[6] + d[7];
- sum += d[8] + d[9] + d[10] + d[11] + d[12] + d[13] + d[14] + d[15];
-
+ for( int i=0; i<4; i++, pix1 += FENC_STRIDE, pix2 += FDEC_STRIDE )
+ sum += pix1[0] + pix1[1] + pix1[2] + pix1[3]
+ - pix2[0] - pix2[1] - pix2[2] - pix2[3];
return sum;
}
int d2 = dct[0] - dct[1];
int d3 = dct[2] - dct[3];
dct[0] = d0 + d1;
- dct[2] = d2 + d3;
dct[1] = d0 - d1;
+ dct[2] = d2 + d3;
dct[3] = d2 - d3;
}
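+/* 4:2:2 analogue of sub8x8_dct_dc: per-4x4 DC differences followed by a 2x4 DC transform */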
+static void sub8x16_dct_dc( dctcoef dct[8], pixel *pix1, pixel *pix2 )
+{
+ int a0 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+0], &pix2[ 0*FDEC_STRIDE+0] );
+ int a1 = sub4x4_dct_dc( &pix1[ 0*FENC_STRIDE+4], &pix2[ 0*FDEC_STRIDE+4] );
+ int a2 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+0], &pix2[ 4*FDEC_STRIDE+0] );
+ int a3 = sub4x4_dct_dc( &pix1[ 4*FENC_STRIDE+4], &pix2[ 4*FDEC_STRIDE+4] );
+ int a4 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+0], &pix2[ 8*FDEC_STRIDE+0] );
+ int a5 = sub4x4_dct_dc( &pix1[ 8*FENC_STRIDE+4], &pix2[ 8*FDEC_STRIDE+4] );
+ int a6 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+0], &pix2[12*FDEC_STRIDE+0] );
+ int a7 = sub4x4_dct_dc( &pix1[12*FENC_STRIDE+4], &pix2[12*FDEC_STRIDE+4] );
+
+ /* 2x4 DC transform */
+ int b0 = a0 + a1;
+ int b1 = a2 + a3;
+ int b2 = a4 + a5;
+ int b3 = a6 + a7;
+ int b4 = a0 - a1;
+ int b5 = a2 - a3;
+ int b6 = a4 - a5;
+ int b7 = a6 - a7;
+ a0 = b0 + b1;
+ a1 = b2 + b3;
+ a2 = b4 + b5;
+ a3 = b6 + b7;
+ a4 = b0 - b1;
+ a5 = b2 - b3;
+ a6 = b4 - b5;
+ a7 = b6 - b7;
+ dct[0] = a0 + a1;
+ dct[1] = a2 + a3;
+ dct[2] = a0 - a1;
+ dct[3] = a2 - a3;
+ dct[4] = a4 - a5;
+ dct[5] = a6 - a7;
+ dct[6] = a4 + a5;
+ dct[7] = a6 + a7;
+}
+
static void add4x4_idct( pixel *p_dst, dctcoef dct[16] )
{
dctcoef d[16];
dctf->add8x8_idct = add8x8_idct;
dctf->add8x8_idct_dc = add8x8_idct_dc;
+ dctf->sub8x16_dct_dc = sub8x16_dct_dc;
+
dctf->sub16x16_dct = sub16x16_dct;
dctf->add16x16_idct = add16x16_idct;
dctf->add16x16_idct_dc = add16x16_idct_dc;
dctf->dct4x4dc = dct4x4dc;
dctf->idct4x4dc = idct4x4dc;
+ dctf->dct2x4dc = dct2x4dc;
+
#if HIGH_BIT_DEPTH
#if HAVE_MMX
if( cpu&X264_CPU_MMX )
void (*add8x8_idct) ( pixel *p_dst, dctcoef dct[4][16] );
void (*add8x8_idct_dc) ( pixel *p_dst, dctcoef dct[4] );
+ void (*sub8x16_dct_dc)( dctcoef dct[8], pixel *pix1, pixel *pix2 );
+
void (*sub16x16_dct) ( dctcoef dct[16][16], pixel *pix1, pixel *pix2 );
void (*add16x16_idct)( pixel *p_dst, dctcoef dct[16][16] );
void (*add16x16_idct_dc) ( pixel *p_dst, dctcoef dct[16] );
void (*dct4x4dc) ( dctcoef d[16] );
void (*idct4x4dc)( dctcoef d[16] );
+ void (*dct2x4dc)( dctcoef dct[8], dctcoef dct4x4[8][16] );
+
} x264_dct_function_t;
typedef struct
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
+ * Henrik Gramner <hengar-6@student.ltu.se>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
deblock_edge_luma_c( pix, xstride, alpha, beta, tc0[i] );
}
}
-static void deblock_v_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_luma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
for( int d = 0; d < 8; d++, pix += stride )
deblock_edge_luma_c( pix, 1, alpha, beta, tc0[d>>1] );
pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
}
-static inline void deblock_chroma_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
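+/* Generalized over 'height' (lines filtered per tc0 element: 2 for 4:2:0,
+ * 4 for 4:2:2 horizontal edges) so both formats share one implementation */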
+static ALWAYS_INLINE void deblock_chroma_c( pixel *pix, int height, int xstride, int ystride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++ )
{
int tc = tc0[i];
if( tc <= 0 )
{
- pix += 2*ystride;
+ pix += height*ystride;
continue;
}
- for( int d = 0; d < 2; d++, pix += ystride-2 )
- for( int e = 0; e < 2; e++, pix++ )
- deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] );
+ for( int d = 0; d < height; d++, pix += ystride-2 )
+ for( int e = 0; e < 2; e++, pix++ )
+ deblock_edge_chroma_c( pix, xstride, alpha, beta, tc0[i] );
}
}
-static void deblock_v_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+static void deblock_h_chroma_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
for( int i = 0; i < 4; i++, pix += stride )
deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i] );
}
+static void deblock_h_chroma_422_mbaff_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+ for( int i = 0; i < 8; i++, pix += stride )
+ deblock_edge_chroma_c( pix, 2, alpha, beta, tc0[i>>1] );
+}
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
- deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 );
+ deblock_chroma_c( pix, 2, stride, 2, alpha, beta, tc0 );
}
static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
- deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 );
+ deblock_chroma_c( pix, 2, 2, stride, alpha, beta, tc0 );
+}
+static void deblock_h_chroma_422_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
+{
+ deblock_chroma_c( pix, 4, 2, stride, alpha, beta, tc0 );
}
static ALWAYS_INLINE void deblock_edge_luma_intra_c( pixel *pix, int xstride, int alpha, int beta )
for( int d = 0; d < 16; d++, pix += ystride )
deblock_edge_luma_intra_c( pix, xstride, alpha, beta );
}
-static void deblock_v_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta )
+static void deblock_h_luma_intra_mbaff_c( pixel *pix, int ystride, int alpha, int beta )
{
for( int d = 0; d < 8; d++, pix += ystride )
deblock_edge_luma_intra_c( pix, 1, alpha, beta );
pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
}
}
-static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int dir )
+static ALWAYS_INLINE void deblock_chroma_intra_c( pixel *pix, int width, int height, int xstride, int ystride, int alpha, int beta )
{
- for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 )
- for( int e = 0; e < (dir?1:2); e++, pix++ )
- deblock_edge_chroma_intra_c( pix, xstride, alpha, beta );
+ for( int d = 0; d < height; d++, pix += ystride-2 )
+ for( int e = 0; e < width; e++, pix++ )
+ deblock_edge_chroma_intra_c( pix, xstride, alpha, beta );
}
-static void deblock_v_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
+static void deblock_h_chroma_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
{
for( int i = 0; i < 4; i++, pix += stride )
deblock_edge_chroma_intra_c( pix, 2, alpha, beta );
}
+static void deblock_h_chroma_422_intra_mbaff_c( pixel *pix, int stride, int alpha, int beta )
+{
+ for( int i = 0; i < 8; i++, pix += stride )
+ deblock_edge_chroma_intra_c( pix, 2, alpha, beta );
+}
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
- deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 );
+ deblock_chroma_intra_c( pix, 1, 16, stride, 2, alpha, beta );
}
static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
- deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 );
+ deblock_chroma_intra_c( pix, 2, 8, 2, stride, alpha, beta );
+}
+static void deblock_h_chroma_422_intra_c( pixel *pix, int stride, int alpha, int beta )
+{
+ deblock_chroma_intra_c( pix, 2, 16, 2, stride, alpha, beta );
}
static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int stridey = h->fdec->i_stride[0];
int strideuv = h->fdec->i_stride[1];
int chroma444 = CHROMA444;
+ int chroma_height = 16 >> h->mb.chroma_v_shift;
intptr_t uvdiff = chroma444 ? h->fdec->plane[2] - h->fdec->plane[1] : 1;
for( int mb_x = 0; mb_x < h->mb.i_mb_width; mb_x += (~b_interlaced | mb_y)&1, mb_y ^= b_interlaced )
uint8_t (*bs)[8][4] = h->deblock_strength[mb_y&1][mb_x];
pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
- pixel *pixuv = h->fdec->plane[1] + (8<<chroma444)*mb_y*strideuv + 16*mb_x;
+ pixel *pixuv = h->fdec->plane[1] + chroma_height*mb_y*strideuv + 16*mb_x;
if( mb_y & MB_INTERLACED )
{
pixy -= 15*stridey;
- pixuv -= ((8<<chroma444)-1)*strideuv;
+ pixuv -= (chroma_height-1)*strideuv;
}
int stride2y = stridey << MB_INTERLACED;
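+/* Chroma edge filtering by format: 4:4:4 deblocks chroma like luma; 4:2:0 filters
+ * only even edges in both directions; 4:2:2 keeps full vertical chroma resolution,
+ * so every horizontal (dir=1) edge is filtered but only even vertical edges. */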
#define FILTER( intra, dir, edge, qp, chroma_qp )\
do\
{\
- deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
- stride2y, bs[dir][edge], qp, a, b, 0,\
- h->loopf.deblock_luma##intra[dir] );\
- if( chroma444 )\
+ if( !(edge & 1) || !transform_8x8 )\
{\
- deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\
- stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
- h->loopf.deblock_luma##intra[dir] );\
- deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
- stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+ deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
+ stride2y, bs[dir][edge], qp, a, b, 0,\
h->loopf.deblock_luma##intra[dir] );\
+ if( CHROMA_FORMAT == CHROMA_444 )\
+ {\
+ deblock_edge##intra( h, pixuv + 4*edge*(dir?stride2uv:1),\
+ stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+ h->loopf.deblock_luma##intra[dir] );\
+ deblock_edge##intra( h, pixuv + uvdiff + 4*edge*(dir?stride2uv:1),\
+ stride2uv, bs[dir][edge], chroma_qp, a, b, 0,\
+ h->loopf.deblock_luma##intra[dir] );\
+ }\
+ else if( CHROMA_FORMAT == CHROMA_420 && !(edge & 1) )\
+ {\
+ deblock_edge##intra( h, pixuv + edge*(dir?2*stride2uv:4),\
+ stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
+ h->loopf.deblock_chroma##intra[dir] );\
+ }\
}\
- else if( !(edge & 1) )\
- deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\
+ if( CHROMA_FORMAT == CHROMA_422 && (dir || !(edge & 1)) )\
+ {\
+ deblock_edge##intra( h, pixuv + edge*(dir?4*stride2uv:4),\
stride2uv, bs[dir][edge], chroma_qp, a, b, 1,\
h->loopf.deblock_chroma##intra[dir] );\
+ }\
} while(0)
if( h->mb.i_neighbour & MB_LEFT )
int chroma_qp[2];
int left_qp[2];
x264_deblock_inter_t luma_deblock = h->loopf.deblock_luma_mbaff;
- x264_deblock_inter_t chroma_deblock = chroma444 ? h->loopf.deblock_luma_mbaff : h->loopf.deblock_chroma_mbaff;
+ x264_deblock_inter_t chroma_deblock = h->loopf.deblock_chroma_mbaff;
x264_deblock_intra_t luma_intra_deblock = h->loopf.deblock_luma_intra_mbaff;
- x264_deblock_intra_t chroma_intra_deblock = chroma444 ? h->loopf.deblock_luma_intra_mbaff : h->loopf.deblock_chroma_intra_mbaff;
+ x264_deblock_intra_t chroma_intra_deblock = h->loopf.deblock_chroma_intra_mbaff;
int c = chroma444 ? 0 : 1;
left_qp[0] = h->mb.qp[h->mb.i_mb_left_xy[0]];
}
int offy = MB_INTERLACED ? 4 : 0;
- int offuv = MB_INTERLACED ? 3 : 0;
- if( chroma444 ) offuv = offy;
+ int offuv = MB_INTERLACED ? 4-h->mb.chroma_v_shift : 0;
left_qp[1] = h->mb.qp[h->mb.i_mb_left_xy[1]];
luma_qp[1] = (qp + left_qp[1] + 1) >> 1;
chroma_qp[1] = (qpc + h->chroma_qp_table[left_qp[1]] + 1) >> 1;
}
if( !first_edge_only )
{
- if( !transform_8x8 ) FILTER( , 0, 1, qp, qpc );
- FILTER( , 0, 2, qp, qpc );
- if( !transform_8x8 ) FILTER( , 0, 3, qp, qpc );
+ FILTER( , 0, 1, qp, qpc );
+ FILTER( , 0, 2, qp, qpc );
+ FILTER( , 0, 3, qp, qpc );
}
if( h->mb.i_neighbour & MB_TOP )
if( !first_edge_only )
{
- if( !transform_8x8 ) FILTER( , 1, 1, qp, qpc );
- FILTER( , 1, 2, qp, qpc );
- if( !transform_8x8 ) FILTER( , 1, 3, qp, qpc );
+ FILTER( , 1, 1, qp, qpc );
+ FILTER( , 1, 2, qp, qpc );
+ FILTER( , 1, 3, qp, qpc );
}
#undef FILTER
* TODO:
* deblock macroblock edges
* support analysis partitions smaller than 16x16
- * deblock chroma for 4:2:0
+ * deblock chroma for 4:2:0/4:2:2
* handle duplicate refs correctly
* handle cavlc+8x8dct correctly
*/
pf->deblock_luma[1] = deblock_v_luma_c;
pf->deblock_luma[0] = deblock_h_luma_c;
pf->deblock_chroma[1] = deblock_v_chroma_c;
- pf->deblock_chroma[0] = deblock_h_chroma_c;
+ pf->deblock_h_chroma_420 = deblock_h_chroma_c;
+ pf->deblock_h_chroma_422 = deblock_h_chroma_422_c;
pf->deblock_luma_intra[1] = deblock_v_luma_intra_c;
pf->deblock_luma_intra[0] = deblock_h_luma_intra_c;
pf->deblock_chroma_intra[1] = deblock_v_chroma_intra_c;
- pf->deblock_chroma_intra[0] = deblock_h_chroma_intra_c;
- pf->deblock_luma_mbaff = deblock_v_luma_mbaff_c;
- pf->deblock_chroma_mbaff = deblock_v_chroma_mbaff_c;
- pf->deblock_luma_intra_mbaff = deblock_v_luma_intra_mbaff_c;
- pf->deblock_chroma_intra_mbaff = deblock_v_chroma_intra_mbaff_c;
+ pf->deblock_h_chroma_420_intra = deblock_h_chroma_intra_c;
+ pf->deblock_h_chroma_422_intra = deblock_h_chroma_422_intra_c;
+ pf->deblock_luma_mbaff = deblock_h_luma_mbaff_c;
+ pf->deblock_chroma_420_mbaff = deblock_h_chroma_mbaff_c;
+ pf->deblock_chroma_422_mbaff = deblock_h_chroma_422_mbaff_c;
+ pf->deblock_luma_intra_mbaff = deblock_h_luma_intra_mbaff_c;
+ pf->deblock_chroma_420_intra_mbaff = deblock_h_chroma_intra_mbaff_c;
+ pf->deblock_chroma_422_intra_mbaff = deblock_h_chroma_422_intra_mbaff_c;
pf->deblock_strength = deblock_strength_c;
#if HAVE_MMX
pf->deblock_luma[1] = x264_deblock_v_luma_mmx2;
pf->deblock_luma[0] = x264_deblock_h_luma_mmx2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_mmx2;
- pf->deblock_chroma[0] = x264_deblock_h_chroma_mmx2;
+ pf->deblock_h_chroma_420 = x264_deblock_h_chroma_mmx2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmx2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmx2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmx2;
- pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmx2;
+ pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_mmx2;
#endif
pf->deblock_strength = x264_deblock_strength_mmx2;
if( cpu&X264_CPU_SSE2 )
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
- pf->deblock_chroma[0] = x264_deblock_h_chroma_sse2;
+ pf->deblock_h_chroma_420 = x264_deblock_h_chroma_sse2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
- pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_sse2;
+ pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_sse2;
}
}
if( cpu&X264_CPU_SSSE3 )
pf->deblock_luma[1] = x264_deblock_v_luma_avx;
pf->deblock_luma[0] = x264_deblock_h_luma_avx;
pf->deblock_chroma[1] = x264_deblock_v_chroma_avx;
- pf->deblock_chroma[0] = x264_deblock_h_chroma_avx;
+ pf->deblock_h_chroma_420 = x264_deblock_h_chroma_avx;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_avx;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_avx;
pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_avx;
- pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_avx;
+ pf->deblock_h_chroma_420_intra = x264_deblock_h_chroma_intra_avx;
}
}
}
pf->deblock_luma[1] = x264_deblock_v_luma_neon;
pf->deblock_luma[0] = x264_deblock_h_luma_neon;
// pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
-// pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
+// pf->deblock_h_chroma_420 = x264_deblock_h_chroma_neon;
}
#endif
#endif // !HIGH_BIT_DEPTH
case X264_CSP_I420:
case X264_CSP_YV12:
return X264_CSP_NV12;
+ case X264_CSP_NV16:
+ case X264_CSP_I422:
+ case X264_CSP_YV16:
+ return X264_CSP_NV16;
case X264_CSP_I444:
case X264_CSP_YV24:
case X264_CSP_BGR:
x264_frame_t *frame;
int i_csp = x264_frame_internal_csp( h->param.i_csp );
int i_mb_count = h->mb.i_mb_count;
- int i_stride, i_width, i_lines;
+ int i_stride, i_width, i_lines, luma_plane_count;
int i_padv = PADV << PARAM_INTERLACED;
int align = h->param.cpu&X264_CPU_CACHELINE_64 ? 64 : h->param.cpu&X264_CPU_CACHELINE_32 ? 32 : 16;
int disalign = h->param.cpu&X264_CPU_ALTIVEC ? 1<<9 : 1<<10;
- int luma_plane_count = i_csp == X264_CSP_NV12 ? 1 : 3;
CHECKED_MALLOCZERO( frame, sizeof(x264_frame_t) );
i_lines = h->mb.i_mb_height*16;
i_stride = align_stride( i_width + 2*PADH, align, disalign );
- if( i_csp == X264_CSP_NV12 )
+ if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
{
+ luma_plane_count = 1;
frame->i_plane = 2;
for( int i = 0; i < 2; i++ )
{
frame->i_width[i] = i_width >> i;
- frame->i_lines[i] = i_lines >> i;
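+ /* Chroma is half-height only for NV12 (4:2:0); NV16 (4:2:2) keeps full height */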
+ frame->i_lines[i] = i_lines >> (i && i_csp == X264_CSP_NV12);
frame->i_stride[i] = i_stride;
}
}
else if( i_csp == X264_CSP_I444 )
{
+ luma_plane_count = 3;
frame->i_plane = 3;
for( int i = 0; i < 3; i++ )
{
frame->orig = frame;
- if( i_csp == X264_CSP_NV12 )
+ if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
{
- int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv));
+ int chroma_padv = i_padv >> (i_csp == X264_CSP_NV12);
+ int chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*chroma_padv));
CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
- frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH;
+ frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * chroma_padv + PADH;
if( PARAM_INTERLACED )
{
CHECKED_MALLOC( frame->buffer_fld[1], chroma_plane_size * sizeof(pixel) );
- frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * i_padv/2 + PADH;
+ frame->plane_fld[1] = frame->buffer_fld[1] + frame->i_stride[1] * chroma_padv + PADH;
}
}
}
else
{
+ int v_shift = h->mb.chroma_v_shift;
get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
stride[0]/sizeof(pixel), h->param.i_width, h->param.i_height );
- if( i_csp == X264_CSP_NV12 )
+ if( i_csp == X264_CSP_NV12 || i_csp == X264_CSP_NV16 )
{
- get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, 1 );
+ get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, v_shift );
h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
- stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>1 );
+ stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>v_shift );
}
- else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_YV12 )
+ else if( i_csp == X264_CSP_I420 || i_csp == X264_CSP_I422 || i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16 )
{
- get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 );
- get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 2 : 1, 1, 1 );
+ int uv_swap = i_csp == X264_CSP_YV12 || i_csp == X264_CSP_YV16;
+ get_plane_ptr( h, src, &pix[1], &stride[1], uv_swap ? 2 : 1, 1, v_shift );
+ get_plane_ptr( h, src, &pix[2], &stride[2], uv_swap ? 1 : 2, 1, v_shift );
h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
(pixel*)pix[1], stride[1]/sizeof(pixel),
(pixel*)pix[2], stride[2]/sizeof(pixel),
- h->param.i_width>>1, h->param.i_height>>1 );
+ h->param.i_width>>1, h->param.i_height>>v_shift );
}
else //if( i_csp == X264_CSP_I444 || i_csp == X264_CSP_YV24 )
{
return;
for( int i = 0; i < frame->i_plane; i++ )
{
- int shift = i && !CHROMA444;
+ int h_shift = i && h->mb.chroma_h_shift;
+ int v_shift = i && h->mb.chroma_v_shift;
int stride = frame->i_stride[i];
int width = 16*h->mb.i_mb_width;
- int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> shift;
+ int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> SLICE_MBAFF : 16) >> v_shift;
int padh = PADH;
- int padv = PADV >> shift;
+ int padv = PADV >> v_shift;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
if( b_end && !b_start )
- height += 4 >> (shift + SLICE_MBAFF);
+ height += 4 >> (v_shift + SLICE_MBAFF);
pixel *pix;
if( SLICE_MBAFF )
{
// border samples for each field are extended separately
- pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
- plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, shift );
- plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, shift );
+ pix = frame->plane_fld[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+ plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, h_shift );
+ plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, h_shift );
- height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> shift;
+ height = (b_end ? 16*(h->mb.i_mb_height - mb_y) : 32) >> v_shift;
if( b_end && !b_start )
- height += 4 >> shift;
- pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
- plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
+ height += 4 >> v_shift;
+ pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+ plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
}
else
{
- pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> shift);
- plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, shift );
+ pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> v_shift);
+ plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, h_shift );
}
}
}
void x264_frame_expand_border_chroma( x264_t *h, x264_frame_t *frame, int plane )
{
- int shift = !CHROMA444;
- plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>shift,
- PADH, PADV>>shift, 1, 1, shift );
+ int v_shift = h->mb.chroma_v_shift;
+ plane_expand_border( frame->plane[plane], frame->i_stride[plane], 16*h->mb.i_mb_width, 16*h->mb.i_mb_height>>v_shift,
+ PADH, PADV>>v_shift, 1, 1, h->mb.chroma_h_shift );
}
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
for( int i = 0; i < frame->i_plane; i++ )
{
int i_width = h->param.i_width;
- int shift = i && !CHROMA444;
- int i_height = h->param.i_height >> shift;
+ int h_shift = i && h->mb.chroma_h_shift;
+ int v_shift = i && h->mb.chroma_v_shift;
+ int i_height = h->param.i_height >> v_shift;
int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
- int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
+ int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
if( i_padx )
{
for( int y = 0; y < i_height; y++ )
pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
- &frame->plane[i][y*frame->i_stride[i] + i_width - 1-shift],
- i_padx>>shift, sizeof(pixel)<<shift );
+ &frame->plane[i][y*frame->i_stride[i] + i_width - 1-h_shift],
+ i_padx>>h_shift, sizeof(pixel)<<h_shift );
}
if( i_pady )
{
{
for( int i = 0; i < h->fenc->i_plane; i++ )
{
- int shift = i && !CHROMA444;
+ int v_shift = i && h->mb.chroma_v_shift;
int stride = h->fenc->i_stride[i];
- int height = h->param.i_height >> shift;
- int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> shift;
+ int height = h->param.i_height >> v_shift;
+ int pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> v_shift;
pixel *fenc = h->fenc->plane[i] + 16*mb_x;
for( int y = height; y < height + pady; y++ )
memcpy( fenc + y*stride, fenc + (height-1)*stride, 16*sizeof(pixel) );
{
x264_deblock_inter_t deblock_luma[2];
x264_deblock_inter_t deblock_chroma[2];
+ x264_deblock_inter_t deblock_h_chroma_420;
+ x264_deblock_inter_t deblock_h_chroma_422;
x264_deblock_intra_t deblock_luma_intra[2];
x264_deblock_intra_t deblock_chroma_intra[2];
+ x264_deblock_intra_t deblock_h_chroma_420_intra;
+ x264_deblock_intra_t deblock_h_chroma_422_intra;
x264_deblock_inter_t deblock_luma_mbaff;
x264_deblock_inter_t deblock_chroma_mbaff;
+ x264_deblock_inter_t deblock_chroma_420_mbaff;
+ x264_deblock_inter_t deblock_chroma_422_mbaff;
x264_deblock_intra_t deblock_luma_intra_mbaff;
x264_deblock_intra_t deblock_chroma_intra_mbaff;
+ x264_deblock_intra_t deblock_chroma_420_intra_mbaff;
+ x264_deblock_intra_t deblock_chroma_422_intra_mbaff;
void (*deblock_strength) ( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][8][4], int mvy_limit,
int bframe );
* Authors: Fiona Glaser <fiona@x264.com>
* Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
+ * Henrik Gramner <hengar-6@student.ltu.se>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
}
else
{
- // chroma is offset if MCing from a field of opposite parity
- if( MB_INTERLACED & i_ref )
+ int v_shift = h->mb.chroma_v_shift;
+ // Chroma in 4:2:0 is offset if MCing from a field of opposite parity
+ if( v_shift & MB_INTERLACED & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
- &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
+ height = 4*height >> v_shift;
+
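+ /* mc_chroma takes 1/8th-pel chroma mvs: horizontal subsampling makes the qpel
+ * luma mvx already 1/8th-pel in chroma, while mvy is rescaled by v_shift */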
+ h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
+ &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
- mvx, mvy, 2*width, 2*height );
+ mvx, 2*mvy>>v_shift, 2*width, height );
if( h->sh.weight[i_ref][1].weightfn )
- h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->sh.weight[i_ref][1], height*2 );
+ h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
+ &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE,
+ &h->sh.weight[i_ref][1], height );
if( h->sh.weight[i_ref][2].weightfn )
- h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- &h->sh.weight[i_ref][2],height*2 );
+ h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
+ &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
+ &h->sh.weight[i_ref][2], height );
}
}
static NOINLINE void x264_mb_mc_1xywh( x264_t *h, int x, int y, int width, int height )
}
else
{
- if( MB_INTERLACED & i_ref )
+ int v_shift = h->mb.chroma_v_shift;
+ if( v_shift & MB_INTERLACED & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
- &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
+ h->mc.mc_chroma( &h->mb.pic.p_fdec[1][offset],
+ &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
- mvx, mvy, 2*width, 2*height );
+ mvx, 2*mvy>>v_shift, 2*width, 4*height>>v_shift );
}
}
}
else
{
- if( MB_INTERLACED & i_ref0 )
+ int v_shift = h->mb.chroma_v_shift;
+ if( v_shift & MB_INTERLACED & i_ref0 )
mvy0 += (h->mb.i_mb_y & 1)*4 - 2;
- if( MB_INTERLACED & i_ref1 )
+ if( v_shift & MB_INTERLACED & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
- mvx0, mvy0, 2*width, 2*height );
+ mvx0, 2*mvy0>>v_shift, 2*width, 4*height>>v_shift );
h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
- h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
+ mvx1, 2*mvy1>>v_shift, 2*width, 4*height>>v_shift );
+
+ int chromapix = h->luma2chroma_pixel[i_mode];
+ int offset = (4*FDEC_STRIDE>>v_shift)*y + 2*x;
+ h->mc.avg[chromapix]( &h->mb.pic.p_fdec[1][offset], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
+ h->mc.avg[chromapix]( &h->mb.pic.p_fdec[2][offset], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
}
}
}
else
{
- luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*16+2*i_padv);
+ /* Both ref and fenc are stored for 4:2:0 and 4:2:2, which means that 4:2:0 and 4:4:4
+ * need the same amount of space and 4:2:2 needs twice as much */
+ luma_plane_size = h->fdec->i_stride[0] * (h->mb.i_mb_height*(16<<(CHROMA_FORMAT==CHROMA_422))+2*i_padv);
if( h->param.analyse.i_weighted_pred == X264_WEIGHTP_SMART )
//smart can weight one ref and one offset -1 in 8-bit
(h->param.analyse.b_dct_decimate && h->sh.i_type != SLICE_TYPE_I);
h->mb.i_mb_prev_xy = -1;
+ /* 4:2:0                      4:2:2                      4:4:4
+  * fdec            fenc       fdec            fenc       fdec            fenc
+  * y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y    y y y y y y y   Y Y Y Y
+  * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
+  * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
+  * y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y    y Y Y Y Y       Y Y Y Y
+  * y Y Y Y Y       U U V V    y Y Y Y Y       U U V V    y Y Y Y Y       U U U U
+  * u u u v v v     U U V V    u u u v v v     U U V V    u u u u u u u   U U U U
+  * u U U v V V                u U U v V V     U U V V    u U U U U       U U U U
+  * u U U v V V                u U U v V V     U U V V    u U U U U       U U U U
+  *                            u U U v V V                u U U U U       V V V V
+  *                            u U U v V V                u U U U U       V V V V
+  *                                                       v v v v v v v   V V V V
+  *                                                       v V V V V       V V V V
+  *                                                       v V V V V
+  *                                                       v V V V V
+  *                                                       v V V V V
+  */
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
}
- /* fdec: fenc:
- * yyyyyyy
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * yYYYY YYYY
- * uuu vvv UUVV
- * uUU vVV UUVV
- * uUU vVV
- */
else
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
int stride_y = fenc->i_stride[0];
int stride_uv = fenc->i_stride[1];
int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y;
- int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv;
+ int off_uv = 16 * i_mb_x + (16 * i_mb_y * stride_uv >> h->mb.chroma_v_shift);
h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
fenc->plane[1]+off_uv, stride_uv, i_mb_x );
}
static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
{
int mb_interlaced = b_mbaff && MB_INTERLACED;
- int w = b_chroma ? 8 : 16;
+ int height = b_chroma ? 16 >> h->mb.chroma_v_shift : 16;
int i_stride = h->fdec->i_stride[i];
int i_stride2 = i_stride << mb_interlaced;
int i_pix_offset = mb_interlaced
- ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
- : 16 * mb_x + w * mb_y * i_stride;
+ ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
+ : 16 * mb_x + height * mb_y * i_stride;
pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
int fdec_idx = b_mbaff ? (mb_interlaced ? (3 + (mb_y&1)) : (mb_y&1) ? 2 : 4) : 0;
pixel *intra_fdec = &h->intra_border_backup[fdec_idx][i][mb_x*16];
h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
if( b_chroma )
{
- h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 );
+ h->mc.load_deinterleave_chroma_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2, height );
memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) );
if( b_mbaff )
}
if( b_mbaff )
{
- for( int j = 0; j < w; j++ )
+ for( int j = 0; j < height; j++ )
if( b_chroma )
{
h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2];
/* load non_zero_count */
CP32( &h->mb.cache.non_zero_count[x264_scan8[ 0] - 8], &nnz[top][12] );
- CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16+4 + 8*CHROMA444] );
- CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32+4 + 8*CHROMA444] );
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[16] - 8], &nnz[top][16-4 + (16>>h->mb.chroma_v_shift)] );
+ CP32( &h->mb.cache.non_zero_count[x264_scan8[32] - 8], &nnz[top][32-4 + (16>>h->mb.chroma_v_shift)] );
/* Finish the prefetching */
for( int l = 0; l < lists; l++ )
h->mb.cache.non_zero_count[x264_scan8[ 8] - 1] = nnz[lbot][left_index_table->nnz[2]];
h->mb.cache.non_zero_count[x264_scan8[10] - 1] = nnz[lbot][left_index_table->nnz[3]];
- if( CHROMA444 )
+ if( CHROMA_FORMAT >= CHROMA_422 )
{
- h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16];
- h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16];
- h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16];
- h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16];
- h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32];
- h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32];
- h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32];
- h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32];
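+ /* 4:2:2 packs its chroma nnz rows at half the width of 4:4:4, shifting the left-neighbor indices */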
+ int offset = (4>>h->mb.chroma_h_shift) - 4;
+ h->mb.cache.non_zero_count[x264_scan8[16+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+16+offset];
+ h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+16+offset];
+ h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+16+offset];
+ h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] = nnz[lbot][left_index_table->nnz[3]+16+offset];
+ h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] = nnz[ltop][left_index_table->nnz[0]+32+offset];
+ h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = nnz[ltop][left_index_table->nnz[1]+32+offset];
+ h->mb.cache.non_zero_count[x264_scan8[32+ 8] - 1] = nnz[lbot][left_index_table->nnz[2]+32+offset];
+ h->mb.cache.non_zero_count[x264_scan8[32+10] - 1] = nnz[lbot][left_index_table->nnz[3]+32+offset];
}
else
{
h->mb.cache.non_zero_count[x264_scan8[16+ 2] - 1] =
h->mb.cache.non_zero_count[x264_scan8[32+ 0] - 1] =
h->mb.cache.non_zero_count[x264_scan8[32+ 2] - 1] = 0x80;
- if( CHROMA444 )
+ if( CHROMA_FORMAT >= CHROMA_422 )
{
h->mb.cache.non_zero_count[x264_scan8[16+ 8] - 1] =
h->mb.cache.non_zero_count[x264_scan8[16+10] - 1] =
{
x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
+ if( CHROMA_FORMAT == CHROMA_422 )
+ {
+ x264_copy_column8( h->mb.pic.p_fdec[1]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+12*FDEC_STRIDE );
+ x264_copy_column8( h->mb.pic.p_fdec[2]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+12*FDEC_STRIDE );
+ }
x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1, 0 );
}
}
}
/* Early termination: in this case, nnz guarantees all edges use strength 2.*/
- if( h->mb.b_transform_8x8 && (h->mb.i_cbp_luma&7) == 7 && !CHROMA444 )
+ if( h->mb.b_transform_8x8 && !CHROMA444 )
{
- M32( bs[0][0] ) = 0x02020202;
- M32( bs[0][2] ) = 0x02020202;
- M32( bs[0][4] ) = 0x02020202;
- M32( bs[1][0] ) = 0x02020202;
- M32( bs[1][2] ) = 0x02020202;
- M32( bs[1][4] ) = 0x02020202;
- return;
+ int cbp_mask = 0xf >> h->mb.chroma_v_shift;
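+ /* 0x7 for 4:2:0, 0xf for 4:2:2: 4:2:2 also deblocks the odd horizontal chroma
+ * edges that lie inside the 8x8 transform blocks, so all four cbp bits are needed */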
+ if( (h->mb.i_cbp_luma&cbp_mask) == cbp_mask )
+ {
+ M32( bs[0][0] ) = 0x02020202;
+ M32( bs[0][2] ) = 0x02020202;
+ M32( bs[0][4] ) = 0x02020202;
+ memset( bs[1][0], 2, 5*4*sizeof(uint8_t) ); /* [1][1] and [1][3] have to be set for 4:2:2 */
+ return;
+ }
}
int neighbour_changed = 0;
static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_chroma, int b_mbaff )
{
- int w = b_chroma ? 8 : 16;
+ int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16;
int i_stride = h->fdec->i_stride[i];
int i_stride2 = i_stride << (b_mbaff && MB_INTERLACED);
int i_pix_offset = (b_mbaff && MB_INTERLACED)
- ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
- : 16 * mb_x + w * mb_y * i_stride;
+ ? 16 * mb_x + height * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
+ : 16 * mb_x + height * mb_y * i_stride;
if( b_chroma )
- h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] );
+ h->mc.store_interleave_chroma( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], height );
else
h->mc.copy[PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2, h->mb.pic.p_fdec[i], FDEC_STRIDE, 16 );
}
}
else
{
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+FDEC_STRIDE*7, 8*sizeof(pixel) );
- memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+FDEC_STRIDE*7, 8*sizeof(pixel) );
+ int backup_src = (15>>h->mb.chroma_v_shift) * FDEC_STRIDE;
+ memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) );
+ memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) );
}
if( b_mbaff )
{
}
else
{
- backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
+ if( CHROMA_FORMAT == CHROMA_420 )
+ backup_src = (MB_INTERLACED ? 3 : 6) * FDEC_STRIDE;
memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16 ], h->mb.pic.p_fdec[1]+backup_src, 8*sizeof(pixel) );
memcpy( &h->intra_border_backup[backup_dst][1][mb_x*16+8], h->mb.pic.p_fdec[2]+backup_src, 8*sizeof(pixel) );
}
/* In progressive we update intra_border_backup in-place, so the topleft neighbor will
* no longer exist there when load_pic_pointers wants it. Move it within p_fdec instead. */
h->mb.pic.p_fdec[0][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[0][-FDEC_STRIDE+15];
- h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+7 + 8*CHROMA444];
- h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+7 + 8*CHROMA444];
+ h->mb.pic.p_fdec[1][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[1][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)];
+ h->mb.pic.p_fdec[2][-FDEC_STRIDE-1] = h->mb.pic.p_fdec[2][-FDEC_STRIDE+(15>>h->mb.chroma_h_shift)];
}
}
CP32( &nnz[16+1*4], &h->mb.cache.non_zero_count[x264_scan8[16+2]] );
CP32( &nnz[32+0*4], &h->mb.cache.non_zero_count[x264_scan8[32+0]] );
CP32( &nnz[32+1*4], &h->mb.cache.non_zero_count[x264_scan8[32+2]] );
- if( CHROMA444 )
+ if( CHROMA_FORMAT >= CHROMA_422 )
{
CP32( &nnz[16+2*4], &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] );
CP32( &nnz[16+3*4], &h->mb.cache.non_zero_count[x264_scan8[16+10]] );
uint8_t (*mvd0)[2] = h->mb.mvd[0][i_mb_xy];
uint8_t (*mvd1)[2] = h->mb.mvd[1][i_mb_xy];
if( IS_INTRA(i_mb_type) && i_mb_type != I_PCM )
- h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ];
+ h->mb.chroma_pred_mode[i_mb_xy] = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
else
h->mb.chroma_pred_mode[i_mb_xy] = I_PRED_CHROMA_DC;
# define pack_pixel_2to4 pack16to32
#endif
-#define array_non_zero(a) array_non_zero_int(a, sizeof(a)/sizeof(dctcoef))
-#define array_non_zero_int array_non_zero_int
-static ALWAYS_INLINE int array_non_zero_int( dctcoef *v, int i_count )
-{
- for( int i = 0; i < i_count; i++ )
- if( v[i] )
- return 1;
- return 0;
-}
static ALWAYS_INLINE int x264_mb_predict_intra4x4_mode( x264_t *h, int idx )
{
const int ma = h->mb.cache.intra4x4_pred_mode[x264_scan8[idx] - 1];
PIXEL_AVG_C( pixel_avg_8x16, 8, 16 )
PIXEL_AVG_C( pixel_avg_8x8, 8, 8 )
PIXEL_AVG_C( pixel_avg_8x4, 8, 4 )
+PIXEL_AVG_C( pixel_avg_4x16, 4, 16 )
PIXEL_AVG_C( pixel_avg_4x8, 4, 8 )
PIXEL_AVG_C( pixel_avg_4x4, 4, 4 )
PIXEL_AVG_C( pixel_avg_4x2, 4, 2 )
+PIXEL_AVG_C( pixel_avg_2x8, 2, 8 )
PIXEL_AVG_C( pixel_avg_2x4, 2, 4 )
PIXEL_AVG_C( pixel_avg_2x2, 2, 2 )
}
}
-static void store_interleave_8x8x2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv )
+static void store_interleave_chroma( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height )
{
- for( int y=0; y<8; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
+ for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
for( int x=0; x<8; x++ )
{
dst[2*x] = srcu[x];
}
}
-static void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src )
+static void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
{
- x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, 8 );
+ x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, height );
}
-static void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src )
+static void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
{
- x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, 8 );
+ x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, height );
}
static void prefetch_fenc_null( pixel *pix_y, int stride_y,
{
pf->mc_luma = mc_luma;
pf->get_ref = get_ref;
+
pf->mc_chroma = mc_chroma;
pf->avg[PIXEL_16x16]= pixel_avg_16x16;
pf->avg[PIXEL_8x16] = pixel_avg_8x16;
pf->avg[PIXEL_8x8] = pixel_avg_8x8;
pf->avg[PIXEL_8x4] = pixel_avg_8x4;
+ pf->avg[PIXEL_4x16] = pixel_avg_4x16;
pf->avg[PIXEL_4x8] = pixel_avg_4x8;
pf->avg[PIXEL_4x4] = pixel_avg_4x4;
pf->avg[PIXEL_4x2] = pixel_avg_4x2;
+ pf->avg[PIXEL_2x8] = pixel_avg_2x8;
pf->avg[PIXEL_2x4] = pixel_avg_2x4;
pf->avg[PIXEL_2x2] = pixel_avg_2x2;
pf->copy[PIXEL_8x8] = mc_copy_w8;
pf->copy[PIXEL_4x4] = mc_copy_w4;
- pf->store_interleave_8x8x2 = store_interleave_8x8x2;
- pf->load_deinterleave_8x8x2_fenc = load_deinterleave_8x8x2_fenc;
- pf->load_deinterleave_8x8x2_fdec = load_deinterleave_8x8x2_fdec;
+ pf->store_interleave_chroma = store_interleave_chroma;
+ pf->load_deinterleave_chroma_fenc = load_deinterleave_chroma_fenc;
+ pf->load_deinterleave_chroma_fdec = load_deinterleave_chroma_fdec;
pf->plane_copy = x264_plane_copy_c;
pf->plane_copy_interleave = x264_plane_copy_interleave_c;
typedef struct
{
- void (*mc_luma)(pixel *dst, int i_dst, pixel **src, int i_src,
- int mvx, int mvy,
- int i_width, int i_height, const x264_weight_t *weight );
+ void (*mc_luma)( pixel *dst, int i_dst, pixel **src, int i_src,
+ int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
/* may round up the dimensions if they're not a power of 2 */
- pixel* (*get_ref)(pixel *dst, int *i_dst, pixel **src, int i_src,
- int mvx, int mvy,
- int i_width, int i_height, const x264_weight_t *weight );
+ pixel* (*get_ref)( pixel *dst, int *i_dst, pixel **src, int i_src,
+ int mvx, int mvy, int i_width, int i_height, const x264_weight_t *weight );
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
- void (*mc_chroma)(pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src,
- int mvx, int mvy,
- int i_width, int i_height );
+ void (*mc_chroma)( pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src,
+ int mvx, int mvy, int i_width, int i_height );
- void (*avg[10])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight );
+ void (*avg[12])( pixel *dst, int, pixel *src1, int, pixel *src2, int, int i_weight );
/* only 16x16, 8x8, and 4x4 defined */
void (*copy[7])( pixel *dst, int, pixel *src, int, int i_height );
void (*copy_16x16_unaligned)( pixel *dst, int, pixel *src, int, int i_height );
- void (*store_interleave_8x8x2)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
- void (*load_deinterleave_8x8x2_fenc)( pixel *dst, pixel *src, int i_src );
- void (*load_deinterleave_8x8x2_fdec)( pixel *dst, pixel *src, int i_src );
+ void (*store_interleave_chroma)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
+ void (*load_deinterleave_chroma_fenc)( pixel *dst, pixel *src, int i_src, int height );
+ void (*load_deinterleave_chroma_fdec)( pixel *dst, pixel *src, int i_src, int height );
void (*plane_copy)( pixel *dst, int i_dst,
pixel *src, int i_src, int w, int h );
PIXEL_SAD_C( x264_pixel_sad_8x16, 8, 16 )
PIXEL_SAD_C( x264_pixel_sad_8x8, 8, 8 )
PIXEL_SAD_C( x264_pixel_sad_8x4, 8, 4 )
+PIXEL_SAD_C( x264_pixel_sad_4x16, 4, 16 )
PIXEL_SAD_C( x264_pixel_sad_4x8, 4, 8 )
PIXEL_SAD_C( x264_pixel_sad_4x4, 4, 4 )
-
/****************************************************************************
* pixel_ssd_WxH
****************************************************************************/
PIXEL_SSD_C( x264_pixel_ssd_8x16, 8, 16 )
PIXEL_SSD_C( x264_pixel_ssd_8x8, 8, 8 )
PIXEL_SSD_C( x264_pixel_ssd_8x4, 8, 4 )
+PIXEL_SSD_C( x264_pixel_ssd_4x16, 4, 16 )
PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 )
PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 )
/****************************************************************************
* pixel_var_wxh
****************************************************************************/
-#define PIXEL_VAR_C( name, w ) \
+#define PIXEL_VAR_C( name, w, h ) \
static uint64_t name( pixel *pix, int i_stride ) \
{ \
uint32_t sum = 0, sqr = 0; \
- for( int y = 0; y < w; y++ ) \
+ for( int y = 0; y < h; y++ ) \
{ \
for( int x = 0; x < w; x++ ) \
{ \
return sum + ((uint64_t)sqr << 32); \
}
-PIXEL_VAR_C( x264_pixel_var_16x16, 16 )
-PIXEL_VAR_C( x264_pixel_var_8x8, 8 )
+PIXEL_VAR_C( x264_pixel_var_16x16, 16, 16 )
+PIXEL_VAR_C( x264_pixel_var_8x16, 8, 16 )
+PIXEL_VAR_C( x264_pixel_var_8x8, 8, 8 )
/****************************************************************************
* pixel_var2_wxh
****************************************************************************/
-static int pixel_var2_8x8( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd )
-{
- uint32_t var = 0, sum = 0, sqr = 0;
- for( int y = 0; y < 8; y++ )
- {
- for( int x = 0; x < 8; x++ )
- {
- int diff = pix1[x] - pix2[x];
- sum += diff;
- sqr += diff * diff;
- }
- pix1 += i_stride1;
- pix2 += i_stride2;
- }
- sum = abs(sum);
- var = sqr - ((uint64_t)sum * sum >> 6);
- *ssd = sqr;
- return var;
+#define PIXEL_VAR2_C( name, w, h, shift ) \
+static int name( pixel *pix1, int i_stride1, pixel *pix2, int i_stride2, int *ssd ) \
+{ \
+ uint32_t var = 0, sum = 0, sqr = 0; \
+ for( int y = 0; y < h; y++ ) \
+ { \
+ for( int x = 0; x < w; x++ ) \
+ { \
+ int diff = pix1[x] - pix2[x]; \
+ sum += diff; \
+ sqr += diff * diff; \
+ } \
+ pix1 += i_stride1; \
+ pix2 += i_stride2; \
+ } \
+ sum = abs(sum); \
+ var = sqr - ((uint64_t)sum * sum >> shift); \
+ *ssd = sqr; \
+ return var; \
}
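+/* (sum * sum) >> shift computes (sum of diffs)^2 / N, so shift must be log2(w*h) */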
+PIXEL_VAR2_C( x264_pixel_var2_8x16, 8, 16, 7 )
+PIXEL_VAR2_C( x264_pixel_var2_8x8,  8,  8, 6 )
+
#if BIT_DEPTH > 8
typedef uint32_t sum_t;
typedef uint64_t sum2_t;
PIXEL_SATD_C( 16, 8, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8, 16, x264_pixel_satd_8x4 )
PIXEL_SATD_C( 8, 8, x264_pixel_satd_8x4 )
+PIXEL_SATD_C( 4, 16, x264_pixel_satd_4x4 )
PIXEL_SATD_C( 4, 8, x264_pixel_satd_4x4 )
-
static NOINLINE int sa8d_8x8( pixel *pix1, int i_pix1, pixel *pix2, int i_pix2 )
{
sum2_t tmp[8][4];
INTRA_MBCMP(satd, 4x4, v, h, dc, ,, _c )
INTRA_MBCMP( sad, 8x8, dc, h, v, c,, _c )
INTRA_MBCMP(satd, 8x8, dc, h, v, c,, _c )
+INTRA_MBCMP( sad, 8x16, dc, h, v, c,, _c )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c,, _c )
INTRA_MBCMP( sad, 16x16, v, h, dc, ,, _c )
INTRA_MBCMP(satd, 16x16, v, h, dc, ,, _c )
#define INIT7_NAME( name1, name2, cpu ) \
INIT6_NAME( name1, name2, cpu ) \
pixf->name1[PIXEL_4x4] = x264_pixel_##name2##_4x4##cpu;
+#define INIT8_NAME( name1, name2, cpu ) \
+ INIT7_NAME( name1, name2, cpu ) \
+ pixf->name1[PIXEL_4x16] = x264_pixel_##name2##_4x16##cpu;
#define INIT2( name, cpu ) INIT2_NAME( name, name, cpu )
#define INIT4( name, cpu ) INIT4_NAME( name, name, cpu )
#define INIT5( name, cpu ) INIT5_NAME( name, name, cpu )
#define INIT6( name, cpu ) INIT6_NAME( name, name, cpu )
#define INIT7( name, cpu ) INIT7_NAME( name, name, cpu )
+#define INIT8( name, cpu ) INIT8_NAME( name, name, cpu )
#define INIT_ADS( cpu ) \
pixf->ads[PIXEL_16x16] = x264_pixel_ads4##cpu;\
pixf->ads[PIXEL_16x8] = x264_pixel_ads2##cpu;\
pixf->ads[PIXEL_8x8] = x264_pixel_ads1##cpu;
- INIT7( sad, );
- INIT7_NAME( sad_aligned, sad, );
+ INIT8( sad, );
+ INIT8_NAME( sad_aligned, sad, );
INIT7( sad_x3, );
INIT7( sad_x4, );
- INIT7( ssd, );
- INIT7( satd, );
+ INIT8( ssd, );
+ INIT8( satd, );
INIT7( satd_x3, );
INIT7( satd_x4, );
INIT4( hadamard_ac, );
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
+ pixf->var[PIXEL_8x16] = x264_pixel_var_8x16;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8;
pixf->ssd_nv12_core = pixel_ssd_nv12_core;
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
- pixf->var2_8x8 = pixel_var2_8x8;
pixf->vsad = pixel_vsad;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c;
+ pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16;
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16;
pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmx2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmx2;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmx2;
- pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_mmx2;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmx2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
- pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
- pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_sse2;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
{
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmx2;
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_mmx2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_mmx2;
- pixf->var2_8x8 = x264_pixel_var2_8x8_mmx2;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_mmx2;
pixf->vsad = x264_pixel_vsad_mmx2;
if( cpu&X264_CPU_CACHELINE_32 )
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_sse2;
#endif
- pixf->var2_8x8 = x264_pixel_var2_8x8_sse2;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
pixf->vsad = x264_pixel_vsad_sse2;
}
#if ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
- pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_neon;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
- pixf->var2_8x8 = x264_pixel_var2_8x8_neon;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
+ * Henrik Gramner <hengar-6@student.ltu.se>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
PIXEL_8x4 = 4,
PIXEL_4x8 = 5,
PIXEL_4x4 = 6,
- PIXEL_4x2 = 7,
- PIXEL_2x4 = 8,
- PIXEL_2x2 = 9,
+
+ /* Subsampled chroma only */
+ PIXEL_4x16 = 7, /* 4:2:2 */
+ PIXEL_4x2 = 8,
+ PIXEL_2x8 = 9, /* 4:2:2 */
+ PIXEL_2x4 = 10,
+ PIXEL_2x2 = 11,
};
-static const struct
-{
- int w;
- int h;
-} x264_pixel_size[7] =
+static const struct { uint8_t w, h; } x264_pixel_size[12] =
{
- { 16, 16 },
- { 16, 8 }, { 8, 16 },
- { 8, 8 },
- { 8, 4 }, { 4, 8 },
- { 4, 4 }
+ { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 },
+ { 4, 16 }, { 4, 2 }, { 2, 8 }, { 2, 4 }, { 2, 2 },
};
static const uint8_t x264_size2pixel[5][5] =
{ 0, 0, PIXEL_8x16, 0, PIXEL_16x16 }
};
+static const uint8_t x264_luma2chroma_pixel[4][7] =
+{
+ { 0 },
+ { PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4, PIXEL_4x2, PIXEL_2x4, PIXEL_2x2 }, /* 4:2:0 */
+ { PIXEL_8x16, PIXEL_8x8, PIXEL_4x16, PIXEL_4x8, PIXEL_4x4, PIXEL_2x8, PIXEL_2x4 }, /* 4:2:2 */
+ { PIXEL_16x16, PIXEL_16x8, PIXEL_8x16, PIXEL_8x8, PIXEL_8x4, PIXEL_4x8, PIXEL_4x4 }, /* 4:4:4 */
+};
+
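The table above drives most of the chroma generalization in this patch: it maps a luma partition to the partition its chroma counterpart occupies under each subsampling mode. A minimal lookup sketch (using the CHROMA_* enum added further down in this patch):

    /* 4:2:0 halves both dimensions, 4:2:2 only the width, 4:4:4 neither. */
    int cp420 = x264_luma2chroma_pixel[CHROMA_420][PIXEL_16x16]; /* PIXEL_8x8   */
    int cp422 = x264_luma2chroma_pixel[CHROMA_422][PIXEL_16x16]; /* PIXEL_8x16  */
    int cp444 = x264_luma2chroma_pixel[CHROMA_444][PIXEL_16x16]; /* PIXEL_16x16 */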
typedef struct
{
- x264_pixel_cmp_t sad[7];
- x264_pixel_cmp_t ssd[7];
- x264_pixel_cmp_t satd[7];
+ x264_pixel_cmp_t sad[8];
+ x264_pixel_cmp_t ssd[8];
+ x264_pixel_cmp_t satd[8];
x264_pixel_cmp_t ssim[7];
x264_pixel_cmp_t sa8d[4];
- x264_pixel_cmp_t mbcmp[7]; /* either satd or sad for subpel refine and mode decision */
- x264_pixel_cmp_t mbcmp_unaligned[7]; /* unaligned mbcmp for subpel */
- x264_pixel_cmp_t fpelcmp[7]; /* either satd or sad for fullpel motion search */
+ x264_pixel_cmp_t mbcmp[8]; /* either satd or sad for subpel refine and mode decision */
+ x264_pixel_cmp_t mbcmp_unaligned[8]; /* unaligned mbcmp for subpel */
+ x264_pixel_cmp_t fpelcmp[8]; /* either satd or sad for fullpel motion search */
x264_pixel_cmp_x3_t fpelcmp_x3[7];
x264_pixel_cmp_x4_t fpelcmp_x4[7];
- x264_pixel_cmp_t sad_aligned[7]; /* Aligned SAD for mbcmp */
+ x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
int (*vsad)( pixel *, int, int );
- int (*var2_8x8)( pixel *, int, pixel *, int, int * );
uint64_t (*var[4])( pixel *pix, int stride );
+ int (*var2[4])( pixel *pix1, int stride1,
+ pixel *pix2, int stride2, int *ssd );
uint64_t (*hadamard_ac[4])( pixel *pix, int stride );
void (*ssd_nv12_core)( pixel *pixuv1, int stride1,
void (*intra_mbcmp_x3_16x16)( pixel *fenc, pixel *fdec, int res[3] );
void (*intra_satd_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] );
void (*intra_sad_x3_16x16) ( pixel *fenc, pixel *fdec, int res[3] );
- void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] );
- void (*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] );
- void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] );
void (*intra_mbcmp_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] );
void (*intra_satd_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] );
void (*intra_sad_x3_4x4) ( pixel *fenc, pixel *fdec, int res[3] );
+ void (*intra_mbcmp_x3_chroma)( pixel *fenc, pixel *fdec, int res[3] );
+ void (*intra_satd_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] );
+ void (*intra_sad_x3_chroma) ( pixel *fenc, pixel *fdec, int res[3] );
+ void (*intra_mbcmp_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] );
+ void (*intra_satd_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] );
+ void (*intra_sad_x3_8x16c) ( pixel *fenc, pixel *fdec, int res[3] );
+ void (*intra_mbcmp_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] );
+ void (*intra_satd_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] );
+ void (*intra_sad_x3_8x8c) ( pixel *fenc, pixel *fdec, int res[3] );
void (*intra_mbcmp_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] );
void (*intra_sa8d_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] );
void (*intra_sad_x3_8x8) ( pixel *fenc, pixel edge[36], int res[3] );
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
+ * Henrik Gramner <hengar-6@student.ltu.se>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
/****************************************************************************
- * 8x8 prediction for intra chroma block
+ * 8x8 prediction for intra chroma block (4:2:0)
****************************************************************************/
static void x264_predict_8x8c_dc_128_c( pixel *src )
}
}
+/****************************************************************************
+ * 8x16 prediction for intra chroma block (4:2:2)
+ ****************************************************************************/
+
+static void x264_predict_8x16c_dc_128_c( pixel *src )
+{
+ for( int y = 0; y < 16; y++ )
+ {
+ MPIXEL_X4( src+0 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
+ MPIXEL_X4( src+4 ) = PIXEL_SPLAT_X4( 1 << (BIT_DEPTH-1) );
+ src += FDEC_STRIDE;
+ }
+}
+static void x264_predict_8x16c_dc_left_c( pixel *src )
+{
+ for( int i = 0; i < 4; i++ )
+ {
+ int dc = 0;
+
+ for( int y = 0; y < 4; y++ )
+ dc += src[y*FDEC_STRIDE - 1];
+
+ pixel4 dcsplat = PIXEL_SPLAT_X4( (dc + 2) >> 2 );
+
+ for( int y = 0; y < 4; y++ )
+ {
+ MPIXEL_X4( src+0 ) = dcsplat;
+ MPIXEL_X4( src+4 ) = dcsplat;
+ src += FDEC_STRIDE;
+ }
+ }
+}
+static void x264_predict_8x16c_dc_top_c( pixel *src )
+{
+ int dc0 = 0, dc1 = 0;
+
+ for( int x = 0; x < 4; x++ )
+ {
+ dc0 += src[x - FDEC_STRIDE];
+ dc1 += src[x + 4 - FDEC_STRIDE];
+ }
+ pixel4 dc0splat = PIXEL_SPLAT_X4( ( dc0 + 2 ) >> 2 );
+ pixel4 dc1splat = PIXEL_SPLAT_X4( ( dc1 + 2 ) >> 2 );
+
+ for( int y = 0; y < 16; y++ )
+ {
+ MPIXEL_X4( src+0 ) = dc0splat;
+ MPIXEL_X4( src+4 ) = dc1splat;
+ src += FDEC_STRIDE;
+ }
+}
+void x264_predict_8x16c_dc_c( pixel *src )
+{
+ int s0 = 0, s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0;
+
+ /*
+       s0 s1
+    s2
+    s3
+    s4
+    s5
+ */
+ for( int i = 0; i < 4; i++ )
+ {
+ s0 += src[i+0 - FDEC_STRIDE];
+ s1 += src[i+4 - FDEC_STRIDE];
+ s2 += src[-1 + (i+0) * FDEC_STRIDE];
+ s3 += src[-1 + (i+4) * FDEC_STRIDE];
+ s4 += src[-1 + (i+8) * FDEC_STRIDE];
+ s5 += src[-1 + (i+12) * FDEC_STRIDE];
+ }
+ /*
+ dc0 dc1
+ dc2 dc3
+ dc4 dc5
+ dc6 dc7
+ */
+ pixel4 dc0 = PIXEL_SPLAT_X4( ( s0 + s2 + 4 ) >> 3 );
+ pixel4 dc1 = PIXEL_SPLAT_X4( ( s1 + 2 ) >> 2 );
+ pixel4 dc2 = PIXEL_SPLAT_X4( ( s3 + 2 ) >> 2 );
+ pixel4 dc3 = PIXEL_SPLAT_X4( ( s1 + s3 + 4 ) >> 3 );
+ pixel4 dc4 = PIXEL_SPLAT_X4( ( s4 + 2 ) >> 2 );
+ pixel4 dc5 = PIXEL_SPLAT_X4( ( s1 + s4 + 4 ) >> 3 );
+ pixel4 dc6 = PIXEL_SPLAT_X4( ( s5 + 2 ) >> 2 );
+ pixel4 dc7 = PIXEL_SPLAT_X4( ( s1 + s5 + 4 ) >> 3 );
+
+ for( int y = 0; y < 4; y++ )
+ {
+ MPIXEL_X4( src+0 ) = dc0;
+ MPIXEL_X4( src+4 ) = dc1;
+ src += FDEC_STRIDE;
+ }
+ for( int y = 0; y < 4; y++ )
+ {
+ MPIXEL_X4( src+0 ) = dc2;
+ MPIXEL_X4( src+4 ) = dc3;
+ src += FDEC_STRIDE;
+ }
+ for( int y = 0; y < 4; y++ )
+ {
+ MPIXEL_X4( src+0 ) = dc4;
+ MPIXEL_X4( src+4 ) = dc5;
+ src += FDEC_STRIDE;
+ }
+ for( int y = 0; y < 4; y++ )
+ {
+ MPIXEL_X4( src+0 ) = dc6;
+ MPIXEL_X4( src+4 ) = dc7;
+ src += FDEC_STRIDE;
+ }
+}
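The dc0..dc7 assignments implement the spec's per-4x4 rule: each sub-block averages whichever macroblock-external edge sums apply to it, which is why the right-hand column keeps reusing the top sum s1 all the way down while the left column below the first row uses only its left sums. A reduced sketch of the per-sub-block rule (hypothetical helper, 8-bit case):

    static int dc_4x4( int sum_top, int sum_left, int has_top, int has_left )
    {
        if( has_top && has_left )
            return (sum_top + sum_left + 4) >> 3; /* mean of 8 samples, rounded */
        if( has_top )
            return (sum_top + 2) >> 2;            /* mean of 4 samples */
        if( has_left )
            return (sum_left + 2) >> 2;
        return 128;                               /* no neighbors: 1 << (BIT_DEPTH-1) */
    }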
+void x264_predict_8x16c_h_c( pixel *src )
+{
+ for( int i = 0; i < 16; i++ )
+ {
+ pixel4 v = PIXEL_SPLAT_X4( src[-1] );
+ MPIXEL_X4( src+0 ) = v;
+ MPIXEL_X4( src+4 ) = v;
+ src += FDEC_STRIDE;
+ }
+}
+void x264_predict_8x16c_v_c( pixel *src )
+{
+ pixel4 v0 = MPIXEL_X4( src+0-FDEC_STRIDE );
+ pixel4 v1 = MPIXEL_X4( src+4-FDEC_STRIDE );
+
+ for( int i = 0; i < 16; i++ )
+ {
+ MPIXEL_X4( src+0 ) = v0;
+ MPIXEL_X4( src+4 ) = v1;
+ src += FDEC_STRIDE;
+ }
+}
+void x264_predict_8x16c_p_c( pixel *src )
+{
+ int H = 0;
+ int V = 0;
+
+ for( int i = 0; i < 4; i++ )
+ H += ( i + 1 ) * ( src[4 + i - FDEC_STRIDE] - src[2 - i - FDEC_STRIDE] );
+ for( int i = 0; i < 8; i++ )
+ V += ( i + 1 ) * ( src[-1 + (i+8)*FDEC_STRIDE] - src[-1 + (6-i)*FDEC_STRIDE] );
+
+ int a = 16 * ( src[-1 + 15*FDEC_STRIDE] + src[7 - FDEC_STRIDE] );
+ int b = ( 17 * H + 16 ) >> 5;
+ int c = ( 5 * V + 32 ) >> 6;
+ int i00 = a -3*b -7*c + 16;
+
+ for( int y = 0; y < 16; y++ )
+ {
+ int pix = i00;
+ for( int x = 0; x < 8; x++ )
+ {
+ src[x] = x264_clip_pixel( pix>>5 );
+ pix += b;
+ }
+ src += FDEC_STRIDE;
+ i00 += c;
+ }
+}
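In closed form, the loop above computes pred(x,y) = clip( (a + b*(x-3) + c*(y-7) + 16) >> 5 ): the i00 seed folds in the -3b - 7c offsets and the per-pixel additions replace the multiplies. The asymmetric scale factors come from the block shape: b = (17*H + 16) >> 5 matches the 8-wide top edge as in 8x8c planar prediction, while c = (5*V + 32) >> 6 matches the 16-tall left edge as in 16x16.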
+
/****************************************************************************
* 4x4 prediction for intra luma block
****************************************************************************/
#endif
}
+void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
+{
+ pf[I_PRED_CHROMA_V ] = x264_predict_8x16c_v_c;
+ pf[I_PRED_CHROMA_H ] = x264_predict_8x16c_h_c;
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x16c_dc_c;
+ pf[I_PRED_CHROMA_P ] = x264_predict_8x16c_p_c;
+ pf[I_PRED_CHROMA_DC_LEFT]= x264_predict_8x16c_dc_left_c;
+ pf[I_PRED_CHROMA_DC_TOP ]= x264_predict_8x16c_dc_top_c;
+ pf[I_PRED_CHROMA_DC_128 ]= x264_predict_8x16c_dc_128_c;
+}
+
void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
{
pf[I_PRED_8x8_V] = x264_predict_8x8_v_c;
I_PRED_CHROMA_DC_TOP = 5,
I_PRED_CHROMA_DC_128 = 6
};
-static const uint8_t x264_mb_pred_mode8x8c_fix[7] =
+static const uint8_t x264_mb_chroma_pred_mode_fix[7] =
{
I_PRED_CHROMA_DC, I_PRED_CHROMA_H, I_PRED_CHROMA_V, I_PRED_CHROMA_P,
I_PRED_CHROMA_DC, I_PRED_CHROMA_DC,I_PRED_CHROMA_DC
void x264_predict_8x8c_h_c ( pixel *src );
void x264_predict_8x8c_v_c ( pixel *src );
void x264_predict_8x8c_p_c ( pixel *src );
+void x264_predict_8x16c_dc_c( pixel *src );
+void x264_predict_8x16c_h_c ( pixel *src );
+void x264_predict_8x16c_v_c ( pixel *src );
+void x264_predict_8x16c_p_c ( pixel *src );
void x264_predict_16x16_init ( int cpu, x264_predict_t pf[7] );
void x264_predict_8x8c_init ( int cpu, x264_predict_t pf[7] );
+void x264_predict_8x16c_init ( int cpu, x264_predict_t pf[7] );
void x264_predict_4x4_init ( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init ( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
* Christian Heine <sennindemokrit@gmx.net>
+ * Henrik Gramner <hengar-6@student.ltu.se>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
}
}
-static ALWAYS_INLINE void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf )
+#define IDCT_DEQUANT_2X4_START \
+ int a0 = dct[0] + dct[1]; \
+ int a1 = dct[2] + dct[3]; \
+ int a2 = dct[4] + dct[5]; \
+ int a3 = dct[6] + dct[7]; \
+ int a4 = dct[0] - dct[1]; \
+ int a5 = dct[2] - dct[3]; \
+ int a6 = dct[4] - dct[5]; \
+ int a7 = dct[6] - dct[7]; \
+ int b0 = a0 + a1; \
+ int b1 = a2 + a3; \
+ int b2 = a4 + a5; \
+ int b3 = a6 + a7; \
+ int b4 = a0 - a1; \
+ int b5 = a2 - a3; \
+ int b6 = a4 - a5; \
+ int b7 = a6 - a7;
+
+static void idct_dequant_2x4_dc( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp )
+{
+ IDCT_DEQUANT_2X4_START
+ int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
+ dct4x4[0][0] = ((b0 + b1) * dmf + 32) >> 6;
+ dct4x4[1][0] = ((b2 + b3) * dmf + 32) >> 6;
+ dct4x4[2][0] = ((b0 - b1) * dmf + 32) >> 6;
+ dct4x4[3][0] = ((b2 - b3) * dmf + 32) >> 6;
+ dct4x4[4][0] = ((b4 - b5) * dmf + 32) >> 6;
+ dct4x4[5][0] = ((b6 - b7) * dmf + 32) >> 6;
+ dct4x4[6][0] = ((b4 + b5) * dmf + 32) >> 6;
+ dct4x4[7][0] = ((b6 + b7) * dmf + 32) >> 6;
+}
+
+static void idct_dequant_2x4_dconly( dctcoef dct[8], int dequant_mf[6][16], int i_qp )
+{
+ IDCT_DEQUANT_2X4_START
+ int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
+ dct[0] = ((b0 + b1) * dmf + 32) >> 6;
+ dct[1] = ((b2 + b3) * dmf + 32) >> 6;
+ dct[2] = ((b0 - b1) * dmf + 32) >> 6;
+ dct[3] = ((b2 - b3) * dmf + 32) >> 6;
+ dct[4] = ((b4 - b5) * dmf + 32) >> 6;
+ dct[5] = ((b6 - b7) * dmf + 32) >> 6;
+ dct[6] = ((b4 + b5) * dmf + 32) >> 6;
+ dct[7] = ((b6 + b7) * dmf + 32) >> 6;
+}
+
+static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x4( dctcoef out[8], dctcoef dct[8], int dmf )
+{
+ IDCT_DEQUANT_2X4_START
+ out[0] = ((b0 + b1) * dmf + 2080) >> 6; /* 2080 = 32 + (32<<6) */
+ out[1] = ((b2 + b3) * dmf + 2080) >> 6;
+ out[2] = ((b0 - b1) * dmf + 2080) >> 6;
+ out[3] = ((b2 - b3) * dmf + 2080) >> 6;
+ out[4] = ((b4 - b5) * dmf + 2080) >> 6;
+ out[5] = ((b6 - b7) * dmf + 2080) >> 6;
+ out[6] = ((b4 + b5) * dmf + 2080) >> 6;
+ out[7] = ((b6 + b7) * dmf + 2080) >> 6;
+}
+#undef IDCT_DEQUANT_2X4_START
+
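The butterfly in IDCT_DEQUANT_2X4_START is a 2x4 inverse Walsh-Hadamard transform of the chroma DC block. A quick standalone sanity check of the DC-only case (illustrative sketch, reusing the function above):

    static void check_idct_dequant_2x4_dconly( void )
    {
        dctcoef dct[8] = { 64, 0, 0, 0, 0, 0, 0, 0 }; /* lone DC-of-DCs coefficient */
        int dequant_mf[6][16] = {{ 16 }};             /* dmf = 16 << 0 at qp 0 */
        idct_dequant_2x4_dconly( dct, dequant_mf, 0 );
        /* The energy spreads evenly: every dct[i] is now (64*16 + 32) >> 6 = 16. */
    }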
+static ALWAYS_INLINE void optimize_chroma_idct_dequant_2x2( dctcoef out[4], dctcoef dct[4], int dmf )
{
int d0 = dct[0] + dct[1];
int d1 = dct[2] + dct[3];
int d2 = dct[0] - dct[1];
int d3 = dct[2] - dct[3];
- out[0] = (d0 + d1) * dequant_mf >> 5;
- out[1] = (d0 - d1) * dequant_mf >> 5;
- out[2] = (d2 + d3) * dequant_mf >> 5;
- out[3] = (d2 - d3) * dequant_mf >> 5;
+ out[0] = ((d0 + d1) * dmf >> 5) + 32;
+ out[1] = ((d0 - d1) * dmf >> 5) + 32;
+ out[2] = ((d2 + d3) * dmf >> 5) + 32;
+ out[3] = ((d2 - d3) * dmf >> 5) + 32;
}
-static ALWAYS_INLINE int idct_dequant_round_2x2_dc( dctcoef ref[4], dctcoef dct[4], int dequant_mf )
+static ALWAYS_INLINE int optimize_chroma_round( dctcoef *ref, dctcoef *dct, int dequant_mf, int chroma422 )
{
- dctcoef out[4];
- idct_dequant_2x2_dconly( out, dct, dequant_mf );
- return ((ref[0] ^ (out[0]+32))
- | (ref[1] ^ (out[1]+32))
- | (ref[2] ^ (out[2]+32))
- | (ref[3] ^ (out[3]+32))) >> 6;
+ dctcoef out[8];
+
+ if( chroma422 )
+ optimize_chroma_idct_dequant_2x4( out, dct, dequant_mf );
+ else
+ optimize_chroma_idct_dequant_2x2( out, dct, dequant_mf );
+
+ int sum = 0;
+ for( int i = 0; i < (chroma422?8:4); i++ )
+ sum |= ref[i] ^ out[i];
+ return sum >> 6;
}
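The final reduction leans on a small identity: (a >> 6) != (b >> 6) exactly when (a ^ b) >> 6 is nonzero, since equal high bits cancel under XOR and any difference at bit 6 or above survives the shift. OR-ing the per-coefficient XORs therefore tests every reconstructed DC for a change in one branch-free expression; the +32 bias baked into both ref and out makes the >>6 a round-to-nearest comparison. Equivalently:

    static int rounding_changed( dctcoef a, dctcoef b )
    {
        return (a ^ b) >> 6; /* nonzero iff (a >> 6) != (b >> 6) */
    }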
-static int optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
+static ALWAYS_INLINE int optimize_chroma_dc_internal( dctcoef *dct, int dequant_mf, int chroma422 )
{
/* dequant_mf = h->dequant4_mf[CQM_4IC + b_inter][i_qp%6][0] << i_qp/6, max 32*64 */
- dctcoef dct_orig[4];
+ dctcoef dct_orig[8];
int coeff, nz;
- idct_dequant_2x2_dconly( dct_orig, dct, dequant_mf );
- dct_orig[0] += 32;
- dct_orig[1] += 32;
- dct_orig[2] += 32;
- dct_orig[3] += 32;
+ if( chroma422 )
+ optimize_chroma_idct_dequant_2x4( dct_orig, dct, dequant_mf );
+ else
+ optimize_chroma_idct_dequant_2x2( dct_orig, dct, dequant_mf );
/* If the DC coefficients already round to zero, terminate early. */
- if( !((dct_orig[0]|dct_orig[1]|dct_orig[2]|dct_orig[3])>>6) )
+ int sum = 0;
+ for( int i = 0; i < (chroma422?8:4); i++ )
+ sum |= dct_orig[i];
+ if( !(sum >> 6) )
return 0;
/* Start with the highest frequency coefficient... is this the best option? */
- for( nz = 0, coeff = 3; coeff >= 0; coeff-- )
+ for( nz = 0, coeff = (chroma422?7:3); coeff >= 0; coeff-- )
{
int level = dct[coeff];
- int sign = level>>31 | 1; /* dct2x2[coeff] < 0 ? -1 : 1 */
+ int sign = level>>31 | 1; /* dct[coeff] < 0 ? -1 : 1 */
while( level )
{
dct[coeff] = level - sign;
- if( idct_dequant_round_2x2_dc( dct_orig, dct, dequant_mf ) )
+ if( optimize_chroma_round( dct_orig, dct, dequant_mf, chroma422 ) )
{
nz = 1;
dct[coeff] = level;
return nz;
}
+static int optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
+{
+ return optimize_chroma_dc_internal( dct, dequant_mf, 0 );
+}
+
+static int optimize_chroma_2x4_dc( dctcoef dct[8], int dequant_mf )
+{
+ return optimize_chroma_dc_internal( dct, dequant_mf, 1 );
+}
+
static void x264_denoise_dct( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size )
{
for( int i = 0; i < size; i++ )
return x264_decimate_score_internal( dct, 64 );
}
-static int ALWAYS_INLINE x264_coeff_last_internal( dctcoef *l, int i_count )
-{
- int i_last = i_count-1;
- while( i_last >= 0 && l[i_last] == 0 )
- i_last--;
- return i_last;
+#define last(num)\
+static int x264_coeff_last##num( dctcoef *l )\
+{\
+ int i_last = num-1;\
+ while( i_last >= 0 && l[i_last] == 0 )\
+ i_last--;\
+ return i_last;\
}
-static int x264_coeff_last4( dctcoef *l )
-{
- return x264_coeff_last_internal( l, 4 );
-}
-static int x264_coeff_last15( dctcoef *l )
-{
- return x264_coeff_last_internal( l, 15 );
-}
-static int x264_coeff_last16( dctcoef *l )
-{
- return x264_coeff_last_internal( l, 16 );
-}
-static int x264_coeff_last64( dctcoef *l )
-{
- return x264_coeff_last_internal( l, 64 );
-}
+last(4)
+last(8)
+last(15)
+last(16)
+last(64)
#define level_run(num)\
static int x264_coeff_level_run##num( dctcoef *dct, x264_run_level_t *runlevel )\
}
level_run(4)
+level_run(8)
level_run(15)
level_run(16)
-
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
{
pf->quant_8x8 = quant_8x8;
pf->dequant_4x4_dc = dequant_4x4_dc;
pf->dequant_8x8 = dequant_8x8;
- pf->optimize_chroma_dc = optimize_chroma_dc;
+ pf->idct_dequant_2x4_dc = idct_dequant_2x4_dc;
+ pf->idct_dequant_2x4_dconly = idct_dequant_2x4_dconly;
+
+ pf->optimize_chroma_2x2_dc = optimize_chroma_2x2_dc;
+ pf->optimize_chroma_2x4_dc = optimize_chroma_2x4_dc;
pf->denoise_dct = x264_denoise_dct;
pf->decimate_score15 = x264_decimate_score15;
pf->decimate_score16 = x264_decimate_score16;
pf->decimate_score64 = x264_decimate_score64;
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4;
+ pf->coeff_last4 = x264_coeff_last4;
+ pf->coeff_last8 = x264_coeff_last8;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64;
- pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4;
+ pf->coeff_level_run4 = x264_coeff_level_run4;
+ pf->coeff_level_run8 = x264_coeff_level_run8;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16;
pf->decimate_score16 = x264_decimate_score16_mmx2_slowctz;
}
pf->decimate_score64 = x264_decimate_score64_mmx2;
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2;
+ pf->coeff_last4 = x264_coeff_last4_mmx2;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_mmx2;
pf->coeff_last[ DCT_LUMA_4x4] = x264_coeff_last16_mmx2;
pf->coeff_last[ DCT_LUMA_8x8] = x264_coeff_last64_mmx2;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
- pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2;
+ pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
if( cpu&X264_CPU_LZCNT )
- pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt;
+ pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
}
if( cpu&X264_CPU_SSE2 )
{
pf->coeff_level_run[DCT_LUMA_4x4] = x264_coeff_level_run16_sse2;
if( cpu&X264_CPU_LZCNT )
{
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt;
+ pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
pf->coeff_last[ DCT_LUMA_AC] = x264_coeff_last15_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_4x4] = x264_coeff_last16_sse2_lzcnt;
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_sse2_lzcnt;
pf->coeff_level_run[ DCT_LUMA_AC] = x264_coeff_level_run15_mmx2;
pf->coeff_level_run[ DCT_LUMA_4x4] = x264_coeff_level_run16_mmx2;
#endif
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2;
- pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2;
+ pf->coeff_last4 = x264_coeff_last4_mmx2;
+ pf->coeff_level_run4 = x264_coeff_level_run4_mmx2;
if( cpu&X264_CPU_LZCNT )
{
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_mmx2_lzcnt;
- pf->coeff_level_run[DCT_CHROMA_DC] = x264_coeff_level_run4_mmx2_lzcnt;
+ pf->coeff_last4 = x264_coeff_last4_mmx2_lzcnt;
+ pf->coeff_level_run4 = x264_coeff_level_run4_mmx2_lzcnt;
}
}
pf->dequant_4x4 = x264_dequant_4x4_flat16_sse2;
pf->dequant_8x8 = x264_dequant_8x8_flat16_sse2;
}
- pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse2;
+ pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse2;
pf->denoise_dct = x264_denoise_dct_sse2;
pf->decimate_score15 = x264_decimate_score15_sse2;
pf->decimate_score16 = x264_decimate_score16_sse2;
pf->quant_4x4_dc = x264_quant_4x4_dc_ssse3;
pf->quant_4x4 = x264_quant_4x4_ssse3;
pf->quant_8x8 = x264_quant_8x8_ssse3;
- pf->optimize_chroma_dc = x264_optimize_chroma_dc_ssse3;
+ pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_ssse3;
pf->denoise_dct = x264_denoise_dct_ssse3;
pf->decimate_score15 = x264_decimate_score15_ssse3;
pf->decimate_score16 = x264_decimate_score16_ssse3;
pf->quant_4x4_dc = x264_quant_4x4_dc_sse4;
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
- pf->optimize_chroma_dc = x264_optimize_chroma_dc_sse4;
+ pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_sse4;
}
if( cpu&X264_CPU_AVX )
pf->dequant_4x4 = x264_dequant_4x4_avx;
pf->dequant_8x8 = x264_dequant_8x8_avx;
}
- pf->optimize_chroma_dc = x264_optimize_chroma_dc_avx;
+ pf->optimize_chroma_2x2_dc = x264_optimize_chroma_2x2_dc_avx;
pf->denoise_dct = x264_denoise_dct_avx;
}
#endif // HAVE_MMX
#if HAVE_ARMV6
if( cpu&X264_CPU_ARMV6 )
- pf->coeff_last[DCT_CHROMA_DC] = x264_coeff_last4_arm;
+ pf->coeff_last4 = x264_coeff_last4_arm;
if( cpu&X264_CPU_NEON )
{
void (*dequant_4x4)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
void (*dequant_4x4_dc)( dctcoef dct[16], int dequant_mf[6][16], int i_qp );
- int (*optimize_chroma_dc)( dctcoef dct[4], int dequant_mf );
+ void (*idct_dequant_2x4_dc)( dctcoef dct[8], dctcoef dct4x4[8][16], int dequant_mf[6][16], int i_qp );
+ void (*idct_dequant_2x4_dconly)( dctcoef dct[8], int dequant_mf[6][16], int i_qp );
+
+ int (*optimize_chroma_2x2_dc)( dctcoef dct[4], int dequant_mf );
+ int (*optimize_chroma_2x4_dc)( dctcoef dct[8], int dequant_mf );
void (*denoise_dct)( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int (*decimate_score16)( dctcoef *dct );
int (*decimate_score64)( dctcoef *dct );
int (*coeff_last[14])( dctcoef *dct );
+ int (*coeff_last4)( dctcoef *dct );
+ int (*coeff_last8)( dctcoef *dct );
int (*coeff_level_run[13])( dctcoef *dct, x264_run_level_t *runlevel );
+ int (*coeff_level_run4)( dctcoef *dct, x264_run_level_t *runlevel );
+ int (*coeff_level_run8)( dctcoef *dct, x264_run_level_t *runlevel );
} x264_quant_function_t;
void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf );
PROFILE_HIGH = 100,
PROFILE_HIGH10 = 110,
PROFILE_HIGH422 = 122,
- PROFILE_HIGH444 = 144,
PROFILE_HIGH444_PREDICTIVE = 244,
};
+enum chroma_format_e
+{
+ CHROMA_400 = 0,
+ CHROMA_420 = 1,
+ CHROMA_422 = 2,
+ CHROMA_444 = 3,
+};
+
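These values intentionally equal the H.264 chroma_format_idc codes (0 monochrome, 1 4:2:0, 2 4:2:2, 3 4:4:4), so they can be written into the SPS as-is; a sketch (SPS field name assumed):

    sps->i_chroma_format_idc = CHROMA_422; /* coded as ue(v) = 2 */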
enum cqm4_e
{
CQM_4IY = 0,
*
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Fiona Glaser <fiona@x264.com>
+ * Henrik Gramner <hengar-6@student.ltu.se>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
#include "common.h"
-const vlc_t x264_coeff0_token[5] =
+/* [nC] */
+const vlc_t x264_coeff0_token[6] =
{
{ 0x1, 1 }, /* str=1 */
{ 0x3, 2 }, /* str=11 */
{ 0xf, 4 }, /* str=1111 */
{ 0x3, 6 }, /* str=000011 */
{ 0x1, 2 }, /* str=01 */
+ { 0x1, 1 }, /* str=1 */
};
-const vlc_t x264_coeff_token[5][16][4] =
+/* [nC][i_total_coeff-1][i_trailing] */
+const vlc_t x264_coeff_token[6][16][4] =
{
{ /* table 0 */
{ /* i_total 1 */
{ 0x0, 7 }, /* str=0000000 */
},
},
+ { /* table 5 */
+ { /* i_total 1 */
+ { 0xf, 7 }, /* str=0001111 */
+ { 0x1, 2 }, /* str=01 */
+ },
+ { /* i_total 2 */
+ { 0xe, 7 }, /* str=0001110 */
+ { 0xd, 7 }, /* str=0001101 */
+ { 0x1, 3 }, /* str=001 */
+ },
+ { /* i_total 3 */
+ { 0x7, 9 }, /* str=000000111 */
+ { 0xc, 7 }, /* str=0001100 */
+ { 0xb, 7 }, /* str=0001011 */
+ { 0x1, 5 }, /* str=00001 */
+ },
+ { /* i_total 4 */
+ { 0x6, 9 }, /* str=000000110 */
+ { 0x5, 9 }, /* str=000000101 */
+ { 0xa, 7 }, /* str=0001010 */
+ { 0x1, 6 }, /* str=000001 */
+ },
+ { /* i_total 5 */
+ { 0x7, 10 }, /* str=0000000111 */
+ { 0x6, 10 }, /* str=0000000110 */
+ { 0x4, 9 }, /* str=000000100 */
+ { 0x9, 7 }, /* str=0001001 */
+ },
+ { /* i_total 6 */
+ { 0x7, 11 }, /* str=00000000111 */
+ { 0x6, 11 }, /* str=00000000110 */
+ { 0x5, 10 }, /* str=0000000101 */
+ { 0x8, 7 }, /* str=0001000 */
+ },
+ { /* i_total 7 */
+ { 0x7, 12 }, /* str=000000000111 */
+ { 0x6, 12 }, /* str=000000000110 */
+ { 0x5, 11 }, /* str=00000000101 */
+ { 0x4, 10 }, /* str=0000000100 */
+ },
+ { /* i_total 8 */
+ { 0x7, 13 }, /* str=0000000000111 */
+ { 0x5, 12 }, /* str=000000000101 */
+ { 0x4, 12 }, /* str=000000000100 */
+ { 0x4, 11 }, /* str=00000000100 */
+ },
+ },
};
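Each vlc_t entry pairs the code bits with the code length, as the str= comments spell out. A sketch of emitting one token from the new table into an open bitstream (the bs_write helper and the vlc_t field names are assumptions from the rest of the codebase):

    /* coeff_token for 4:2:2 chroma DC, i_total_coeff == 1, one trailing one:
     * entry { 0x1, 2 }, i.e. the bit string "01". */
    vlc_t v = x264_coeff_token[5][0][1];
    bs_write( &bs, v.i_size, v.i_bits );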
/* [i_total_coeff-1][i_total_zeros] */
};
/* [i_total_coeff-1][i_total_zeros] */
-const vlc_t x264_total_zeros_dc[3][4] =
+const vlc_t x264_total_zeros_2x2_dc[3][4] =
{
{ /* i_total 1 */
{ 0x1, 1 }, /* str=1 */
},
};
-/* x264_run_before[__MIN( i_zero_left -1, 6 )][run_before] */
+/* [i_total_coeff-1][i_total_zeros] */
+const vlc_t x264_total_zeros_2x4_dc[7][8] =
+{
+ { /* i_total 1 */
+ { 0x1, 1 }, /* str=1 */
+ { 0x2, 3 }, /* str=010 */
+ { 0x3, 3 }, /* str=011 */
+ { 0x2, 4 }, /* str=0010 */
+ { 0x3, 4 }, /* str=0011 */
+ { 0x1, 4 }, /* str=0001 */
+ { 0x1, 5 }, /* str=00001 */
+ { 0x0, 5 }, /* str=00000 */
+ },
+ { /* i_total 2 */
+ { 0x0, 3 }, /* str=000 */
+ { 0x1, 2 }, /* str=01 */
+ { 0x1, 3 }, /* str=001 */
+ { 0x4, 3 }, /* str=100 */
+ { 0x5, 3 }, /* str=101 */
+ { 0x6, 3 }, /* str=110 */
+ { 0x7, 3 }, /* str=111 */
+ },
+ { /* i_total 3 */
+ { 0x0, 3 }, /* str=000 */
+ { 0x1, 3 }, /* str=001 */
+ { 0x1, 2 }, /* str=01 */
+ { 0x2, 2 }, /* str=10 */
+ { 0x6, 3 }, /* str=110 */
+ { 0x7, 3 }, /* str=111 */
+ },
+ { /* i_total 4 */
+ { 0x6, 3 }, /* str=110 */
+ { 0x0, 2 }, /* str=00 */
+ { 0x1, 2 }, /* str=01 */
+ { 0x2, 2 }, /* str=10 */
+ { 0x7, 3 }, /* str=111 */
+ },
+ { /* i_total 5 */
+ { 0x0, 2 }, /* str=00 */
+ { 0x1, 2 }, /* str=01 */
+ { 0x2, 2 }, /* str=10 */
+ { 0x3, 2 }, /* str=11 */
+ },
+ { /* i_total 6 */
+ { 0x0, 2 }, /* str=00 */
+ { 0x1, 2 }, /* str=01 */
+ { 0x1, 1 }, /* str=1 */
+ },
+ { /* i_total 7 */
+ { 0x0, 1 }, /* str=0 */
+ { 0x1, 1 }, /* str=1 */
+ }
+};
+
+/* [MIN( i_zero_left-1, 6 )][run_before] */
const vlc_t x264_run_before[7][16] =
{
{ /* i_zero_left 1 */
{ 0x5, 3 }, /* str=101 */
{ 0x4, 3 }, /* str=100 */
},
- { /* i_zero_left 7 */
+ { /* i_zero_left >6 */
{ 0x7, 3 }, /* str=111 */
{ 0x6, 3 }, /* str=110 */
{ 0x5, 3 }, /* str=101 */
RET
;-----------------------------------------------------------------------------
-; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv )
+; void store_interleave_chroma( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv, int height )
;-----------------------------------------------------------------------------
-cglobal store_interleave_8x8x2, 4,5
- mov r4d, 4
+cglobal store_interleave_chroma, 5,5
FIX_STRIDES r1d
.loop:
INTERLEAVE r0+ 0, r2+ 0, r3+ 0, a
add r2, FDEC_STRIDEB*2
add r3, FDEC_STRIDEB*2
lea r0, [r0+r1*2]
- dec r4d
+ sub r4d, 2
jg .loop
REP_RET
%endmacro ; PLANE_INTERLEAVE
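For reference, a C sketch of what the rewritten routine does with its new height parameter (signature as in the prototypes later in this patch; the planar sources are FDEC_STRIDE apart, as at the call sites):

    static void store_interleave_chroma_c( pixel *dst, int i_dst,
                                           pixel *srcu, pixel *srcv, int height )
    {
        for( int y = 0; y < height; y++, dst += i_dst, srcu += FDEC_STRIDE, srcv += FDEC_STRIDE )
            for( int x = 0; x < 8; x++ )
            {
                dst[2*x]   = srcu[x]; /* NV12-style packed UV */
                dst[2*x+1] = srcv[x];
            }
    }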
REP_RET
;-----------------------------------------------------------------------------
-; void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src )
+; void load_deinterleave_chroma_fenc( pixel *dst, pixel *src, int i_src, int height )
;-----------------------------------------------------------------------------
-cglobal load_deinterleave_8x8x2_fenc, 3,4
+cglobal load_deinterleave_chroma_fenc, 4,4
DEINTERLEAVE_START
- mov r3d, 4
FIX_STRIDES r2d
.loop:
DEINTERLEAVE r0+ 0, r0+FENC_STRIDEB*1/2, r1+ 0, 1, m4, a
DEINTERLEAVE r0+FENC_STRIDEB, r0+FENC_STRIDEB*3/2, r1+r2, 1, m4, a
add r0, FENC_STRIDEB*2
lea r1, [r1+r2*2]
- dec r3d
+ sub r3d, 2
jg .loop
REP_RET
;-----------------------------------------------------------------------------
-; void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src )
+; void load_deinterleave_chroma_fdec( pixel *dst, pixel *src, int i_src, int height )
;-----------------------------------------------------------------------------
-cglobal load_deinterleave_8x8x2_fdec, 3,4
+cglobal load_deinterleave_chroma_fdec, 4,4
DEINTERLEAVE_START
- mov r3d, 4
FIX_STRIDES r2d
.loop:
DEINTERLEAVE r0+ 0, r0+FDEC_STRIDEB*1/2, r1+ 0, 0, m4, a
DEINTERLEAVE r0+FDEC_STRIDEB, r0+FDEC_STRIDEB*3/2, r1+r2, 0, m4, a
add r0, FDEC_STRIDEB*2
lea r1, [r1+r2*2]
- dec r3d
+ sub r3d, 2
jg .loop
REP_RET
%endmacro ; PLANE_DEINTERLEAVE
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, int i_dstu,
uint16_t *dstv, int i_dstv,
uint16_t *src, int i_src, int w, int h );
-void x264_store_interleave_8x8x2_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
-void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
-void x264_store_interleave_8x8x2_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
-void x264_load_deinterleave_8x8x2_fenc_mmx( pixel *dst, pixel *src, int i_src );
-void x264_load_deinterleave_8x8x2_fenc_sse2( pixel *dst, pixel *src, int i_src );
-void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src );
-void x264_load_deinterleave_8x8x2_fenc_avx( uint16_t *dst, uint16_t *src, int i_src );
-void x264_load_deinterleave_8x8x2_fdec_mmx( pixel *dst, pixel *src, int i_src );
-void x264_load_deinterleave_8x8x2_fdec_sse2( pixel *dst, pixel *src, int i_src );
-void x264_load_deinterleave_8x8x2_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src );
-void x264_load_deinterleave_8x8x2_fdec_avx( uint16_t *dst, uint16_t *src, int i_src );
+void x264_store_interleave_chroma_mmx2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_store_interleave_chroma_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_store_interleave_chroma_avx( pixel *dst, int i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fenc_mmx( pixel *dst, pixel *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fdec_mmx( pixel *dst, pixel *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src, int height );
+void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, int i_src, int height );
void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
void x264_memzero_aligned_mmx( void * dst, int n );
if( !(cpu&X264_CPU_MMX) )
return;
- pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_mmx;
- pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_mmx;
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_mmx;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
pf->plane_copy = x264_plane_copy_mmx2;
pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
- pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2;
+ pf->store_interleave_chroma = x264_store_interleave_chroma_mmx2;
pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmx2;
pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_mmx2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
- pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2;
- pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
- pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2;
+ pf->store_interleave_chroma = x264_store_interleave_chroma_sse2;
pf->offsetadd = x264_mc_offsetadd_wtab_sse2;
pf->offsetsub = x264_mc_offsetsub_wtab_sse2;
if( !(cpu&X264_CPU_AVX) )
return;
- pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_avx;
- pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_avx;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx;
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
- pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_avx;
+ pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
if( cpu&X264_CPU_SSE2_IS_FAST )
{
- pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; // FIXME sse2fast? sse2medium?
- pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2;
- pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2;
+ pf->store_interleave_chroma = x264_store_interleave_chroma_sse2; // FIXME sse2fast? sse2medium?
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_sse2;
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_sse2;
pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
pf->mc_luma = mc_luma_sse2;
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
- pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_ssse3;
- pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_ssse3;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_ssse3;
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_ssse3;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
pf->hpel_filter = x264_hpel_filter_ssse3;
%endif
;-----------------------------------------------------------------------------
-; x264_optimize_chroma_dc( dctcoef dct[4], int dequant_mf )
+; x264_optimize_chroma_2x2_dc( dctcoef dct[4], int dequant_mf )
;-----------------------------------------------------------------------------
-%macro OPTIMIZE_CHROMA_DC 0
+%macro OPTIMIZE_CHROMA_2x2_DC 0
%assign %%regs 5
%if cpuflag(sse4)
%assign %%regs %%regs-1
%ifndef ARCH_X86_64
%assign %%regs %%regs+1 ; t0-t4 are volatile on x86-64
%endif
-cglobal optimize_chroma_dc, 0,%%regs,7
+cglobal optimize_chroma_2x2_dc, 0,%%regs,7
movifnidn t0, r0mp
movd m2, r1m
movq m1, [t0]
%ifndef HIGH_BIT_DEPTH
INIT_XMM sse2
-OPTIMIZE_CHROMA_DC
+OPTIMIZE_CHROMA_2x2_DC
INIT_XMM ssse3
-OPTIMIZE_CHROMA_DC
+OPTIMIZE_CHROMA_2x2_DC
INIT_XMM sse4
-OPTIMIZE_CHROMA_DC
+OPTIMIZE_CHROMA_2x2_DC
INIT_XMM avx
-OPTIMIZE_CHROMA_DC
+OPTIMIZE_CHROMA_2x2_DC
%endif ; !HIGH_BIT_DEPTH
%ifdef HIGH_BIT_DEPTH
void x264_dequant_8x8_flat16_mmx( int16_t dct[64], int dequant_mf[6][64], int i_qp );
void x264_dequant_4x4_flat16_sse2( int16_t dct[16], int dequant_mf[6][16], int i_qp );
void x264_dequant_8x8_flat16_sse2( int16_t dct[64], int dequant_mf[6][64], int i_qp );
-int x264_optimize_chroma_dc_sse2( dctcoef dct[4], int dequant_mf );
-int x264_optimize_chroma_dc_ssse3( dctcoef dct[4], int dequant_mf );
-int x264_optimize_chroma_dc_sse4( dctcoef dct[4], int dequant_mf );
-int x264_optimize_chroma_dc_avx( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_2x2_dc_sse2( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_2x2_dc_ssse3( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_2x2_dc_sse4( dctcoef dct[4], int dequant_mf );
+int x264_optimize_chroma_2x2_dc_avx( dctcoef dct[4], int dequant_mf );
void x264_denoise_dct_mmx ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_sse2 ( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
void x264_denoise_dct_ssse3( dctcoef *dct, uint32_t *sum, udctcoef *offset, int size );
int i_satd_pcm;
/* Chroma part */
- int i_satd_i8x8chroma;
- int i_satd_i8x8chroma_dir[7];
+ int i_satd_chroma;
+ int i_satd_chroma_dir[7];
int i_predict8x8chroma;
/* II: Inter part P/B frame */
a->i_satd_i16x16 =
a->i_satd_i8x8 =
a->i_satd_i4x4 =
- a->i_satd_i8x8chroma = COST_MAX;
+ a->i_satd_chroma = COST_MAX;
/* non-RD PCM decision is inaccurate (as is psy-rd), so don't do it */
a->i_satd_pcm = !h->mb.i_psy_rd && a->i_mbrd ? ((uint64_t)X264_PCM_COST*a->i_lambda2 + 128) >> 8 : COST_MAX;
{I_PRED_16x16_V, I_PRED_16x16_H, I_PRED_16x16_DC, I_PRED_16x16_P, -1},
};
-static const int8_t i8x8chroma_mode_available[5][5] =
+static const int8_t chroma_mode_available[5][5] =
{
{I_PRED_CHROMA_DC_128, -1, -1, -1, -1},
{I_PRED_CHROMA_DC_LEFT, I_PRED_CHROMA_H, -1, -1, -1},
return i16x16_mode_available[idx];
}
-static ALWAYS_INLINE const int8_t *predict_8x8chroma_mode_available( int i_neighbour )
+static ALWAYS_INLINE const int8_t *predict_chroma_mode_available( int i_neighbour )
{
int idx = i_neighbour & (MB_TOP|MB_LEFT|MB_TOPLEFT);
idx = (idx == (MB_TOP|MB_LEFT|MB_TOPLEFT)) ? 4 : idx & (MB_TOP|MB_LEFT);
- return i8x8chroma_mode_available[idx];
+ return chroma_mode_available[idx];
}
static ALWAYS_INLINE const int8_t *predict_8x8_mode_available( int force_intra, int i_neighbour, int i )
static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
{
- if( a->i_satd_i8x8chroma < COST_MAX )
+ if( a->i_satd_chroma < COST_MAX )
return;
if( CHROMA444 )
{
if( !h->mb.b_chroma_me )
{
- a->i_satd_i8x8chroma = 0;
+ a->i_satd_chroma = 0;
return;
}
/* Cheap approximation of chroma costs to avoid a full i4x4/i8x8 analysis. */
h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[1] );
h->predict_16x16[a->i_predict16x16]( h->mb.pic.p_fdec[2] );
- a->i_satd_i8x8chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
- + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+ a->i_satd_chroma = h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE )
+ + h->pixf.mbcmp[PIXEL_16x16]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
return;
}
- const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
+ const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
+ int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
- /* 8x8 prediction selection for chroma */
+ /* Prediction selection for chroma */
if( predict_mode[3] >= 0 && !h->mb.b_lossless )
{
int satdu[4], satdv[4];
- h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
- h->pixf.intra_mbcmp_x3_8x8c( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
- h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
- h->predict_8x8c[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
- satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
- satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
+ h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[1], h->mb.pic.p_fdec[1], satdu );
+ h->pixf.intra_mbcmp_x3_chroma( h->mb.pic.p_fenc[2], h->mb.pic.p_fdec[2], satdv );
+ h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[1] );
+ h->predict_chroma[I_PRED_CHROMA_P]( h->mb.pic.p_fdec[2] );
+ satdu[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE );
+ satdv[I_PRED_CHROMA_P] = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE );
for( ; *predict_mode >= 0; predict_mode++ )
{
int i_mode = *predict_mode;
int i_satd = satdu[i_mode] + satdv[i_mode] + a->i_lambda * bs_size_ue( i_mode );
- a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
- COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
+ a->i_satd_chroma_dir[i_mode] = i_satd;
+ COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
}
}
else
/* we do the prediction */
if( h->mb.b_lossless )
- x264_predict_lossless_8x8_chroma( h, i_mode );
+ x264_predict_lossless_chroma( h, i_mode );
else
{
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
}
/* we calculate the cost */
- i_satd = h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
- h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
- a->i_lambda * bs_size_ue( x264_mb_pred_mode8x8c_fix[i_mode] );
+ i_satd = h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE ) +
+ h->pixf.mbcmp[chromapix]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE ) +
+ a->i_lambda * bs_size_ue( x264_mb_chroma_pred_mode_fix[i_mode] );
- a->i_satd_i8x8chroma_dir[i_mode] = i_satd;
- COPY2_IF_LT( a->i_satd_i8x8chroma, i_satd, a->i_predict8x8chroma, i_mode );
+ a->i_satd_chroma_dir[i_mode] = i_satd;
+ COPY2_IF_LT( a->i_satd_chroma, i_satd, a->i_predict8x8chroma, i_mode );
}
}
/* RD selection for chroma prediction */
if( !CHROMA444 )
{
- const int8_t *predict_mode = predict_8x8chroma_mode_available( h->mb.i_neighbour_intra );
+ const int8_t *predict_mode = predict_chroma_mode_available( h->mb.i_neighbour_intra );
if( predict_mode[1] >= 0 )
{
int8_t predict_mode_sorted[4];
int i_max;
- int i_thresh = a->b_early_terminate ? a->i_satd_i8x8chroma * 5/4 : COST_MAX;
+ int i_thresh = a->b_early_terminate ? a->i_satd_chroma * 5/4 : COST_MAX;
for( i_max = 0; *predict_mode >= 0; predict_mode++ )
{
int i_mode = *predict_mode;
- if( a->i_satd_i8x8chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
+ if( a->i_satd_chroma_dir[i_mode] < i_thresh && i_mode != a->i_predict8x8chroma )
predict_mode_sorted[i_max++] = i_mode;
}
/* the previous thing encoded was x264_intra_rd(), so the pixels and
* coefs for the current chroma mode are still around, so we only
* have to recount the bits. */
- i_best = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
+ i_best = x264_rd_cost_chroma( h, i_chroma_lambda, a->i_predict8x8chroma, 0 );
for( int i = 0; i < i_max; i++ )
{
int i_mode = predict_mode_sorted[i];
if( h->mb.b_lossless )
- x264_predict_lossless_8x8_chroma( h, i_mode );
+ x264_predict_lossless_chroma( h, i_mode );
else
{
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
}
/* if we've already found a mode that needs no residual, then
* probably any mode with a residual will be worse.
* so avoid dct on the remaining modes to improve speed. */
- i_satd = x264_rd_cost_i8x8_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
+ i_satd = x264_rd_cost_chroma( h, i_chroma_lambda, i_mode, h->mb.i_cbp_chroma != 0x00 );
COPY3_IF_LT( i_best, i_satd, a->i_predict8x8chroma, i_mode, i_cbp_chroma_best, h->mb.i_cbp_chroma );
}
h->mb.i_chroma_pred_mode = a->i_predict8x8chroma;
#define LOAD_FENC(m, src, xoff, yoff) \
{ \
- int s = !CHROMA444; \
(m)->p_cost_mv = a->p_cost_mv; \
(m)->i_stride[0] = h->mb.pic.i_stride[0]; \
(m)->i_stride[1] = h->mb.pic.i_stride[1]; \
(m)->i_stride[2] = h->mb.pic.i_stride[2]; \
(m)->p_fenc[0] = &(src)[0][(xoff)+(yoff)*FENC_STRIDE]; \
- (m)->p_fenc[1] = &(src)[1][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \
- (m)->p_fenc[2] = &(src)[2][((xoff)>>s)+((yoff)>>s)*FENC_STRIDE]; \
+ (m)->p_fenc[1] = &(src)[1][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \
+ (m)->p_fenc[2] = &(src)[2][((xoff)>>h->mb.chroma_h_shift)+((yoff)>>h->mb.chroma_v_shift)*FENC_STRIDE]; \
}
#define LOAD_HPELS(m, src, list, ref, xoff, yoff) \
(m)->p_fref[11] = &(src)[11][(xoff)+(yoff)*(m)->i_stride[2]]; \
} \
else \
- (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \
+ (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>h->mb.chroma_v_shift)*(m)->i_stride[1]]; \
(m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->weight = x264_weight_none; \
(m)->i_ref = ref; \
a->l0.i_cost8x16 = a->l0.me8x16[0].cost + a->l0.me8x16[1].cost;
}
-static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
+static ALWAYS_INLINE int x264_mb_analyse_inter_p4x4_chroma_internal( x264_t *h, x264_mb_analysis_t *a,
+ pixel **p_fref, int i8x8, int size, int chroma )
{
- ALIGNED_ARRAY_16( pixel, pix1,[16*8] );
+ ALIGNED_ARRAY_16( pixel, pix1,[16*16] );
pixel *pix2 = pix1+8;
- const int i_stride = h->mb.pic.i_stride[1];
- const int or = 8*(i8x8&1) + 2*(i8x8&2)*i_stride;
- const int i_ref = a->l0.me8x8[i8x8].i_ref;
- const int mvy_offset = MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int i_stride = h->mb.pic.i_stride[1];
+ int chroma_h_shift = chroma <= CHROMA_422;
+ int chroma_v_shift = chroma == CHROMA_420;
+ int or = 8*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*i_stride;
+ int i_ref = a->l0.me8x8[i8x8].i_ref;
+ int mvy_offset = chroma_v_shift && MB_INTERLACED & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
x264_weight_t *weight = h->sh.weight[i_ref];
// FIXME weight can be done on 4x4 blocks even if mc is smaller
#define CHROMA4x4MC( width, height, me, x, y ) \
- if( CHROMA444 ) \
+ if( chroma == CHROMA_444 ) \
{ \
int mvx = (me).mv[0] + 4*2*x; \
int mvy = (me).mv[1] + 4*2*y; \
} \
else \
{ \
- h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x*2+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+ int offset = x + (2>>chroma_v_shift)*16*y; \
+ int chroma_height = (2>>chroma_v_shift)*height; \
+ h->mc.mc_chroma( &pix1[offset], &pix2[offset], 16, &p_fref[4][or+2*x+(2>>chroma_v_shift)*y*i_stride], i_stride, \
+ (me).mv[0], (2>>chroma_v_shift)*((me).mv[1]+mvy_offset), width, chroma_height ); \
if( weight[1].weightfn ) \
- weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
+ weight[1].weightfn[width>>2]( &pix1[offset], 16, &pix1[offset], 16, &weight[1], chroma_height ); \
if( weight[2].weightfn ) \
- weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height ); \
+ weight[2].weightfn[width>>2]( &pix2[offset], 16, &pix2[offset], 16, &weight[2], chroma_height ); \
}
-
if( size == PIXEL_4x4 )
{
x264_me_t *m = a->l0.me4x4[i8x8];
CHROMA4x4MC( 2,4, m[0], 0,0 );
CHROMA4x4MC( 2,4, m[1], 2,0 );
}
+#undef CHROMA4x4MC
- int oe = (8*(i8x8&1) + 4*(i8x8&2)*FENC_STRIDE) >> !CHROMA444;
- int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4;
+ int oe = (8>>chroma_h_shift)*(i8x8&1) + (4>>chroma_v_shift)*(i8x8&2)*FENC_STRIDE;
+ int chromapix = chroma == CHROMA_444 ? PIXEL_8x8 : chroma == CHROMA_422 ? PIXEL_4x8 : PIXEL_4x4;
return h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][oe], FENC_STRIDE, pix1, 16 )
+ h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][oe], FENC_STRIDE, pix2, 16 );
}
+static int x264_mb_analyse_inter_p4x4_chroma( x264_t *h, x264_mb_analysis_t *a, pixel **p_fref, int i8x8, int size )
+{
+ if( CHROMA_FORMAT == CHROMA_444 )
+ return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_444 );
+ else if( CHROMA_FORMAT == CHROMA_422 )
+ return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_422 );
+ else
+ return x264_mb_analyse_inter_p4x4_chroma_internal( h, a, p_fref, i8x8, size, CHROMA_420 );
+}
+
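The wrapper pattern above is the patch's standard way of specializing on chroma format: the _internal body is force-inlined and invoked once per compile-time constant, so the compiler emits three copies with every (4>>chroma_v_shift)-style expression folded and no per-block branch. A stripped-down sketch of the same technique:

    static ALWAYS_INLINE int chroma_rows_internal( int luma_rows, int chroma )
    {
        return luma_rows >> (chroma == CHROMA_420); /* shift folds to a constant */
    }
    static int chroma_rows_420( int n ) { return chroma_rows_internal( n, CHROMA_420 ); }
    static int chroma_rows_422( int n ) { return chroma_rows_internal( n, CHROMA_422 ); }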
static void x264_mb_analyse_inter_p4x4( x264_t *h, x264_mb_analysis_t *a, int i8x8 )
{
pixel **p_fref = h->mb.pic.p_fref[0][a->l0.me8x8[i8x8].i_ref];
{
ALIGNED_ARRAY_16( pixel, pix, [4],[16*16] );
ALIGNED_ARRAY_16( pixel, bi, [2],[16*16] );
- int l0_mvy_offset, l1_mvy_offset;
int i_chroma_cost = 0;
+ int chromapix = h->luma2chroma_pixel[i_pixel];
#define COST_BI_CHROMA( m0, m1, width, height ) \
{ \
if( CHROMA444 ) \
{ \
h->mc.mc_luma( pix[0], 16, &m0.p_fref[4], m0.i_stride[1], \
- m0.mv[0], m0.mv[1], 2*width, 2*height, x264_weight_none ); \
+ m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
h->mc.mc_luma( pix[1], 16, &m0.p_fref[8], m0.i_stride[2], \
- m0.mv[0], m0.mv[1], 2*width, 2*height, x264_weight_none ); \
+ m0.mv[0], m0.mv[1], width, height, x264_weight_none ); \
h->mc.mc_luma( pix[2], 16, &m1.p_fref[4], m1.i_stride[1], \
- m1.mv[0], m1.mv[1], 2*width, 2*height, x264_weight_none ); \
+ m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
h->mc.mc_luma( pix[3], 16, &m1.p_fref[8], m1.i_stride[2], \
- m1.mv[0], m1.mv[1], 2*width, 2*height, x264_weight_none ); \
- h->mc.avg[i_pixel]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
- h->mc.avg[i_pixel]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
- i_chroma_cost = h->pixf.mbcmp[i_pixel]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \
- i_chroma_cost += h->pixf.mbcmp[i_pixel]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
+ m1.mv[0], m1.mv[1], width, height, x264_weight_none ); \
} \
else \
{ \
- l0_mvy_offset = MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
- l1_mvy_offset = MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
- h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], m0.mv[0], m0.mv[1] + l0_mvy_offset, width, height ); \
- h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], m1.mv[0], m1.mv[1] + l1_mvy_offset, width, height ); \
- h->mc.avg[i_pixel+3]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
- h->mc.avg[i_pixel+3]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
- i_chroma_cost = h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ); \
- i_chroma_cost += h->pixf.mbcmp[i_pixel+3]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
+ int v_shift = h->mb.chroma_v_shift; \
+ int l0_mvy_offset = v_shift & MB_INTERLACED & m0.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+ int l1_mvy_offset = v_shift & MB_INTERLACED & m1.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; \
+ h->mc.mc_chroma( pix[0], pix[1], 16, m0.p_fref[4], m0.i_stride[1], \
+ m0.mv[0], 2*(m0.mv[1]+l0_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
+ h->mc.mc_chroma( pix[2], pix[3], 16, m1.p_fref[4], m1.i_stride[1], \
+ m1.mv[0], 2*(m1.mv[1]+l1_mvy_offset)>>v_shift, width>>1, height>>v_shift ); \
} \
+ h->mc.avg[chromapix]( bi[0], 16, pix[0], 16, pix[2], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+ h->mc.avg[chromapix]( bi[1], 16, pix[1], 16, pix[3], 16, h->mb.bipred_weight[m0.i_ref][m1.i_ref] ); \
+ i_chroma_cost = h->pixf.mbcmp[chromapix]( m0.p_fenc[1], FENC_STRIDE, bi[0], 16 ) \
+ + h->pixf.mbcmp[chromapix]( m0.p_fenc[2], FENC_STRIDE, bi[1], 16 ); \
}
if( i_pixel == PIXEL_16x16 )
- COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 8, 8 )
+ COST_BI_CHROMA( a->l0.bi16x16, a->l1.bi16x16, 16, 16 )
else if( i_pixel == PIXEL_16x8 )
- COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 8, 4 )
+ COST_BI_CHROMA( a->l0.me16x8[idx], a->l1.me16x8[idx], 16, 8 )
else if( i_pixel == PIXEL_8x16 )
- COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 4, 8 )
+ COST_BI_CHROMA( a->l0.me8x16[idx], a->l1.me8x16[idx], 8, 16 )
else
- COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 4, 4 )
+ COST_BI_CHROMA( a->l0.me8x8[idx], a->l1.me8x8[idx], 8, 8 )
return i_chroma_cost;
}
pixel *p_fenc = h->mb.pic.p_fenc[0];
pixel *p_fdec = h->mb.pic.p_fdec[0];
- int s = !CHROMA444;
a->i_cost16x16direct = a->i_lambda * i_mb_b_cost_table[B_DIRECT];
if( h->param.analyse.inter & X264_ANALYSE_BSUB16x16 )
{
- int chromapix = CHROMA444 ? PIXEL_8x8 : PIXEL_4x4;
+ int chromapix = h->luma2chroma_pixel[PIXEL_8x8];
+
for( int i = 0; i < 4; i++ )
{
const int x = (i&1)*8;
&p_fdec[x+y*FDEC_STRIDE], FDEC_STRIDE );
if( h->mb.b_chroma_me )
{
- a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE,
- &h->mb.pic.p_fdec[1][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE )
- + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][(x>>s)+(y>>s)*FENC_STRIDE], FENC_STRIDE,
- &h->mb.pic.p_fdec[2][(x>>s)+(y>>s)*FDEC_STRIDE], FDEC_STRIDE );
+ int fenc_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FENC_STRIDE;
+ int fdec_offset = (x>>h->mb.chroma_h_shift) + (y>>h->mb.chroma_v_shift)*FDEC_STRIDE;
+ a->i_cost8x8direct[i] += h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[1][fenc_offset], FENC_STRIDE,
+ &h->mb.pic.p_fdec[1][fdec_offset], FDEC_STRIDE )
+ + h->pixf.mbcmp[chromapix]( &h->mb.pic.p_fenc[2][fenc_offset], FENC_STRIDE,
+ &h->mb.pic.p_fdec[2][fdec_offset], FDEC_STRIDE );
}
a->i_cost16x16direct += a->i_cost8x8direct[i];
}
else
{
- int chromapix = CHROMA444 ? PIXEL_16x16 : PIXEL_8x8;
a->i_cost16x16direct += h->pixf.mbcmp[PIXEL_16x16]( p_fenc, FENC_STRIDE, p_fdec, FDEC_STRIDE );
if( h->mb.b_chroma_me )
{
+ int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
a->i_cost16x16direct += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE )
+ h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE );
}
if( h->mb.b_chroma_me )
{
- ALIGNED_ARRAY_16( pixel, pixuv, [2],[8*FENC_STRIDE] );
ALIGNED_ARRAY_16( pixel, bi, [16*FENC_STRIDE] );
if( CHROMA444 )
}
else
{
- if( MB_INTERLACED & a->l0.bi16x16.i_ref )
+ ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
+ int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
+ int v_shift = h->mb.chroma_v_shift;
+
+ if( v_shift & MB_INTERLACED & a->l0.bi16x16.i_ref )
{
- int l0_mvy_offset = MB_INTERLACED & a->l0.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int l0_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( pixuv[0], pixuv[0]+8, FENC_STRIDE, h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
h->mb.pic.i_stride[1], 0, 0 + l0_mvy_offset, 8, 8 );
}
else
- h->mc.load_deinterleave_8x8x2_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
+ h->mc.load_deinterleave_chroma_fenc( pixuv[0], h->mb.pic.p_fref[0][a->l0.bi16x16.i_ref][4],
+ h->mb.pic.i_stride[1], 16>>v_shift );
- if( MB_INTERLACED & a->l1.bi16x16.i_ref )
+ if( v_shift & MB_INTERLACED & a->l1.bi16x16.i_ref )
{
- int l1_mvy_offset = MB_INTERLACED & a->l1.bi16x16.i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int l1_mvy_offset = (h->mb.i_mb_y & 1)*4 - 2;
h->mc.mc_chroma( pixuv[1], pixuv[1]+8, FENC_STRIDE, h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
h->mb.pic.i_stride[1], 0, 0 + l1_mvy_offset, 8, 8 );
}
else
- h->mc.load_deinterleave_8x8x2_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4], h->mb.pic.i_stride[1] );
+ h->mc.load_deinterleave_chroma_fenc( pixuv[1], h->mb.pic.p_fref[1][a->l1.bi16x16.i_ref][4],
+ h->mb.pic.i_stride[1], 16>>v_shift );
- h->mc.avg[PIXEL_8x8]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
+ h->mc.avg[chromapix]( bi, FENC_STRIDE, pixuv[0], FENC_STRIDE, pixuv[1], FENC_STRIDE,
h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
- h->mc.avg[PIXEL_8x8]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
+ h->mc.avg[chromapix]( bi+8, FENC_STRIDE, pixuv[0]+8, FENC_STRIDE, pixuv[1]+8, FENC_STRIDE,
h->mb.bipred_weight[a->l0.bi16x16.i_ref][a->l1.bi16x16.i_ref] );
- cost00 += h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
- + h->pixf.mbcmp[PIXEL_8x8]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
+ cost00 += h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, bi, FENC_STRIDE )
+ + h->pixf.mbcmp[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, bi+8, FENC_STRIDE );
}
}
else
{
x264_mb_analyse_intra_chroma( h, &analysis );
- x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_i8x8chroma );
+ x264_mb_analyse_intra( h, &analysis, i_cost - analysis.i_satd_chroma );
}
- analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
- analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
- analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
+ analysis.i_satd_i16x16 += analysis.i_satd_chroma;
+ analysis.i_satd_i8x8 += analysis.i_satd_chroma;
+ analysis.i_satd_i4x4 += analysis.i_satd_chroma;
}
else
x264_mb_analyse_intra( h, &analysis, i_cost );
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[p], FENC_STRIDE, h->mb.pic.p_fdec[p], FDEC_STRIDE, 16 );
if( !CHROMA444 )
{
- h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, 8 );
- h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, 8 );
+ int height = 16 >> h->mb.chroma_v_shift;
+ h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, height );
+ h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, height );
}
x264_mb_analyse_init_qp( h, &analysis, X264_MAX( h->mb.i_qp - h->mb.ip_offset, h->param.rc.i_qp_min ) );
goto intra_analysis;
else
{
x264_mb_analyse_intra_chroma( h, &analysis );
- x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_i8x8chroma );
+ x264_mb_analyse_intra( h, &analysis, i_satd_inter - analysis.i_satd_chroma );
}
- analysis.i_satd_i16x16 += analysis.i_satd_i8x8chroma;
- analysis.i_satd_i8x8 += analysis.i_satd_i8x8chroma;
- analysis.i_satd_i4x4 += analysis.i_satd_i8x8chroma;
+ analysis.i_satd_i16x16 += analysis.i_satd_chroma;
+ analysis.i_satd_i8x8 += analysis.i_satd_chroma;
+ analysis.i_satd_i4x4 += analysis.i_satd_chroma;
}
else
x264_mb_analyse_intra( h, &analysis, i_satd_inter );
static void x264_cabac_mb_intra_chroma_pred_mode( x264_t *h, x264_cabac_t *cb )
{
- const int i_mode = x264_mb_pred_mode8x8c_fix[h->mb.i_chroma_pred_mode];
- int ctx = 0;
+ int i_mode = x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode];
+ int ctx = 0;
/* No need to test for I4x4 or I_16x16 as cache_save handles that */
if( (h->mb.i_neighbour & MB_LEFT) && h->mb.chroma_pred_mode[h->mb.i_mb_left_xy[0]] != 0 )
* 1-> AC 16x16 i_idx = luma4x4idx
* 2-> Luma4x4 i_idx = luma4x4idx
* 3-> DC Chroma i_idx = iCbCr
- * 4-> AC Chroma i_idx = 4 * iCbCr + chroma4x4idx
+ * 4-> AC Chroma i_idx = numChroma4x4Blks * iCbCr + chroma4x4idx
* 5-> Luma8x8 i_idx = luma8x8idx
*/
3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
};
+static const uint8_t coeff_flag_offset_chroma_422_dc[7] = { 0, 0, 1, 1, 2, 2, 2 }; /* MIN( i/2, 2 ) */
// node ctx: 0..3: abslevel1 (with abslevelgt1 == 0).
// 4..7: abslevelgt1 + 3 (and abslevel1 doesn't matter).
static const uint8_t coeff_abs_level1_ctx[8] = { 1, 2, 3, 4, 0, 0, 0, 0 };
/* map node ctx => cabac ctx for level>1 */
static const uint8_t coeff_abs_levelgt1_ctx[8] = { 5, 5, 5, 5, 6, 7, 8, 9 };
+/* 4:2:2 chroma dc uses a slightly different state machine for some reason. Note that
+ * 4:2:0 chroma dc doesn't use the last state, so it produces identical output with either array. */
+static const uint8_t coeff_abs_levelgt1_ctx_chroma_dc[8] = { 5, 5, 5, 5, 6, 7, 8, 8 };
static const uint8_t coeff_abs_level_transition[2][8] = {
/* update node ctx after coding a level=1 */
{ 1, 2, 3, 3, 4, 5, 6, 7 },
static const uint8_t count_cat_m1[14] = {15, 14, 15, 3, 14, 63, 15, 14, 15, 63, 15, 14, 15, 63};
#if !RDO_SKIP_BS
-static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+static ALWAYS_INLINE void block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int chroma422dc )
{
- const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
int ctx_last = last_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
int ctx_level = coeff_abs_level_m1_offset[ctx_block_cat];
- int coeff_idx = -1, node_ctx = 0, last;
- int coeffs[64];
-
- last = h->quantf.coeff_last[ctx_block_cat]( l );
+ int coeff_idx = -1, node_ctx = 0;
+ int last = h->quantf.coeff_last[ctx_block_cat]( l );
+ const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
+ dctcoef coeffs[64];
-#define WRITE_SIGMAP( l8x8 )\
+#define WRITE_SIGMAP( sig_off, last_off )\
{\
int i = 0;\
while( 1 )\
if( l[i] )\
{\
coeffs[++coeff_idx] = l[i];\
- x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? sig_offset[i] : i), 1 );\
+ x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );\
if( i == last )\
{\
- x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 1 );\
+ x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );\
break;\
}\
else\
- x264_cabac_encode_decision( cb, ctx_last + (l8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );\
+ x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );\
}\
else\
- x264_cabac_encode_decision( cb, ctx_sig + (l8x8 ? sig_offset[i] : i), 0 );\
- i++;\
- if( i == count_m1 )\
+ x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );\
+ if( ++i == count_m1 )\
{\
coeffs[++coeff_idx] = l[i];\
break;\
}\
}
- int count_m1 = count_cat_m1[ctx_block_cat];
- if( count_m1 == 63 )
- WRITE_SIGMAP( 1 )
+ if( chroma422dc )
+ {
+ int count_m1 = 7;
+ WRITE_SIGMAP( coeff_flag_offset_chroma_422_dc[i], coeff_flag_offset_chroma_422_dc[i] )
+ }
else
- WRITE_SIGMAP( 0 )
+ {
+ int count_m1 = count_cat_m1[ctx_block_cat];
+ if( count_m1 == 63 )
+ {
+ const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
+ WRITE_SIGMAP( sig_offset[i], last_coeff_flag_offset_8x8[i] )
+ }
+ else
+ WRITE_SIGMAP( i, i )
+ }
do
{
if( abs_coeff > 1 )
{
x264_cabac_encode_decision( cb, ctx, 1 );
- ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level;
+ ctx = levelgt1_ctx[node_ctx] + ctx_level;
for( int i = X264_MIN( abs_coeff, 15 ) - 2; i > 0; i-- )
x264_cabac_encode_decision( cb, ctx, 1 );
if( abs_coeff < 15 )
x264_cabac_encode_bypass( cb, coeff_sign );
} while( --coeff_idx >= 0 );
}
+static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+{
+ block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0 );
+}
+static void block_residual_write_cabac_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+{
+ /* Template a version specifically for chroma 4:2:2 DC in order to avoid
+ * slowing down everything else due to the added complexity. */
+ block_residual_write_cabac_internal( h, cb, DCT_CHROMA_DC, l, 1 );
+}
#define block_residual_write_cabac_8x8( h, cb, cat, l ) block_residual_write_cabac( h, cb, cat, l )
-
#else
-/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct
- * this is slightly incorrect because the sigmap is not reversible
- * (contexts are repeated). However, there is nearly no quality penalty
- * for this (~0.001db) and the speed boost (~30%) is worth it. */
-static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8 )
+/* Faster RDO by merging sigmap and level coding. Note that for 8x8dct and chroma 4:2:2 dc this is
+ * slightly incorrect because the sigmap is not reversible (contexts are repeated). However, there
+ * is nearly no quality penalty for this (~0.001 dB) and the speed boost (~30%) is worth it. */
+static void ALWAYS_INLINE block_residual_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l, int b_8x8, int chroma422dc )
{
const uint8_t *sig_offset = significant_coeff_flag_offset_8x8[MB_INTERLACED];
int ctx_sig = significant_coeff_flag_offset[MB_INTERLACED][ctx_block_cat];
int coeff_abs = abs(l[last]);
int ctx = coeff_abs_level1_ctx[0] + ctx_level;
int node_ctx;
+ const uint8_t *levelgt1_ctx = chroma422dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
- if( last != (b_8x8 ? 63 : count_cat_m1[ctx_block_cat]) )
+ if( last != (b_8x8 ? 63 : chroma422dc ? 7 : count_cat_m1[ctx_block_cat]) )
{
- x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] : last), 1 );
- x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] : last), 1 );
+ x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[last] :
+ chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
+ x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[last] :
+ chroma422dc ? coeff_flag_offset_chroma_422_dc[last] : last), 1 );
}
if( coeff_abs > 1 )
{
x264_cabac_encode_decision( cb, ctx, 1 );
- ctx = coeff_abs_levelgt1_ctx[0] + ctx_level;
+ ctx = levelgt1_ctx[0] + ctx_level;
if( coeff_abs < 15 )
{
cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
if( l[i] )
{
coeff_abs = abs(l[i]);
- x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 1 );
- x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] : i), 0 );
+ x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
+ chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 1 );
+ x264_cabac_encode_decision( cb, ctx_last + (b_8x8 ? last_coeff_flag_offset_8x8[i] :
+ chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
ctx = coeff_abs_level1_ctx[node_ctx] + ctx_level;
if( coeff_abs > 1 )
{
x264_cabac_encode_decision( cb, ctx, 1 );
- ctx = coeff_abs_levelgt1_ctx[node_ctx] + ctx_level;
+ ctx = levelgt1_ctx[node_ctx] + ctx_level;
if( coeff_abs < 15 )
{
cb->f8_bits_encoded += cabac_size_unary[coeff_abs-1][cb->state[ctx]];
}
}
else
- x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] : i), 0 );
+ x264_cabac_encode_decision( cb, ctx_sig + (b_8x8 ? sig_offset[i] :
+ chroma422dc ? coeff_flag_offset_chroma_422_dc[i] : i), 0 );
}
}
static void block_residual_write_cabac_8x8( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
- block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 1 );
+ block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 1, 0 );
+}
+static void block_residual_write_cabac_422_dc( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
+{
+ block_residual_write_cabac_internal( h, cb, DCT_CHROMA_DC, l, 0, 1 );
}
static void block_residual_write_cabac( x264_t *h, x264_cabac_t *cb, int ctx_block_cat, dctcoef *l )
{
- block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0 );
+ block_residual_write_cabac_internal( h, cb, ctx_block_cat, l, 0, 0 );
}
#endif
-#define block_residual_write_cabac_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
+#define block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, name )\
do\
{\
int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\
if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
{\
x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
- block_residual_write_cabac( h, cb, ctx_block_cat, l );\
+ block_residual_write_cabac##name( h, cb, ctx_block_cat, l );\
}\
else\
x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
} while(0)
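+/* The dc variant currently expands to the same writer as the plain cbf macro;
+ * the separate name parallels the 4:2:2 dc variant below. */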
+#define block_residual_write_cabac_dc_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
+ block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, )
+
+#define block_residual_write_cabac_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
+ block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, )
+
#define block_residual_write_cabac_8x8_cbf( h, cb, ctx_block_cat, i_idx, l, b_intra )\
-do\
-{\
- int ctxidxinc = x264_cabac_mb_cbf_ctxidxinc( h, ctx_block_cat, i_idx, b_intra );\
- if( h->mb.cache.non_zero_count[x264_scan8[i_idx]] )\
- {\
- x264_cabac_encode_decision( cb, ctxidxinc, 1 );\
- block_residual_write_cabac_8x8( h, cb, ctx_block_cat, l );\
- }\
- else\
- x264_cabac_encode_decision( cb, ctxidxinc, 0 );\
-} while(0)
+ block_residual_write_cabac_cbf_internal( h, cb, ctx_block_cat, i_idx, l, b_intra, _8x8 )
+
+#define block_residual_write_cabac_422_dc_cbf( h, cb, ch, b_intra )\
+ block_residual_write_cabac_cbf_internal( h, cb, DCT_CHROMA_DC, CHROMA_DC+(ch), h->dct.chroma_dc[ch], b_intra, _422_dc )
static ALWAYS_INLINE void x264_macroblock_write_cabac_internal( x264_t *h, x264_cabac_t *cb, int plane_count, int chroma )
{
bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
if( chroma )
for( int ch = 1; ch < 3; ch++ )
- for( int i = 0; i < 8; i++ )
+ for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ )
for( int j = 0; j < 8; j++ )
bs_write( &s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
x264_cabac_mb_transform_size( h, cb );
}
- if( h->mb.i_cbp_luma > 0 || (chroma && h->mb.i_cbp_chroma > 0) || i_mb_type == I_16x16 )
+ if( h->mb.i_cbp_luma || (chroma && h->mb.i_cbp_chroma) || i_mb_type == I_16x16 )
{
const int b_intra = IS_INTRA( i_mb_type );
x264_cabac_mb_qp_delta( h, cb );
/* DC Luma */
for( int p = 0; p < plane_count; p++ )
{
- block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 );
+ block_residual_write_cabac_dc_cbf( h, cb, ctx_cat_plane[DCT_LUMA_DC][p], LUMA_DC+p, h->dct.luma16x16_dc[p], 1 );
/* AC Luma */
if( h->mb.i_cbp_luma )
if( chroma && h->mb.i_cbp_chroma ) /* Chroma DC residual present */
{
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra );
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra );
- if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
- for( int ch = 1; ch < 3; ch++ )
- for( int i = ch*16; i < ch*16+4; i++ )
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, b_intra );
+ if( CHROMA_FORMAT == CHROMA_422 )
+ {
+ block_residual_write_cabac_422_dc_cbf( h, cb, 0, b_intra );
+ block_residual_write_cabac_422_dc_cbf( h, cb, 1, b_intra );
+ }
+ else
+ {
+ block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], b_intra );
+ block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], b_intra );
+ }
+
+ if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
+ {
+ int step = 8 << h->mb.chroma_v_shift;
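+ /* step over the chroma AC block groups: 4:2:0 covers blocks 16-19 and 32-35;
+ * 4:2:2 (step 8) also covers 24-27 and 40-43 */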
+ for( int i = 16; i < 3*16; i += step )
+ for( int j = i; j < i+4; j++ )
+ block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, b_intra );
+ }
}
}
if( h->mb.i_cbp_chroma )
{
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 );
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 );
+ if( CHROMA_FORMAT == CHROMA_422 )
+ {
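+ /* (5*i8) & 0x09 maps i8 = 0,1,2,3 to 0,1,8,9: the top 4x4 block of each
+ * vertically paired set of 4:2:2 chroma blocks (the +2 entries are the lower halves) */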
+ int offset = (5*i8) & 0x09;
+ block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1, 0 );
+ block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1, 0 );
+ block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1, 0 );
+ block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1, 0 );
+ }
+ else
+ {
+ block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1, 0 );
+ block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1, 0 );
+ }
}
i8 += x264_pixel_size[i_pixel].h >> 3;
block_residual_write_cabac_cbf( h, cb, ctx_cat_plane[DCT_LUMA_4x4][p], i4+p*16, h->dct.luma4x4[i4+p*16], 1 );
}
-static void x264_i8x8_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
+static void x264_chroma_size_cabac( x264_t *h, x264_cabac_t *cb )
{
x264_cabac_mb_intra_chroma_pred_mode( h, cb );
x264_cabac_mb_cbp_chroma( h, cb );
- if( h->mb.i_cbp_chroma > 0 )
+ if( h->mb.i_cbp_chroma )
{
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 );
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 );
+ if( CHROMA_FORMAT == CHROMA_422 )
+ {
+ block_residual_write_cabac_422_dc_cbf( h, cb, 0, 1 );
+ block_residual_write_cabac_422_dc_cbf( h, cb, 1, 1 );
+ }
+ else
+ {
+ block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0], 1 );
+ block_residual_write_cabac_dc_cbf( h, cb, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1], 1 );
+ }
if( h->mb.i_cbp_chroma == 2 )
- for( int ch = 1; ch < 3; ch++ )
- for( int i = ch*16; i < ch*16+4; i++ )
- block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1, 1 );
+ {
+ int step = 8 << h->mb.chroma_v_shift;
+ for( int i = 16; i < 3*16; i += step )
+ for( int j = i; j < i+4; j++ )
+ block_residual_write_cabac_cbf( h, cb, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1, 1 );
+ }
}
}
#endif
{
bs_t *s = &h->out.bs;
static const uint8_t ctz_index[8] = {3,0,1,0,2,0,1,0};
- static const uint8_t count_cat[14] = {16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64};
+ static const uint8_t count_cat[14] = {16, 15, 16, 0, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64};
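+ /* the DCT_CHROMA_DC entry is zeroed: chroma DC total_zeros is now handled by the dedicated branch below */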
x264_run_level_t runlevel;
- int i_trailing, i_total_zero, i_suffix_length;
- int i_total = 0;
+ int i_total, i_trailing, i_total_zero, i_suffix_length;
unsigned int i_sign;
/* level and run and total */
}
}
- if( (uint8_t)i_total < count_cat[ctx_block_cat] )
+ if( ctx_block_cat == DCT_CHROMA_DC )
{
- if( ctx_block_cat == DCT_CHROMA_DC )
- bs_write_vlc( s, x264_total_zeros_dc[i_total-1][i_total_zero] );
- else
- bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
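+ /* total_zeros is only coded when fewer than the maximum number of coefficients
+ * are nonzero (4 for the 2x2 dc block, 8 for the 2x4 dc block) */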
+ if( i_total < 8>>h->mb.chroma_v_shift )
+ {
+ vlc_t total_zeros = CHROMA_FORMAT == CHROMA_420 ? x264_total_zeros_2x2_dc[i_total-1][i_total_zero]
+ : x264_total_zeros_2x4_dc[i_total-1][i_total_zero];
+ bs_write_vlc( s, total_zeros );
+ }
}
+ else if( (uint8_t)i_total < count_cat[ctx_block_cat] )
+ bs_write_vlc( s, x264_total_zeros[i_total-1][i_total_zero] );
for( int i = 0; i < i_total-1 && i_total_zero > 0; i++ )
{
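+/* chroma DC coeff_token: nC == -1 (4:2:0) selects table index 4 and nC == -2 (4:2:2)
+ * selects index 5, hence 3 + CHROMA_FORMAT (CHROMA_420 == 1, CHROMA_422 == 2) */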
#define block_residual_write_cavlc(h,cat,idx,l)\
{\
- int nC = cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
+ int nC = cat == DCT_CHROMA_DC ? 3 + CHROMA_FORMAT\
+ : ct_index[x264_mb_predict_non_zero_code( h, cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];\
uint8_t *nnz = &h->mb.cache.non_zero_count[x264_scan8[idx]];\
if( !*nnz )\
bs_write_vlc( &h->out.bs, x264_coeff0_token[nC] );\
bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[p][i] );
if( chroma )
for( int ch = 1; ch < 3; ch++ )
- for( int i = 0; i < 8; i++ )
+ for( int i = 0; i < 16>>h->mb.chroma_v_shift; i++ )
for( int j = 0; j < 8; j++ )
bs_write( s, BIT_DEPTH, h->mb.pic.p_fenc[ch][i*FENC_STRIDE+j] );
bs_write( s, 4, i_mode - (i_mode > i_pred) );
}
if( chroma )
- bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
+ bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
}
else if( i_mb_type == I_16x16 )
{
bs_write_ue( s, i_mb_i_offset + 1 + x264_mb_pred_mode16x16_fix[h->mb.i_intra16x16_pred_mode] +
h->mb.i_cbp_chroma * 4 + ( h->mb.i_cbp_luma == 0 ? 0 : 12 ) );
if( chroma )
- bs_write_ue( s, x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
+ bs_write_ue( s, x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
}
else if( i_mb_type == P_L0 )
{
/* Chroma DC residual present */
block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
- if( h->mb.i_cbp_chroma&0x02 ) /* Chroma AC residual present */
- for( int ch = 1; ch < 3; ch++ )
- for( int i = ch*16; i < ch*16+4; i++ )
- block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
+ if( h->mb.i_cbp_chroma == 2 ) /* Chroma AC residual present */
+ {
+ int step = 8 << h->mb.chroma_v_shift;
+ for( int i = 16; i < 3*16; i += step )
+ for( int j = i; j < i+4; j++ )
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
+ }
}
#if !RDO_SKIP_BS
x264_macroblock_luma_write_cavlc( h, p*4+i8, p*4+i8 );
if( h->mb.i_cbp_chroma )
{
- block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
- block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 );
+ if( CHROMA_FORMAT == CHROMA_422 )
+ {
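+ /* (5*i8) & 0x09 maps i8 = 0,1,2,3 to 0,1,8,9, as in the CABAC path */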
+ int offset = (5*i8) & 0x09;
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+offset, h->dct.luma4x4[16+offset]+1 );
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, 18+offset, h->dct.luma4x4[18+offset]+1 );
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+offset, h->dct.luma4x4[32+offset]+1 );
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, 34+offset, h->dct.luma4x4[34+offset]+1 );
+ }
+ else
+ {
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, 16+i8, h->dct.luma4x4[16+i8]+1 );
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, 32+i8, h->dct.luma4x4[32+i8]+1 );
+ }
}
i8 += x264_pixel_size[i_pixel].h >> 3;
}
return h->out.bs.i_bits_encoded;
}
-static int x264_i8x8_chroma_size_cavlc( x264_t *h )
+static int x264_chroma_size_cavlc( x264_t *h )
{
- h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_pred_mode8x8c_fix[ h->mb.i_chroma_pred_mode ] );
+ h->out.bs.i_bits_encoded = bs_size_ue( x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode] );
if( h->mb.i_cbp_chroma )
{
block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+0, h->dct.chroma_dc[0] );
block_residual_write_cavlc( h, DCT_CHROMA_DC, CHROMA_DC+1, h->dct.chroma_dc[1] );
if( h->mb.i_cbp_chroma == 2 )
- for( int ch = 1; ch < 3; ch++ )
- for( int i = ch*16; i < ch*16+4; i++ )
- block_residual_write_cavlc( h, DCT_CHROMA_AC, i, h->dct.luma4x4[i]+1 );
+ {
+ int step = 8 << h->mb.chroma_v_shift;
+ for( int i = 16; i < 3*16; i += step )
+ for( int j = i; j < i+4; j++ )
+ block_residual_write_cavlc( h, DCT_CHROMA_AC, j, h->dct.luma4x4[j]+1 );
+ }
}
return h->out.bs.i_bits_encoded;
}
return;
/* Write the frame in display order */
- int frame_size = h->param.i_height * h->param.i_width * (3<<CHROMA444)/2 * sizeof(pixel);
+ int frame_size = FRAME_SIZE( h->param.i_height * h->param.i_width * sizeof(pixel) );
fseek( f, (uint64_t)h->fdec->i_frame * frame_size, SEEK_SET );
for( int p = 0; p < (CHROMA444 ? 3 : 1); p++ )
for( int y = 0; y < h->param.i_height; y++ )
if( !CHROMA444 )
{
int cw = h->param.i_width>>1;
- int ch = h->param.i_height>>1;
+ int ch = h->param.i_height>>h->mb.chroma_v_shift;
pixel *planeu = x264_malloc( (cw*ch*2+32)*sizeof(pixel) );
pixel *planev = planeu + cw*ch + 16;
h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
fclose( f );
}
-
/* Fill "default" values */
static void x264_slice_header_init( x264_t *h, x264_slice_header_t *sh,
x264_sps_t *sps, x264_pps_t *pps,
return -1;
}
#endif
+
+#if HAVE_INTERLACED
+ h->param.b_interlaced = !!PARAM_INTERLACED;
+#else
+ if( h->param.b_interlaced )
+ {
+ x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" );
+ return -1;
+ }
+#endif
+
if( h->param.i_width <= 0 || h->param.i_height <= 0 )
{
x264_log( h, X264_LOG_ERROR, "invalid width x height (%dx%d)\n",
int i_csp = h->param.i_csp & X264_CSP_MASK;
if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
{
- x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I444/YV24/BGR/BGRA/RGB supported)\n" );
+ x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12/I422/YV16/NV16/I444/YV24/BGR/BGRA/RGB supported)\n" );
return -1;
}
- if( i_csp < X264_CSP_I444 && (h->param.i_width % 2 || h->param.i_height % 2) )
+ if( i_csp < X264_CSP_I444 && h->param.i_width % 2 )
{
- x264_log( h, X264_LOG_ERROR, "width or height not divisible by 2 (%dx%d)\n",
+ x264_log( h, X264_LOG_ERROR, "width not divisible by 2 (%dx%d)\n",
h->param.i_width, h->param.i_height );
return -1;
}
-#if HAVE_INTERLACED
- h->param.b_interlaced = !!PARAM_INTERLACED;
-#else
- if( h->param.b_interlaced )
+ if( i_csp < X264_CSP_I422 && PARAM_INTERLACED && h->param.i_height % 4 )
{
- x264_log( h, X264_LOG_ERROR, "not compiled with interlaced support\n" );
+ x264_log( h, X264_LOG_ERROR, "height not divisible by 4 (%dx%d)\n",
+ h->param.i_width, h->param.i_height );
+ return -1;
+ }
+
+ if( (i_csp < X264_CSP_I422 || PARAM_INTERLACED) && h->param.i_height % 2 )
+ {
+ x264_log( h, X264_LOG_ERROR, "height not divisible by 2 (%dx%d)\n",
+ h->param.i_width, h->param.i_height );
return -1;
}
-#endif
if( (h->param.crop_rect.i_left + h->param.crop_rect.i_right ) >= h->param.i_width ||
(h->param.crop_rect.i_top + h->param.crop_rect.i_bottom) >= h->param.i_height )
memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
- h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
+ h->pixf.intra_mbcmp_x3_8x16c = satd ? h->pixf.intra_satd_x3_8x16c : h->pixf.intra_sad_x3_8x16c;
+ h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
h->pixf.intra_mbcmp_x3_8x8 = satd ? h->pixf.intra_sa8d_x3_8x8 : h->pixf.intra_sad_x3_8x8;
h->pixf.intra_mbcmp_x3_4x4 = satd ? h->pixf.intra_satd_x3_4x4 : h->pixf.intra_sad_x3_4x4;
h->pixf.intra_mbcmp_x9_4x4 = h->param.b_cpu_independent || h->mb.b_lossless ? NULL
memcpy( h->pixf.fpelcmp_x4, satd ? h->pixf.satd_x4 : h->pixf.sad_x4, sizeof(h->pixf.fpelcmp_x4) );
}
+static void chroma_dsp_init( x264_t *h )
+{
+ memcpy( h->luma2chroma_pixel, x264_luma2chroma_pixel[CHROMA_FORMAT], sizeof(h->luma2chroma_pixel) );
+
+ switch( CHROMA_FORMAT )
+ {
+ case CHROMA_420:
+ memcpy( h->predict_chroma, h->predict_8x8c, sizeof(h->predict_chroma) );
+ h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_420;
+ h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_420_intra;
+ h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_420_mbaff;
+ h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_420_intra_mbaff;
+ h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x8c;
+ h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last4;
+ h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run4;
+ break;
+ case CHROMA_422:
+ memcpy( h->predict_chroma, h->predict_8x16c, sizeof(h->predict_chroma) );
+ h->loopf.deblock_chroma[0] = h->loopf.deblock_h_chroma_422;
+ h->loopf.deblock_chroma_intra[0] = h->loopf.deblock_h_chroma_422_intra;
+ h->loopf.deblock_chroma_mbaff = h->loopf.deblock_chroma_422_mbaff;
+ h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_chroma_422_intra_mbaff;
+ h->pixf.intra_mbcmp_x3_chroma = h->pixf.intra_mbcmp_x3_8x16c;
+ h->quantf.coeff_last[DCT_CHROMA_DC] = h->quantf.coeff_last8;
+ h->quantf.coeff_level_run[DCT_CHROMA_DC] = h->quantf.coeff_level_run8;
+ break;
+ case CHROMA_444:
+ h->loopf.deblock_chroma_mbaff = h->loopf.deblock_luma_mbaff;
+ h->loopf.deblock_chroma_intra_mbaff = h->loopf.deblock_luma_intra_mbaff;
+ break;
+ }
+}
+
static void x264_set_aspect_ratio( x264_t *h, x264_param_t *param, int initial )
{
/* VUI */
h->mb.i_mb_width = h->sps->i_mb_width;
h->mb.i_mb_height = h->sps->i_mb_height;
h->mb.i_mb_count = h->mb.i_mb_width * h->mb.i_mb_height;
+
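+ /* chroma subsampling shifts (h_shift,v_shift): 4:2:0 -> 1,1; 4:2:2 -> 1,0; 4:4:4 -> 0,0 */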
+ h->mb.chroma_h_shift = CHROMA_FORMAT == CHROMA_420 || CHROMA_FORMAT == CHROMA_422;
+ h->mb.chroma_v_shift = CHROMA_FORMAT == CHROMA_420;
+
/* Adaptive MBAFF and subme 0 are not supported as we require halving motion
* vectors during prediction, resulting in hpel mvs.
* The chosen solution is to make MBAFF non-adaptive in this case. */
/* init CPU functions */
x264_predict_16x16_init( h->param.cpu, h->predict_16x16 );
x264_predict_8x8c_init( h->param.cpu, h->predict_8x8c );
+ x264_predict_8x16c_init( h->param.cpu, h->predict_8x16c );
x264_predict_8x8_init( h->param.cpu, h->predict_8x8, &h->predict_8x8_filter );
x264_predict_4x4_init( h->param.cpu, h->predict_4x4 );
if( h->param.b_cabac )
x264_dct_init_weights();
mbcmp_init( h );
+ chroma_dsp_init( h );
p = buf + sprintf( buf, "using cpu capabilities:" );
for( int i = 0; x264_cpu_names[i].flags; i++ )
h->sps->i_profile_idc == PROFILE_MAIN ? "Main" :
h->sps->i_profile_idc == PROFILE_HIGH ? "High" :
h->sps->i_profile_idc == PROFILE_HIGH10 ? (h->sps->b_constraint_set3 == 1 ? "High 10 Intra" : "High 10") :
+ h->sps->i_profile_idc == PROFILE_HIGH422 ? (h->sps->b_constraint_set3 == 1 ? "High 4:2:2 Intra" : "High 4:2:2") :
h->sps->b_constraint_set3 == 1 ? "High 4:4:4 Intra" : "High 4:4:4 Predictive";
char level[4];
snprintf( level, sizeof(level), "%d.%d", h->sps->i_level_idc/10, h->sps->i_level_idc%10 );
}
else
{
+ static const char * const subsampling[4] = { "4:0:0", "4:2:0", "4:2:2", "4:4:4" };
x264_log( h, X264_LOG_INFO, "profile %s, level %s, %s %d-bit\n",
- profile, level, CHROMA444 ? "4:4:4" : "4:2:0", BIT_DEPTH );
+ profile, level, subsampling[CHROMA_FORMAT], BIT_DEPTH );
}
return h;
* consistency by copying deblocked pixels between planes. */
if( PARAM_INTERLACED )
for( int p = 0; p < h->fdec->i_plane; p++ )
- for( int i = minpix_y>>(!CHROMA444 && p); i < maxpix_y>>(!CHROMA444 && p); i++ )
+ for( int i = minpix_y>>(h->mb.chroma_v_shift && p); i < maxpix_y>>(h->mb.chroma_v_shift && p); i++ )
memcpy( h->fdec->plane_fld[p] + i*h->fdec->i_stride[p],
h->fdec->plane[p] + i*h->fdec->i_stride[p],
h->mb.i_mb_width*16*sizeof(pixel) );
if( !CHROMA444 )
{
uint64_t ssd_u, ssd_v;
+ int v_shift = h->mb.chroma_v_shift;
x264_pixel_ssd_nv12( &h->pixf,
- h->fdec->plane[1] + (minpix_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1],
- h->fenc->plane[1] + (minpix_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1],
- h->param.i_width>>1, (maxpix_y-minpix_y)>>1, &ssd_u, &ssd_v );
+ h->fdec->plane[1] + (minpix_y>>v_shift) * h->fdec->i_stride[1], h->fdec->i_stride[1],
+ h->fenc->plane[1] + (minpix_y>>v_shift) * h->fenc->i_stride[1], h->fenc->i_stride[1],
+ h->param.i_width>>1, (maxpix_y-minpix_y)>>v_shift, &ssd_u, &ssd_v );
h->stat.frame.i_ssd[1] += ssd_u;
h->stat.frame.i_ssd[2] += ssd_v;
}
else //if( h->mb.i_type == I_4x4 )
for( int i = 0; i < 16; i++ )
h->stat.frame.i_mb_pred_mode[2][h->mb.cache.intra4x4_pred_mode[x264_scan8[i]]]++;
- h->stat.frame.i_mb_pred_mode[3][x264_mb_pred_mode8x8c_fix[h->mb.i_chroma_pred_mode]]++;
+ h->stat.frame.i_mb_pred_mode[3][x264_mb_chroma_pred_mode_fix[h->mb.i_chroma_pred_mode]]++;
}
h->stat.frame.i_mb_field[b_intra?0:b_skip?2:1] += MB_INTERLACED;
}
h->stat.frame.i_ssd[2],
};
int luma_size = h->param.i_width * h->param.i_height;
- int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2);
+ int chroma_size = CHROMA_SIZE( luma_size );
double psnr_y = x264_psnr( ssd[0], luma_size );
double psnr_u = x264_psnr( ssd[1], chroma_size );
double psnr_v = x264_psnr( ssd[2], chroma_size );
****************************************************************************/
void x264_encoder_close ( x264_t *h )
{
- int luma_size = h->param.i_width * h->param.i_height;
- int chroma_size = h->param.i_width * h->param.i_height >> (!CHROMA444 * 2);
- int64_t i_yuv_size = luma_size + chroma_size * 2;
+ int64_t i_yuv_size = FRAME_SIZE( h->param.i_width * h->param.i_height );
int64_t i_mb_count_size[2][7] = {{0}};
char buf[200];
int b_print_pcm = h->stat.i_mb_count[SLICE_TYPE_I][I_PCM]
}
for( int i = 0; i <= I_PRED_CHROMA_DC_128; i++ )
{
- fixed_pred_modes[3][x264_mb_pred_mode8x8c_fix[i]] += h->stat.i_mb_pred_mode[3][i];
+ fixed_pred_modes[3][x264_mb_chroma_pred_mode_fix[i]] += h->stat.i_mb_pred_mode[3][i];
sum_pred_modes[3] += h->stat.i_mb_pred_mode[3][i];
}
if( sum_pred_modes[3] && !CHROMA444 )
* Authors: Laurent Aimar <fenrir@via.ecp.fr>
* Loren Merritt <lorenm@u.washington.edu>
* Fiona Glaser <fiona@x264.com>
+ * Henrik Gramner <hengar-6@student.ltu.se>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
}
#undef ZIG
-#define IDCT_DEQUANT_START \
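+/* zigzag scan for the 2x4 chroma DC block used in 4:2:2: maps the transform's
+ * output order to the bitstream scan order */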
+static inline void zigzag_scan_2x4_dc( dctcoef level[8], dctcoef dct[8] )
+{
+ level[0] = dct[0];
+ level[1] = dct[2];
+ level[2] = dct[1];
+ level[3] = dct[4];
+ level[4] = dct[6];
+ level[5] = dct[3];
+ level[6] = dct[5];
+ level[7] = dct[7];
+}
+
+#define IDCT_DEQUANT_2X2_START \
int d0 = dct[0] + dct[1]; \
int d1 = dct[2] + dct[3]; \
int d2 = dct[0] - dct[1]; \
static inline void idct_dequant_2x2_dc( dctcoef dct[4], dctcoef dct4x4[4][16], int dequant_mf[6][16], int i_qp )
{
- IDCT_DEQUANT_START
+ IDCT_DEQUANT_2X2_START
dct4x4[0][0] = (d0 + d1) * dmf >> 5;
dct4x4[1][0] = (d0 - d1) * dmf >> 5;
dct4x4[2][0] = (d2 + d3) * dmf >> 5;
dct4x4[3][0] = (d2 - d3) * dmf >> 5;
}
-static inline void idct_dequant_2x2_dconly( dctcoef out[4], dctcoef dct[4], int dequant_mf[6][16], int i_qp )
+static inline void idct_dequant_2x2_dconly( dctcoef dct[4], int dequant_mf[6][16], int i_qp )
{
- IDCT_DEQUANT_START
- out[0] = (d0 + d1) * dmf >> 5;
- out[1] = (d0 - d1) * dmf >> 5;
- out[2] = (d2 + d3) * dmf >> 5;
- out[3] = (d2 - d3) * dmf >> 5;
+ IDCT_DEQUANT_2X2_START
+ dct[0] = (d0 + d1) * dmf >> 5;
+ dct[1] = (d0 - d1) * dmf >> 5;
+ dct[2] = (d2 + d3) * dmf >> 5;
+ dct[3] = (d2 - d3) * dmf >> 5;
}
+#undef IDCT_DEQUANT_2X2_START
static inline void dct2x2dc( dctcoef d[4], dctcoef dct4x4[4][16] )
{
dct4x4[3][0] = 0;
}
+static ALWAYS_INLINE int array_non_zero( dctcoef *v, int i_count )
+{
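+ /* test one machine word at a time; i_count (in coefficients) is assumed to
+ * cover a whole number of words in a suitably aligned array */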
+ if( WORD_SIZE == 8 )
+ {
+ for( int i = 0; i < i_count; i += 8/sizeof(dctcoef) )
+ if( M64( &v[i] ) )
+ return 1;
+ }
+ else
+ {
+ for( int i = 0; i < i_count; i += 4/sizeof(dctcoef) )
+ if( M32( &v[i] ) )
+ return 1;
+ }
+ return 0;
+}
+
static ALWAYS_INLINE int x264_quant_4x4( x264_t *h, dctcoef dct[16], int i_qp, int ctx_block_cat, int b_intra, int p, int idx )
{
int i_quant_cat = b_intra ? (p?CQM_4IC:CQM_4IY) : (p?CQM_4PC:CQM_4PY);
block_cbp |= nz;
}
h->mb.i_cbp_luma |= block_cbp * 0xf;
- h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4 );
+ h->mb.cache.non_zero_count[x264_scan8[LUMA_DC+p]] = array_non_zero( dct_dc4x4, 16 );
h->zigzagf.scan_4x4( h->dct.luma16x16_dc[p], dct_dc4x4 );
return;
}
h->dctf.dct4x4dc( dct_dc4x4 );
if( h->mb.b_trellis )
- nz = x264_quant_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, 0, LUMA_DC+p );
+ nz = x264_quant_luma_dc_trellis( h, dct_dc4x4, i_quant_cat, i_qp, ctx_cat_plane[DCT_LUMA_DC][p], 1, LUMA_DC+p );
else
nz = h->quantf.quant_4x4_dc( dct_dc4x4, h->quant4_mf[i_quant_cat][i_qp][0]>>1, h->quant4_bias[i_quant_cat][i_qp][0]<<1 );
* Unlike luma blocks, this can't be done with a lookup table or
* other shortcut technique because of the interdependencies
* between the coefficients due to the chroma DC transform. */
-static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef dct2x2[4], int dequant_mf[6][16], int i_qp )
+static ALWAYS_INLINE int x264_mb_optimize_chroma_dc( x264_t *h, dctcoef *dct_dc, int dequant_mf[6][16], int i_qp, int chroma422 )
{
int dmf = dequant_mf[i_qp%6][0] << i_qp/6;
if( dmf > 32*64 )
return 1;
- return h->quantf.optimize_chroma_dc( dct2x2, dmf );
+ if( chroma422 )
+ return h->quantf.optimize_chroma_2x4_dc( dct_dc, dmf );
+ else
+ return h->quantf.optimize_chroma_2x2_dc( dct_dc, dmf );
}
-void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp )
+static ALWAYS_INLINE void x264_mb_encode_chroma_internal( x264_t *h, int b_inter, int i_qp, int chroma422 )
{
int nz, nz_dc;
int b_decimate = b_inter && h->mb.b_dct_decimate;
- ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
+ int (*dequant_mf)[16] = h->dequant4_mf[CQM_4IC + b_inter];
+ ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
h->mb.i_cbp_chroma = 0;
h->nr_count[2] += h->mb.b_noise_reduction * 4;
* Values are experimentally derived. */
if( b_decimate && i_qp >= (h->mb.b_trellis ? 12 : 18) && !h->mb.b_noise_reduction )
{
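+ /* 4:2:2 has twice as many chroma samples per macroblock, so the decimate threshold is doubled */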
- int thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
+ int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
int ssd[2];
- int score = h->pixf.var2_8x8( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
+ int chromapix = chroma422 ? PIXEL_8x16 : PIXEL_8x8;
+
+ int score = h->pixf.var2[chromapix]( h->mb.pic.p_fenc[1], FENC_STRIDE, h->mb.pic.p_fdec[1], FDEC_STRIDE, &ssd[0] );
if( score < thresh*4 )
- score += h->pixf.var2_8x8( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
+ score += h->pixf.var2[chromapix]( h->mb.pic.p_fenc[2], FENC_STRIDE, h->mb.pic.p_fdec[2], FDEC_STRIDE, &ssd[1] );
if( score < thresh*4 )
{
M16( &h->mb.cache.non_zero_count[x264_scan8[16]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[18]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[32]] ) = 0;
M16( &h->mb.cache.non_zero_count[x264_scan8[34]] ) = 0;
+ if( chroma422 )
+ {
+ M16( &h->mb.cache.non_zero_count[x264_scan8[24]] ) = 0;
+ M16( &h->mb.cache.non_zero_count[x264_scan8[26]] ) = 0;
+ M16( &h->mb.cache.non_zero_count[x264_scan8[40]] ) = 0;
+ M16( &h->mb.cache.non_zero_count[x264_scan8[42]] ) = 0;
+ }
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+0]] = 0;
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] = 0;
{
if( ssd[ch] > thresh )
{
- h->dctf.sub8x8_dct_dc( dct2x2, h->mb.pic.p_fenc[1+ch], h->mb.pic.p_fdec[1+ch] );
+ pixel *p_src = h->mb.pic.p_fenc[1+ch];
+ pixel *p_dst = h->mb.pic.p_fdec[1+ch];
+
+ if( chroma422 )
+ /* Cannot be replaced by two calls to sub8x8_dct_dc since the Hadamard transform is different */
+ h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
+ else
+ h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
+
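+ /* 4:2:2 chroma DC is quantized at qP+3, as required by the 2x4 chroma DC transform */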
if( h->mb.b_trellis )
- nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch );
+ nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
else
- nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
+ {
+ nz_dc = 0;
+ for( int i = 0; i <= chroma422; i++ )
+ nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
+ h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
+ }
if( nz_dc )
{
- if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
+ if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
continue;
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 1;
- zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
- idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
- h->dctf.add8x8_idct_dc( h->mb.pic.p_fdec[1+ch], dct2x2 );
+ if( chroma422 )
+ {
+ zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
+ h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
+ }
+ else
+ {
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
+ idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
+ }
+
+ for( int i = 0; i <= chroma422; i++ )
+ h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
h->mb.i_cbp_chroma = 1;
}
}
int i_decimate_score = 0;
int nz_ac = 0;
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
+ ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
if( h->mb.b_lossless )
{
- for( int i = 0; i < 4; i++ )
+ static const uint8_t chroma422_scan[8] = { 0, 2, 1, 5, 3, 6, 4, 7 };
+
+ for( int i = 0; i < (chroma422?8:4); i++ )
{
- int oe = block_idx_x[i]*4 + block_idx_y[i]*4*FENC_STRIDE;
- int od = block_idx_x[i]*4 + block_idx_y[i]*4*FDEC_STRIDE;
- nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+ch*16], p_src+oe, p_dst+od, &h->dct.chroma_dc[ch][i] );
- h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz;
+ int oe = 4*(i&1) + 4*(i>>1)*FENC_STRIDE;
+ int od = 4*(i&1) + 4*(i>>1)*FDEC_STRIDE;
+ nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], p_src+oe, p_dst+od,
+ &h->dct.chroma_dc[ch][chroma422?chroma422_scan[i]:i] );
+ h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
h->mb.i_cbp_chroma |= nz;
}
- h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch] );
+ h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = array_non_zero( h->dct.chroma_dc[ch], chroma422?8:4 );
continue;
}
- h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
+ for( int i = 0; i <= chroma422; i++ )
+ h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
+
if( h->mb.b_noise_reduction )
- for( int i = 0; i < 4; i++ )
+ for( int i = 0; i < (chroma422?8:4); i++ )
h->quantf.denoise_dct( dct4x4[i], h->nr_residual_sum[2], h->nr_offset[2], 16 );
- dct2x2dc( dct2x2, dct4x4 );
+
+ if( chroma422 )
+ h->dctf.dct2x4dc( dct_dc, dct4x4 );
+ else
+ dct2x2dc( dct_dc, dct4x4 );
+
/* calculate dct coeffs */
- for( int i = 0; i < 4; i++ )
+ for( int i = 0; i < (chroma422?8:4); i++ )
{
if( h->mb.b_trellis )
nz = x264_quant_4x4_trellis( h, dct4x4[i], CQM_4IC+b_inter, i_qp, DCT_CHROMA_AC, !b_inter, 1, 0 );
else
nz = h->quantf.quant_4x4( dct4x4[i], h->quant4_mf[CQM_4IC+b_inter][i_qp], h->quant4_bias[CQM_4IC+b_inter][i_qp] );
- h->mb.cache.non_zero_count[x264_scan8[16+i+ch*16]] = nz;
+ h->mb.cache.non_zero_count[x264_scan8[16+i+(chroma422?i&4:0)+ch*16]] = nz;
if( nz )
{
nz_ac = 1;
- h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+ch*16], dct4x4[i] );
- h->quantf.dequant_4x4( dct4x4[i], h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ h->zigzagf.scan_4x4( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16], dct4x4[i] );
+ h->quantf.dequant_4x4( dct4x4[i], dequant_mf, i_qp );
if( b_decimate )
- i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+ch*16] );
+ i_decimate_score += h->quantf.decimate_score15( h->dct.luma4x4[16+i+(chroma422?i&4:0)+ch*16] );
}
}
if( h->mb.b_trellis )
- nz_dc = x264_quant_dc_trellis( h, dct2x2, CQM_4IC+b_inter, i_qp, DCT_CHROMA_DC, !b_inter, 1, CHROMA_DC+ch );
+ nz_dc = x264_quant_chroma_dc_trellis( h, dct_dc, i_qp+3*chroma422, !b_inter, CHROMA_DC+ch );
else
- nz_dc = h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4IC+b_inter][i_qp][0]>>1, h->quant4_bias[CQM_4IC+b_inter][i_qp][0]<<1 );
+ {
+ nz_dc = 0;
+ for( int i = 0; i <= chroma422; i++ )
+ nz_dc |= h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4IC+b_inter][i_qp+3*chroma422][0] >> 1,
+ h->quant4_bias[CQM_4IC+b_inter][i_qp+3*chroma422][0] << 1 );
+ }
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = nz_dc;
if( (b_decimate && i_decimate_score < 7) || !nz_ac )
{
/* Decimate the block */
- M16( &h->mb.cache.non_zero_count[x264_scan8[16+0+16*ch]] ) = 0;
- M16( &h->mb.cache.non_zero_count[x264_scan8[16+2+16*ch]] ) = 0;
+ M16( &h->mb.cache.non_zero_count[x264_scan8[16+16*ch]] ) = 0;
+ M16( &h->mb.cache.non_zero_count[x264_scan8[18+16*ch]] ) = 0;
+ if( chroma422 )
+ {
+ M16( &h->mb.cache.non_zero_count[x264_scan8[24+16*ch]] ) = 0;
+ M16( &h->mb.cache.non_zero_count[x264_scan8[26+16*ch]] ) = 0;
+ }
+
if( !nz_dc ) /* Whole block is empty */
continue;
- if( !x264_mb_optimize_chroma_dc( h, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp ) )
+ if( !x264_mb_optimize_chroma_dc( h, dct_dc, dequant_mf, i_qp+3*chroma422, chroma422 ) )
{
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+ch]] = 0;
continue;
}
/* DC-only */
- zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
- idct_dequant_2x2_dconly( dct2x2, dct2x2, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
- h->dctf.add8x8_idct_dc( p_dst, dct2x2 );
+ if( chroma422 )
+ {
+ zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
+ h->quantf.idct_dequant_2x4_dconly( dct_dc, dequant_mf, i_qp+3 );
+ }
+ else
+ {
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
+ idct_dequant_2x2_dconly( dct_dc, dequant_mf, i_qp );
+ }
+
+ for( int i = 0; i <= chroma422; i++ )
+ h->dctf.add8x8_idct_dc( p_dst + 8*i*FDEC_STRIDE, &dct_dc[4*i] );
}
else
{
h->mb.i_cbp_chroma = 1;
+
if( nz_dc )
{
- zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct2x2 );
- idct_dequant_2x2_dc( dct2x2, dct4x4, h->dequant4_mf[CQM_4IC + b_inter], i_qp );
+ if( chroma422 )
+ {
+ zigzag_scan_2x4_dc( h->dct.chroma_dc[ch], dct_dc );
+ h->quantf.idct_dequant_2x4_dc( dct_dc, dct4x4, dequant_mf, i_qp+3 );
+ }
+ else
+ {
+ zigzag_scan_2x2_dc( h->dct.chroma_dc[ch], dct_dc );
+ idct_dequant_2x2_dc( dct_dc, dct4x4, dequant_mf, i_qp );
+ }
}
- h->dctf.add8x8_idct( p_dst, dct4x4 );
+
+ for( int i = 0; i <= chroma422; i++ )
+ h->dctf.add8x8_idct( p_dst + 8*i*FDEC_STRIDE, &dct4x4[4*i] );
}
}
h->mb.cache.non_zero_count[x264_scan8[CHROMA_DC+1]] | h->mb.i_cbp_chroma);
}
+void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp )
+{
+ if( CHROMA_FORMAT == CHROMA_420 )
+ x264_mb_encode_chroma_internal( h, b_inter, i_qp, 0 );
+ else
+ x264_mb_encode_chroma_internal( h, b_inter, i_qp, 1 );
+}
+
static void x264_macroblock_encode_skip( x264_t *h )
{
M32( &h->mb.cache.non_zero_count[x264_scan8[ 0]] ) = 0;
M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 2]] ) = 0;
M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 0]] ) = 0;
M32( &h->mb.cache.non_zero_count[x264_scan8[32+ 2]] ) = 0;
- if( CHROMA444 )
+ if( CHROMA_FORMAT >= CHROMA_422 )
{
M32( &h->mb.cache.non_zero_count[x264_scan8[16+ 8]] ) = 0;
M32( &h->mb.cache.non_zero_count[x264_scan8[16+10]] ) = 0;
* Intra prediction for predictive lossless mode.
*****************************************************************************/
-void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
+void x264_predict_lossless_chroma( x264_t *h, int i_mode )
{
+ int height = 16 >> h->mb.chroma_v_shift;
if( i_mode == I_PRED_CHROMA_V )
{
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, 8 );
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, height );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, height );
memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
}
else if( i_mode == I_PRED_CHROMA_H )
{
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, 8 );
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, height );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, height );
x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
+ if( CHROMA_FORMAT == CHROMA_422 )
+ {
+ x264_copy_column8( h->mb.pic.p_fdec[1]+12*FDEC_STRIDE, h->mb.pic.p_fdec[1]+12*FDEC_STRIDE-1 );
+ x264_copy_column8( h->mb.pic.p_fdec[2]+12*FDEC_STRIDE, h->mb.pic.p_fdec[2]+12*FDEC_STRIDE-1 );
+ }
}
else
{
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
}
}
h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fdec[p], FDEC_STRIDE, h->mb.pic.p_fenc[p], FENC_STRIDE, 16 );
if( chroma )
{
- h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, 8 );
- h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, 8 );
+ int height = 16 >> h->mb.chroma_v_shift;
+ h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1], FENC_STRIDE, height );
+ h->mc.copy[PIXEL_8x8] ( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2], FENC_STRIDE, height );
}
return;
}
if( chroma )
{
+ int v_shift = h->mb.chroma_v_shift;
+ int height = 16 >> v_shift;
+
/* Special case for mv0, which is (of course) very common in P-skip mode. */
if( mvx | mvy )
h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
- mvx, mvy, 8, 8 );
+ mvx, 2*mvy>>v_shift, 8, height );
else
- h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+ h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
+ h->mb.pic.i_stride[1], height );
if( h->sh.weight[0][1].weightfn )
h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
h->mb.pic.p_fdec[1], FDEC_STRIDE,
- &h->sh.weight[0][1], 8 );
+ &h->sh.weight[0][1], height );
if( h->sh.weight[0][2].weightfn )
h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
h->mb.pic.p_fdec[2], FDEC_STRIDE,
- &h->sh.weight[0][2], 8 );
+ &h->sh.weight[0][2], height );
}
}
{
if( IS_INTRA( h->mb.i_type ) )
{
- const int i_mode = h->mb.i_chroma_pred_mode;
+ int i_mode = h->mb.i_chroma_pred_mode;
if( h->mb.b_lossless )
- x264_predict_lossless_8x8_chroma( h, i_mode );
+ x264_predict_lossless_chroma( h, i_mode );
else
{
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[1] );
- h->predict_8x8c[i_mode]( h->mb.pic.p_fdec[2] );
+ h->predict_chroma[i_mode]( h->mb.pic.p_fdec[1] );
+ h->predict_chroma[i_mode]( h->mb.pic.p_fdec[2] );
}
}
/* encode the 8x8 blocks */
- x264_mb_encode_8x8_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
+ x264_mb_encode_chroma( h, !IS_INTRA( h->mb.i_type ), h->mb.i_chroma_qp );
}
else
h->mb.i_cbp_chroma = 0;
*****************************************************************************/
static ALWAYS_INLINE int x264_macroblock_probe_skip_internal( x264_t *h, int b_bidir, int plane_count, int chroma )
{
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[4],[16] );
- ALIGNED_ARRAY_16( dctcoef, dct2x2,[4] );
+ ALIGNED_ARRAY_16( dctcoef, dct4x4,[8],[16] );
ALIGNED_ARRAY_16( dctcoef, dctscan,[16] );
ALIGNED_4( int16_t mvp[2] );
-
int i_qp = h->mb.i_qp;
- int thresh, ssd;
for( int p = 0; p < plane_count; p++ )
{
i_qp = h->mb.i_chroma_qp;
}
- if( chroma )
+ if( chroma == CHROMA_420 || chroma == CHROMA_422 )
{
- /* encode chroma */
i_qp = h->mb.i_chroma_qp;
- thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
+ int chroma422 = chroma == CHROMA_422;
+ int thresh = chroma422 ? (x264_lambda2_tab[i_qp] + 16) >> 5 : (x264_lambda2_tab[i_qp] + 32) >> 6;
+ int ssd;
+ ALIGNED_ARRAY_16( dctcoef, dct_dc,[8] );
if( !b_bidir )
{
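+ /* 4:2:2 chroma has full vertical resolution: the chroma block is 8x16
+ * and the vertical chroma mv is doubled */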
if( M32( mvp ) )
h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
- mvp[0], mvp[1], 8, 8 );
+ mvp[0], mvp[1]<<chroma422, 8, chroma422?16:8 );
else
- h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+ h->mc.load_deinterleave_chroma_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4],
+ h->mb.pic.i_stride[1], chroma422?16:8 );
}
for( int ch = 0; ch < 2; ch++ )
if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
- &h->sh.weight[0][1+ch], 8 );
+ &h->sh.weight[0][1+ch], chroma422?16:8 );
/* there is almost never a termination during chroma, but we can't avoid the check entirely */
/* so instead we check SSD and skip the actual check if the score is low enough. */
- ssd = h->pixf.ssd[PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
+ ssd = h->pixf.ssd[chroma422?PIXEL_8x16:PIXEL_8x8]( p_dst, FDEC_STRIDE, p_src, FENC_STRIDE );
if( ssd < thresh )
continue;
* threshold check, so we can save time by doing a DC-only DCT. */
if( h->mb.b_noise_reduction )
{
- h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
- for( int i4x4 = 0; i4x4 < 4; i4x4++ )
+ for( int i = 0; i <= chroma422; i++ )
+ h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
+
+ for( int i4x4 = 0; i4x4 < (chroma422?8:4); i4x4++ )
{
h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
- dct2x2[i4x4] = dct4x4[i4x4][0];
+ dct_dc[i4x4] = dct4x4[i4x4][0];
}
}
else
- h->dctf.sub8x8_dct_dc( dct2x2, p_src, p_dst );
+ {
+ if( chroma422 )
+ h->dctf.sub8x16_dct_dc( dct_dc, p_src, p_dst );
+ else
+ h->dctf.sub8x8_dct_dc( dct_dc, p_src, p_dst );
+ }
- if( h->quantf.quant_2x2_dc( dct2x2, h->quant4_mf[CQM_4PC][i_qp][0]>>1, h->quant4_bias[CQM_4PC][i_qp][0]<<1 ) )
- return 0;
+ for( int i = 0; i <= chroma422; i++ )
+ if( h->quantf.quant_2x2_dc( &dct_dc[4*i], h->quant4_mf[CQM_4PC][i_qp+3*chroma422][0] >> 1,
+ h->quant4_bias[CQM_4PC][i_qp+3*chroma422][0] << 1 ) )
+ return 0;
/* If there wasn't a termination in DC, we can check against a much higher threshold. */
if( ssd < thresh*4 )
continue;
if( !h->mb.b_noise_reduction )
- h->dctf.sub8x8_dct( dct4x4, p_src, p_dst );
+ for( int i = 0; i <= chroma422; i++ )
+ h->dctf.sub8x8_dct( &dct4x4[4*i], p_src + 8*i*FENC_STRIDE, p_dst + 8*i*FDEC_STRIDE );
/* calculate dct coeffs */
- for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < 4; i4x4++ )
+ for( int i4x4 = 0, i_decimate_mb = 0; i4x4 < (chroma422?8:4); i4x4++ )
{
dct4x4[i4x4][0] = 0;
if( h->mb.b_noise_reduction )
int x264_macroblock_probe_skip( x264_t *h, int b_bidir )
{
- if( CHROMA444 )
- return x264_macroblock_probe_skip_internal( h, b_bidir, 3, 0 );
+ if( CHROMA_FORMAT == CHROMA_444 )
+ return x264_macroblock_probe_skip_internal( h, b_bidir, 3, CHROMA_444 );
+ else if( CHROMA_FORMAT == CHROMA_422 )
+ return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_422 );
else
- return x264_macroblock_probe_skip_internal( h, b_bidir, 1, 1 );
+ return x264_macroblock_probe_skip_internal( h, b_bidir, 1, CHROMA_420 );
}
/****************************************************************************
int x = i8&1;
int y = i8>>1;
int nz;
+ int chroma422 = chroma == CHROMA_422;
h->mb.i_cbp_chroma = 0;
h->mb.i_cbp_luma &= ~(1 << i8);
}
h->mb.i_cbp_luma |= nnz8x8 << i8;
}
- if( chroma )
+ if( chroma == CHROMA_420 || chroma == CHROMA_422 )
{
for( int ch = 0; ch < 2; ch++ )
{
dctcoef dc;
- pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
- pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
- nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+i8+ch*16], p_fenc, p_fdec, &dc );
- h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*16]] = nz;
+ pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
+ pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
+
+ for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
+ {
+ int offset = chroma422 ? 8*y + 2*i4x4 + x : i8;
+ nz = h->zigzagf.sub_4x4ac( h->dct.luma4x4[16+offset+ch*16], p_fenc+4*i4x4*FENC_STRIDE, p_fdec+4*i4x4*FDEC_STRIDE, &dc );
+ h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
+ }
}
h->mb.i_cbp_chroma = 0x02;
}
}
}
- if( chroma )
+ if( chroma == CHROMA_420 || chroma == CHROMA_422 )
{
i_qp = h->mb.i_chroma_qp;
for( int ch = 0; ch < 2; ch++ )
{
- ALIGNED_ARRAY_16( dctcoef, dct4x4,[16] );
- pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + 4*y*FENC_STRIDE;
- pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + 4*y*FDEC_STRIDE;
- h->dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec );
- if( h->mb.b_noise_reduction )
- h->quantf.denoise_dct( dct4x4, h->nr_residual_sum[2], h->nr_offset[2], 16 );
- dct4x4[0] = 0;
+ ALIGNED_ARRAY_16( dctcoef, dct4x4,[2],[16] );
+ pixel *p_fenc = h->mb.pic.p_fenc[1+ch] + 4*x + (chroma422?8:4)*y*FENC_STRIDE;
+ pixel *p_fdec = h->mb.pic.p_fdec[1+ch] + 4*x + (chroma422?8:4)*y*FDEC_STRIDE;
- if( h->mb.b_trellis )
- nz = x264_quant_4x4_trellis( h, dct4x4, CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
- else
- nz = h->quantf.quant_4x4( dct4x4, h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
-
- h->mb.cache.non_zero_count[x264_scan8[16+i8+ch*16]] = nz;
- if( nz )
+ for( int i4x4 = 0; i4x4 <= chroma422; i4x4++ )
{
- h->zigzagf.scan_4x4( h->dct.luma4x4[16+i8+ch*16], dct4x4 );
- h->quantf.dequant_4x4( dct4x4, h->dequant4_mf[CQM_4PC], i_qp );
- h->dctf.add4x4_idct( p_fdec, dct4x4 );
+ h->dctf.sub4x4_dct( dct4x4[i4x4], p_fenc + 4*i4x4*FENC_STRIDE, p_fdec + 4*i4x4*FDEC_STRIDE );
+
+ if( h->mb.b_noise_reduction )
+ h->quantf.denoise_dct( dct4x4[i4x4], h->nr_residual_sum[2], h->nr_offset[2], 16 );
+ dct4x4[i4x4][0] = 0;
+
+ if( h->mb.b_trellis )
+ nz = x264_quant_4x4_trellis( h, dct4x4[i4x4], CQM_4PC, i_qp, DCT_CHROMA_AC, 0, 1, 0 );
+ else
+ nz = h->quantf.quant_4x4( dct4x4[i4x4], h->quant4_mf[CQM_4PC][i_qp], h->quant4_bias[CQM_4PC][i_qp] );
+
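+ /* ((5*i8) & 0x09) gives 0,1,8,9; 2*i4x4 selects the lower 4x4 block of the vertical pair in 4:2:2 */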
+ int offset = chroma422 ? ((5*i8) & 0x09) + 2*i4x4 : i8;
+ h->mb.cache.non_zero_count[x264_scan8[16+offset+ch*16]] = nz;
+ if( nz )
+ {
+ h->zigzagf.scan_4x4( h->dct.luma4x4[16+offset+ch*16], dct4x4[i4x4] );
+ h->quantf.dequant_4x4( dct4x4[i4x4], h->dequant4_mf[CQM_4PC], i_qp );
+ h->dctf.add4x4_idct( p_fdec + 4*i4x4*FDEC_STRIDE, dct4x4[i4x4] );
+ }
}
}
h->mb.i_cbp_chroma = 0x02;
void x264_macroblock_encode_p8x8( x264_t *h, int i8 )
{
if( CHROMA444 )
- x264_macroblock_encode_p8x8_internal( h, i8, 3, 0 );
+ x264_macroblock_encode_p8x8_internal( h, i8, 3, CHROMA_444 );
+ else if( CHROMA_FORMAT == CHROMA_422 )
+ x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_422 );
else
- x264_macroblock_encode_p8x8_internal( h, i8, 1, 1 );
+ x264_macroblock_encode_p8x8_internal( h, i8, 1, CHROMA_420 );
}
/*****************************************************************************
#define x264_macroblock_probe_bskip( h )\
x264_macroblock_probe_skip( h, 1 )
-void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode );
void x264_predict_lossless_4x4( x264_t *h, pixel *p_dst, int p, int idx, int i_mode );
void x264_predict_lossless_8x8( x264_t *h, pixel *p_dst, int p, int idx, int i_mode, pixel edge[36] );
void x264_predict_lossless_16x16( x264_t *h, int p, int i_mode );
+void x264_predict_lossless_chroma( x264_t *h, int i_mode );
void x264_macroblock_encode ( x264_t *h );
void x264_macroblock_write_cabac ( x264_t *h, x264_cabac_t *cb );
void x264_macroblock_encode_p4x4( x264_t *h, int i4 );
void x264_mb_encode_i4x4( x264_t *h, int p, int idx, int i_qp, int i_mode );
void x264_mb_encode_i8x8( x264_t *h, int p, int idx, int i_qp, int i_mode, pixel *edge );
-void x264_mb_encode_8x8_chroma( x264_t *h, int b_inter, int i_qp );
+void x264_mb_encode_chroma( x264_t *h, int b_inter, int i_qp );
void x264_cabac_mb_skip( x264_t *h, int b_skip );
-int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
- int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
+int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp,
+ int ctx_block_cat, int b_intra, int idx );
+int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx );
int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx );
int x264_quant_8x8_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
} \
else \
{ \
- h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+ h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], \
+ mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
if( m->weight[1].weightfn ) \
- m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \
- &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
+ m->weight[1].weightfn[bw>>3]( pix, 16, pix, 16, &m->weight[1], bh>>chroma_v_shift ); \
+ cost += h->pixf.mbcmp[chromapix]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
if( cost < bcost ) \
{ \
if( m->weight[2].weightfn ) \
- m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \
- &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
+ m->weight[2].weightfn[bw>>3]( pix+8, 16, pix+8, 16, &m->weight[2], bh>>chroma_v_shift ); \
+ cost += h->pixf.mbcmp[chromapix]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
} \
} \
} \
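
Note: the 2*(my+mvy_offset)>>chroma_v_shift expressions above convert a quarter-pel luma motion vector into the units mc_chroma expects for each subsampling mode. A minimal standalone sketch of the conversion, assuming the usual x264 convention that chroma MC runs at 1/8-pel precision:

    static inline int mv_luma_to_chroma( int mv, int chroma_v_shift )
    {
        /* 4:2:0 (shift 1): chroma has half the rows, so the value passes
         * through unchanged; 4:2:2 (shift 0): full vertical chroma
         * resolution, so the vertical component doubles. */
        return 2*mv >> chroma_v_shift;
    }
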
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444);
- const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int chromapix = h->luma2chroma_pixel[i_pixel];
+ int chroma_v_shift = h->mb.chroma_v_shift;
+ int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
}\
else\
h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\
- mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+ mvx, 2*(mvy+mv##list##y_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift );\
}\
}
ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] );
ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] );
pixel *src[3][2][9];
- int chromasize = CHROMA444 ? 8 : 4;
+ int chromapix = h->luma2chroma_pixel[i_pixel];
+ int chroma_v_shift = h->mb.chroma_v_shift;
+ int chroma_x = (8 >> h->mb.chroma_h_shift) * x;
+ int chroma_y = (8 >> chroma_v_shift) * y;
pixel *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
- pixel *pixu = &h->mb.pic.p_fdec[1][chromasize*x + chromasize*y*FDEC_STRIDE];
- pixel *pixv = &h->mb.pic.p_fdec[2][chromasize*x + chromasize*y*FDEC_STRIDE];
+ pixel *pixu = &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE];
+ pixel *pixv = &h->mb.pic.p_fdec[2][chroma_x + chroma_y*FDEC_STRIDE];
int ref0 = h->mb.cache.ref[0][s8];
int ref1 = h->mb.cache.ref[1][s8];
- const int mv0y_offset = MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- const int mv1y_offset = MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
int stride[3][2][9];
int bm0x = m0->mv[0];
int bm0y = m0->mv[1];
}
else
{
- h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
- h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+ h->mc.avg[chromapix]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
+ h->mc.avg[chromapix]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
}
uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
COPY2_IF_LT( bcostrd, costrd, bestj, j );
} \
else if( m->i_pixel <= PIXEL_8x8 ) \
{ \
- h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+ h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], \
+ mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
if( m->weight[1].weightfn ) \
- m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, \
- &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
+ m->weight[1].weightfn[bw>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, &m->weight[1], bh>>chroma_v_shift ); \
if( m->weight[2].weightfn ) \
- m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, \
- &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
+ m->weight[2].weightfn[bw>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, &m->weight[2], bh>>chroma_v_shift ); \
} \
cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
const int i_pixel = m->i_pixel;
- const int mvy_offset = MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int chroma_v_shift = h->mb.chroma_v_shift;
+ int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
uint64_t bcost = COST_MAX64;
int bmx = m->mv[0];
}
else
{
- pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
- pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+ pixu = &h->mb.pic.p_fdec[1][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
+ pixv = &h->mb.pic.p_fdec[2][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
}
h->mb.b_skip_mc = 1;
static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i, int b_chroma, int b_field, int b_store )
{
- int w = b_chroma ? 8 : 16;
+ int height = b_chroma ? 16>>h->mb.chroma_v_shift : 16;
int stride = frame->i_stride[i];
int offset = b_field
- ? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride
- : 16 * mb_x + w * mb_y * stride;
+ ? 16 * mb_x + height * (mb_y&~1) * stride + (mb_y&1) * stride
+ : 16 * mb_x + height * mb_y * stride;
stride <<= b_field;
if( b_chroma )
{
- ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] );
- h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride );
- return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, 1, b_store )
- + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, 2, b_store );
+ ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] );
+ int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
+ int shift = 7 - h->mb.chroma_v_shift;
+
+ h->mc.load_deinterleave_chroma_fenc( pix, frame->plane[1] + offset, stride, height );
+ return ac_energy_var( h->pixf.var[chromapix]( pix, FENC_STRIDE ), shift, frame, 1, b_store )
+ + ac_energy_var( h->pixf.var[chromapix]( pix+FENC_STRIDE/2, FENC_STRIDE ), shift, frame, 2, b_store );
}
else
return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[i] + offset, stride ), 8, frame, i, b_store );
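
Note: the shift argument to ac_energy_var is log2 of the block's pixel count (8 for the 16x16 luma block above, 7 - chroma_v_shift for the chroma block), which is what lets one code path serve both chroma formats. A sketch of the relation, on the assumption that ac_energy_var divides out the mean via this shift:

    /* 8 * (16 >> v_shift) pixels == 1 << (7 - v_shift) */
    static inline int chroma_energy_shift( int chroma_v_shift )
    {
        return 7 - chroma_v_shift; /* 6 for 4:2:0 (64 px), 7 for 4:2:2 (128 px) */
    }
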
{
uint64_t ssd = frame->i_pixel_ssd[i];
uint64_t sum = frame->i_pixel_sum[i];
- int size = CHROMA444 || !i ? 16 : 8;
- int width = h->mb.i_mb_width*size;
- int height = h->mb.i_mb_height*size;
+ int width = 16*h->mb.i_mb_width >> (i && h->mb.chroma_h_shift);
+ int height = 16*h->mb.i_mb_height >> (i && h->mb.chroma_v_shift);
frame->i_pixel_ssd[i] = ssd - (sum * sum + width * height / 2) / (width * height);
}
}
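
Note: the expression above removes the mean (DC) contribution from the accumulated SSD, with width*height/2 added so the integer division rounds; width and height are simply the luma dimensions scaled by the per-format chroma shifts. An equivalent sketch:

    static uint64_t ssd_minus_dc( uint64_t ssd, uint64_t sum, uint64_t w, uint64_t h )
    {
        uint64_t n = w * h;
        return ssd - (sum * sum + n/2) / n; /* SSD with the mean removed, rounded */
    }
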
if( h->param.b_bluray_compat )
mincr = 4;
- /* High 10 / High 4:4:4 Predictive doesn't require minCR, so just set the maximum to a large value. */
- if( h->sps->i_profile_idc >= PROFILE_HIGH10 )
+ /* Profiles above High don't require minCR, so just set the maximum to a large value. */
+ if( h->sps->i_profile_idc > PROFILE_HIGH )
rc->frame_size_maximum = 1e9;
else
{
static inline int ssd_mb( x264_t *h )
{
- int chroma_size = CHROMA444 ? PIXEL_16x16 : PIXEL_8x8;
+ int chroma_size = h->luma2chroma_pixel[PIXEL_16x16];
int chroma_ssd = ssd_plane(h, chroma_size, 1, 0, 0) + ssd_plane(h, chroma_size, 2, 0, 0);
chroma_ssd = ((uint64_t)chroma_ssd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
return ssd_plane(h, PIXEL_16x16, 0, 0, 0) + chroma_ssd;
{
uint64_t i_ssd, i_bits;
int i8 = i4 >> 2;
- int chromassd;
if( i_pixel == PIXEL_16x16 )
{
if( i_pixel == PIXEL_8x16 )
x264_macroblock_encode_p8x8( h, i8+2 );
- i_ssd = ssd_plane( h, i_pixel, 0, (i8&1)*8, (i8>>1)*8 );
- if( CHROMA444 )
- {
- chromassd = ssd_plane( h, i_pixel, 1, (i8&1)*8, (i8>>1)*8 )
- + ssd_plane( h, i_pixel, 2, (i8&1)*8, (i8>>1)*8 );
- }
- else
- {
- chromassd = ssd_plane( h, i_pixel+3, 1, (i8&1)*4, (i8>>1)*4 )
- + ssd_plane( h, i_pixel+3, 2, (i8&1)*4, (i8>>1)*4 );
- }
- chromassd = ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
- i_ssd += chromassd;
+ int ssd_x = 8*(i8&1);
+ int ssd_y = 8*(i8>>1);
+ i_ssd = ssd_plane( h, i_pixel, 0, ssd_x, ssd_y );
+ int chromapix = h->luma2chroma_pixel[i_pixel];
+ int chromassd = ssd_plane( h, chromapix, 1, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift )
+ + ssd_plane( h, chromapix, 2, ssd_x>>h->mb.chroma_h_shift, ssd_y>>h->mb.chroma_v_shift );
+ i_ssd += ((uint64_t)chromassd * h->mb.i_chroma_lambda2_offset + 128) >> 8;
if( h->param.b_cabac )
{
return (i_ssd<<8) + i_bits;
}
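
Note: i_chroma_lambda2_offset above acts as a Q8 fixed-point scale (units of 1/256, with +128 rounding the final shift), so chroma distortion is re-weighted against luma without floating point. Sketch:

    static uint64_t weight_chroma_ssd( uint64_t ssd, int lambda2_offset )
    {
        return (ssd * lambda2_offset + 128) >> 8; /* round( ssd * offset / 256 ) */
    }
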
-static uint64_t x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
+static uint64_t x264_rd_cost_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
{
uint64_t i_ssd, i_bits;
if( b_dct )
- x264_mb_encode_8x8_chroma( h, 0, h->mb.i_chroma_qp );
- i_ssd = ssd_plane( h, PIXEL_8x8, 1, 0, 0 ) +
- ssd_plane( h, PIXEL_8x8, 2, 0, 0 );
+ x264_mb_encode_chroma( h, 0, h->mb.i_chroma_qp );
+
+ int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
+ i_ssd = ssd_plane( h, chromapix, 1, 0, 0 )
+ + ssd_plane( h, chromapix, 2, 0, 0 );
h->mb.i_chroma_pred_mode = i_mode;
{
x264_cabac_t cabac_tmp;
COPY_CABAC;
- x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
+ x264_chroma_size_cabac( h, &cabac_tmp );
i_bits = ( (uint64_t)cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
else
- i_bits = x264_i8x8_chroma_size_cavlc( h ) * i_lambda2;
+ i_bits = x264_chroma_size_cavlc( h ) * i_lambda2;
return (i_ssd<<8) + i_bits;
}
int ctx_block_cat, int i_lambda2, int b_ac,
int b_chroma, int dc, int i_coefs, int idx )
{
- int abs_coefs[64], signs[64];
+ udctcoef abs_coefs[64];
+ int8_t signs[64];
trellis_node_t nodes[2][8];
trellis_node_t *nodes_cur = nodes[0];
trellis_node_t *nodes_prev = nodes[1];
const int b_interlaced = MB_INTERLACED;
uint8_t *cabac_state_sig = &h->cabac.state[ significant_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
uint8_t *cabac_state_last = &h->cabac.state[ last_coeff_flag_offset[b_interlaced][ctx_block_cat] ];
+ const uint8_t *levelgt1_ctx = b_chroma && dc ? coeff_abs_levelgt1_ctx_chroma_dc : coeff_abs_levelgt1_ctx;
const int f = 1 << 15; // no deadzone
int i_last_nnz;
int i;
{
int coef = dct[zigzag[i]];
abs_coefs[i] = abs(coef);
- signs[i] = coef < 0 ? -1 : 1;
+ signs[i] = coef>>31 | 1;
}
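
Note: the new signs[] computation is a branchless sign extraction. Assuming a 32-bit int with arithmetic right shift (true on all platforms x264 targets), coef>>31 is -1 for negative coef and 0 otherwise, so OR-ing with 1 yields -1 or +1 with no conditional:

    int sign = coef >> 31 | 1; /* coef < 0 ? -1 : 1, without a branch */
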
/* init trellis */
{
// no need to calculate ssd of 0s: it's the same in all nodes.
// no need to modify level_tree for ctx=0: it starts with an infinite loop of 0s.
- int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i;
+ int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
+ b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
const uint32_t cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
* (uint64_t)i_lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
for( int j = 1; j < 8; j++ )
if( i < i_coefs-1 )
{
- int sigindex = i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] : i;
- int lastindex = i_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
+ int sigindex = !dc && i_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
+ b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
+ int lastindex = !dc && i_coefs == 64 ? last_coeff_flag_offset_8x8[i] :
+ b_chroma && dc && i_coefs == 8 ? coeff_flag_offset_chroma_422_dc[i] : i;
cost_sig[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
cost_sig[1] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
cost_last[0] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 );
f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[node_ctx]], i_prefix > 0 );
if( i_prefix > 0 )
{
- uint8_t *ctx = &n.cabac_state[coeff_abs_levelgt1_ctx[node_ctx]];
+ uint8_t *ctx = &n.cabac_state[levelgt1_ctx[node_ctx]];
f8_bits += cabac_size_unary[i_prefix][*ctx];
*ctx = cabac_transition_unary[i_prefix][*ctx];
if( abs_level >= 15 )
int64_t score = 1ULL<<62;
int i, j;
const int f = 1<<15;
- int nC = ctx_block_cat == DCT_CHROMA_DC ? 4 : ct_index[x264_mb_predict_non_zero_code( h, ctx_block_cat == DCT_LUMA_DC ? (idx - LUMA_DC)*16 : idx )];
+ int nC = b_chroma && dc ? 3 + (i_coefs>>2)
+ : ct_index[x264_mb_predict_non_zero_code( h, !b_chroma && dc ? (idx - LUMA_DC)*16 : idx )];
/* Code for handling 8x8dct -> 4x4dct CAVLC munging. Input/output use a different
* step/start/end than internal processing. */
return 0;
}
-const static uint8_t x264_zigzag_scan2[4] = {0,1,2,3};
-
-int x264_quant_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
- int i_qp, int ctx_block_cat, int b_intra, int b_chroma, int idx )
+int x264_quant_luma_dc_trellis( x264_t *h, dctcoef *dct, int i_quant_cat, int i_qp, int ctx_block_cat, int b_intra, int idx )
{
if( h->param.b_cabac )
return quant_trellis_cabac( h, dct,
- h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
- NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED],
- ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, idx );
+ h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED],
+ ctx_block_cat, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx );
+
+ return quant_trellis_cavlc( h, dct,
+ h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp], NULL, x264_zigzag_scan4[MB_INTERLACED],
+ DCT_LUMA_DC, h->mb.i_trellis_lambda2[0][b_intra], 0, 0, 1, 16, idx, 0 );
+}
- if( ctx_block_cat != DCT_CHROMA_DC )
- ctx_block_cat = DCT_LUMA_DC;
+static const uint8_t x264_zigzag_scan2x2[4] = { 0, 1, 2, 3 };
+static const uint8_t x264_zigzag_scan2x4[8] = { 0, 2, 1, 4, 6, 3, 5, 7 };
+
+int x264_quant_chroma_dc_trellis( x264_t *h, dctcoef *dct, int i_qp, int b_intra, int idx )
+{
+ const uint8_t *zigzag;
+ int num_coefs;
+ int quant_cat = CQM_4IC+1 - b_intra;
+
+ if( CHROMA_FORMAT == CHROMA_422 )
+ {
+ zigzag = x264_zigzag_scan2x4;
+ num_coefs = 8;
+ }
+ else
+ {
+ zigzag = x264_zigzag_scan2x2;
+ num_coefs = 4;
+ }
+
+ if( h->param.b_cabac )
+ return quant_trellis_cabac( h, dct,
+ h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag,
+ DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx );
return quant_trellis_cavlc( h, dct,
- h->quant4_mf[i_quant_cat][i_qp], h->unquant4_mf[i_quant_cat][i_qp],
- NULL, ctx_block_cat==DCT_CHROMA_DC ? x264_zigzag_scan2 : x264_zigzag_scan4[MB_INTERLACED],
- ctx_block_cat, h->mb.i_trellis_lambda2[b_chroma][b_intra], 0, b_chroma, 1, ctx_block_cat==DCT_CHROMA_DC ? 4 : 16, idx, 0 );
+ h->quant4_mf[quant_cat][i_qp], h->unquant4_mf[quant_cat][i_qp], NULL, zigzag,
+ DCT_CHROMA_DC, h->mb.i_trellis_lambda2[1][b_intra], 0, 1, 1, num_coefs, idx, 0 );
}
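
Note: x264_zigzag_scan2x4 maps scan position i to a raster index within the 2x4 grid of 4:2:2 chroma DC coefficients, so the trellis visits raster indices 0,2,1,4,6,3,5,7 in that order. Laid out over the grid:

    /* raster layout:   scan position that visits each coefficient:
     *   0 1               1st 3rd
     *   2 3               2nd 6th
     *   4 5               4th 7th
     *   6 7               5th 8th
     */
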
int x264_quant_4x4_trellis( x264_t *h, dctcoef *dct, int i_quant_cat,
sps->i_id = i_id;
sps->i_mb_width = ( param->i_width + 15 ) / 16;
sps->i_mb_height= ( param->i_height + 15 ) / 16;
- sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? 3 : 1;
+ sps->i_chroma_format_idc = csp >= X264_CSP_I444 ? CHROMA_444 :
+ csp >= X264_CSP_I422 ? CHROMA_422 : CHROMA_420;
sps->b_qpprime_y_zero_transform_bypass = param->rc.i_rc_method == X264_RC_CQP && param->rc.i_qp_constant == 0;
- if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == 3 )
+ if( sps->b_qpprime_y_zero_transform_bypass || sps->i_chroma_format_idc == CHROMA_444 )
sps->i_profile_idc = PROFILE_HIGH444_PREDICTIVE;
+ else if( sps->i_chroma_format_idc == CHROMA_422 )
+ sps->i_profile_idc = PROFILE_HIGH422;
else if( BIT_DEPTH > 8 )
sps->i_profile_idc = PROFILE_HIGH10;
else if( param->analyse.b_transform_8x8 || param->i_cqm_preset != X264_CQM_FLAT )
sps->b_constraint_set3 = 1; /* level 1b with Baseline, Main or Extended profile is signalled via constraint_set3 */
sps->i_level_idc = 11;
}
- /* High 10 Intra profile */
- if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH10 )
- sps->b_constraint_set3 = 1;
- /* High 4:4:4 Intra profile */
- if( param->i_keyint_max == 1 && sps->i_profile_idc == PROFILE_HIGH444_PREDICTIVE )
+ /* Intra profiles */
+ if( param->i_keyint_max == 1 && sps->i_profile_idc > PROFILE_HIGH )
sps->b_constraint_set3 = 1;
sps->vui.i_num_reorder_frames = param->i_bframe_pyramid ? 2 : param->i_bframe ? 1 : 0;
bs_write1( s, sps->b_crop );
if( sps->b_crop )
{
- int cropshift = sps->i_chroma_format_idc != 3;
- bs_write_ue( s, sps->crop.i_left >> cropshift );
- bs_write_ue( s, sps->crop.i_right >> cropshift );
- bs_write_ue( s, sps->crop.i_top >> cropshift );
- bs_write_ue( s, sps->crop.i_bottom >> cropshift );
+ int h_shift = sps->i_chroma_format_idc == CHROMA_420 || sps->i_chroma_format_idc == CHROMA_422;
+ int v_shift = sps->i_chroma_format_idc == CHROMA_420;
+ bs_write_ue( s, sps->crop.i_left >> h_shift );
+ bs_write_ue( s, sps->crop.i_right >> h_shift );
+ bs_write_ue( s, sps->crop.i_top >> v_shift );
+ bs_write_ue( s, sps->crop.i_bottom >> v_shift );
}
bs_write1( s, sps->b_vui );
int ret = 0;
int mbs = h->sps->i_mb_width * h->sps->i_mb_height;
int dpb = mbs * 384 * h->sps->vui.i_max_dec_frame_buffering;
- int cbp_factor = h->sps->i_profile_idc==PROFILE_HIGH444_PREDICTIVE ? 16 :
+ int cbp_factor = h->sps->i_profile_idc>=PROFILE_HIGH422 ? 16 :
h->sps->i_profile_idc==PROFILE_HIGH10 ? 12 :
h->sps->i_profile_idc==PROFILE_HIGH ? 5 : 4;
return ref->lowres[0];
}
-/* How data is organized for chroma weightp 4:2:0:
+/* How data is organized for 4:2:0/4:2:2 chroma weightp:
* [U: ref] [U: fenc]
* [V: ref] [V: fenc]
* fenc = ref + offset
- * v = u + stride * chroma height
- * We'll need more room if we do 4:2:2. */
+ * v = u + stride * chroma height */
static NOINLINE void x264_weight_cost_init_chroma( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, pixel *dstu, pixel *dstv )
{
int i_offset = i_stride / 2;
int i_lines = fenc->i_lines[1];
int i_width = fenc->i_width[1];
- int cw = h->mb.i_mb_width << 3;
- int ch = h->mb.i_mb_height << 3;
+ int v_shift = h->mb.chroma_v_shift;
+ int cw = 8*h->mb.i_mb_width;
+ int ch = 16*h->mb.i_mb_height >> v_shift;
+ int height = 16 >> v_shift;
if( fenc->lowres_mvs[0][ref0_distance][0][0] != 0x7FFF )
{
x264_frame_expand_border_chroma( h, ref, 1 );
- for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += 8, pel_offset_y = y*i_stride )
+ for( int y = 0, mb_xy = 0, pel_offset_y = 0; y < i_lines; y += height, pel_offset_y = y*i_stride )
for( int x = 0, pel_offset_x = 0; x < i_width; x += 8, mb_xy++, pel_offset_x += 8 )
{
pixel *pixu = dstu + pel_offset_y + pel_offset_x;
pixel *pixv = dstv + pel_offset_y + pel_offset_x;
- pixel *src1 = ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12 */
+ pixel *src1 = ref->plane[1] + pel_offset_y + pel_offset_x*2; /* NV12/NV16 */
int mvx = fenc->lowres_mvs[0][ref0_distance][mb_xy][0];
int mvy = fenc->lowres_mvs[0][ref0_distance][mb_xy][1];
- h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, mvy, 8, 8 );
+ h->mc.mc_chroma( pixu, pixv, i_stride, src1, i_stride, mvx, 2*mvy>>v_shift, 8, height );
}
}
else
int i_lines = fenc->i_lines[1];
int i_width = fenc->i_width[1];
pixel *src = ref + i_offset;
- ALIGNED_ARRAY_16( pixel, buf, [8*8] );
+ ALIGNED_ARRAY_16( pixel, buf, [8*16] );
int pixoff = 0;
+ int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
+ int height = 16 >> h->mb.chroma_v_shift;
ALIGNED_16( static pixel flat[8] ) = {0};
if( w )
{
- for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+ for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
for( int x = 0; x < i_width; x += 8, pixoff += 8 )
{
- w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, 8 );
+ w->weightfn[8>>2]( buf, 8, &ref[pixoff], i_stride, w, height );
/* The naive and seemingly sensible algorithm is to use mbcmp as in luma.
* But testing shows that for chroma the DC coefficient is by far the most
* important part of the coding cost. Thus a more useful chroma weight is
* pixels.
*
* FIXME: add a (faster) asm sum function to replace sad. */
- cost += abs( h->pixf.sad_aligned[PIXEL_8x8]( buf, 8, flat, 0 ) -
- h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) );
+ cost += abs( h->pixf.sad_aligned[chromapix]( buf, 8, flat, 0 ) -
+ h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
}
cost += x264_weight_slice_header_cost( h, w, 1 );
}
else
- for( int y = 0; y < i_lines; y += 8, pixoff = y*i_stride )
+ for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
for( int x = 0; x < i_width; x += 8, pixoff += 8 )
- cost += abs( h->pixf.sad_aligned[PIXEL_8x8]( &ref[pixoff], i_stride, flat, 0 ) -
- h->pixf.sad_aligned[PIXEL_8x8]( &src[pixoff], i_stride, flat, 0 ) );
+ cost += abs( h->pixf.sad_aligned[chromapix]( &ref[pixoff], i_stride, flat, 0 ) -
+ h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
x264_emms();
return cost;
}
return csp_mask == X264_CSP_I420 ||
csp_mask == X264_CSP_I422 ||
csp_mask == X264_CSP_I444 ||
- csp_mask == X264_CSP_YV24 ||
csp_mask == X264_CSP_YV12 ||
- csp_mask == X264_CSP_NV12;
+ csp_mask == X264_CSP_YV16 ||
+ csp_mask == X264_CSP_YV24 ||
+ csp_mask == X264_CSP_NV12 ||
+ csp_mask == X264_CSP_NV16;
}
static int csp_num_interleaved( int csp, int plane )
{
int csp_mask = csp & X264_CSP_MASK;
- return ( csp_mask == X264_CSP_NV12 && plane == 1 ) ? 2 : 1;
+ return ( (csp_mask == X264_CSP_NV12 || csp_mask == X264_CSP_NV16) && plane == 1 ) ? 2 : 1;
}
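
Note: NV12/NV16 carry U and V interleaved in the second plane (two components per sample position), which is why csp_num_interleaved reports 2 for plane 1. A hypothetical de-interleave of one chroma row, for illustration only:

    static void deinterleave_uv_row( uint8_t *u, uint8_t *v, const uint8_t *uv, int width )
    {
        for( int i = 0; i < width; i++ )
        {
            u[i] = uv[2*i+0]; /* even bytes: U */
            v[i] = uv[2*i+1]; /* odd bytes: V  */
        }
    }
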
/* The dithering algorithm is based on Sierra-2-4A error diffusion. It has been
{
case X264_CSP_YV12: /* specially handled via swapping chroma */
case X264_CSP_I420: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV420P16 : PIX_FMT_YUV420P;
+ case X264_CSP_YV16: /* specially handled via swapping chroma */
case X264_CSP_I422: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV422P16 : PIX_FMT_YUV422P;
case X264_CSP_YV24: /* specially handled via swapping chroma */
case X264_CSP_I444: return csp&X264_CSP_HIGH_DEPTH ? PIX_FMT_YUV444P16 : PIX_FMT_YUV444P;
h->dst.pix_fmt = convert_csp_to_pix_fmt( h->dst_csp );
h->scale = h->dst;
- /* swap chroma planes if YV12/YV24 is involved, as libswscale works with I420/I444 */
+ /* swap chroma planes if YV12/YV16/YV24 is involved, as libswscale works with I420/I422/I444 */
int src_csp = info->csp & (X264_CSP_MASK | X264_CSP_OTHER);
int dst_csp = h->dst_csp & (X264_CSP_MASK | X264_CSP_OTHER);
- h->pre_swap_chroma = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV24;
- h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV24;
+ h->pre_swap_chroma = src_csp == X264_CSP_YV12 || src_csp == X264_CSP_YV16 || src_csp == X264_CSP_YV24;
+ h->post_swap_chroma = dst_csp == X264_CSP_YV12 || dst_csp == X264_CSP_YV16 || dst_csp == X264_CSP_YV24;
int src_pix_fmt = convert_csp_to_pix_fmt( info->csp );
}
#if !HAVE_SWSCALE
/* if swscale is not available, convert the CSP if necessary */
- if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) ||
- (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) )
+ if( (opt->output_csp == X264_CSP_I420 && !avs_is_yv12( vi )) || (opt->output_csp == X264_CSP_I422 && !avs_is_yv16( vi )) ||
+ (opt->output_csp == X264_CSP_I444 && !avs_is_yv24( vi )) || (opt->output_csp == X264_CSP_RGB && !avs_is_rgb( vi )) )
{
- FAIL_IF_ERROR( avs_version < 2.6f && opt->output_csp == X264_CSP_I444, "avisynth >= 2.6 is required for i444 output\n" )
+ FAIL_IF_ERROR( avs_version < 2.6f && (opt->output_csp == X264_CSP_I422 || opt->output_csp == X264_CSP_I444),
+ "avisynth >= 2.6 is required for i422/i444 output\n" )
- const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" : (opt->output_csp == X264_CSP_I444 ? "YV24" : "RGB");
+ const char *csp = opt->output_csp == X264_CSP_I420 ? "YV12" :
+ opt->output_csp == X264_CSP_I422 ? "YV16" :
+ opt->output_csp == X264_CSP_I444 ? "YV24" : "RGB";
x264_cli_log( "avs", X264_LOG_WARNING, "converting input clip to %s\n", csp );
- FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && (vi->width&1 || vi->height&1),
- "input clip width or height not divisible by 2 (%dx%d)\n", vi->width, vi->height )
+ FAIL_IF_ERROR( opt->output_csp < X264_CSP_I444 && (vi->width&1),
+ "input clip width not divisible by 2 (%dx%d)\n", vi->width, vi->height )
+ FAIL_IF_ERROR( opt->output_csp == X264_CSP_I420 && info->interlaced && (vi->height&3),
+ "input clip height not divisible by 4 (%dx%d)\n", vi->width, vi->height )
+ FAIL_IF_ERROR( (opt->output_csp == X264_CSP_I420 || info->interlaced) && (vi->height&1),
+ "input clip height not divisible by 2 (%dx%d)\n", vi->width, vi->height )
const char *arg_name[2] = { NULL, "interlaced" };
AVS_Value arg_arr[2] = { res, avs_new_value_bool( info->interlaced ) };
char conv_func[14] = { "ConvertTo" };
info->csp = X264_CSP_BGR | X264_CSP_VFLIP;
else if( avs_is_yv24( vi ) )
info->csp = X264_CSP_I444;
+ else if( avs_is_yv16( vi ) )
+ info->csp = X264_CSP_I422;
else if( avs_is_yv12( vi ) )
info->csp = X264_CSP_I420;
#if HAVE_SWSCALE
else if( avs_is_yuy2( vi ) )
info->csp = PIX_FMT_YUYV422 | X264_CSP_OTHER;
- else if( avs_is_yv16( vi ) )
- info->csp = X264_CSP_I422;
else if( avs_is_yv411( vi ) )
info->csp = PIX_FMT_YUV411P | X264_CSP_OTHER;
else if( avs_is_y8( vi ) )
[X264_CSP_I420] = { "i420", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
[X264_CSP_I422] = { "i422", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 },
[X264_CSP_I444] = { "i444", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 },
- [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 },
[X264_CSP_YV12] = { "yv12", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
+ [X264_CSP_YV16] = { "yv16", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 },
+ [X264_CSP_YV24] = { "yv24", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 },
[X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 },
+ [X264_CSP_NV16] = { "nv16", 2, { 1, 1 }, { 1, 1 }, 2, 1 },
[X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 },
[X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 },
[X264_CSP_RGB] = { "rgb", 1, { 3 }, { 1 }, 1, 1 },
extern cli_input_t cli_input;
/* extended colorspace list that isn't supported by libx264 but by the cli */
-#define X264_CSP_I422 X264_CSP_MAX /* yuv 4:2:2 planar */
-#define X264_CSP_CLI_MAX (X264_CSP_MAX+1) /* end of list */
+#define X264_CSP_CLI_MAX X264_CSP_MAX /* end of list */
#define X264_CSP_OTHER 0x4000 /* non x264 colorspace */
typedef struct
char func_name[100];
static bench_func_t benchs[MAX_FUNCS];
-static const char *pixel_names[10] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x2", "2x4", "2x2" };
+static const char *pixel_names[12] = { "16x16", "16x8", "8x16", "8x8", "8x4", "4x8", "4x4", "4x16", "4x2", "2x8", "2x4", "2x2" };
static const char *intra_predict_16x16_names[7] = { "v", "h", "dc", "p", "dcl", "dct", "dc8" };
static const char *intra_predict_8x8c_names[7] = { "dc", "h", "v", "p", "dcl", "dct", "dc8" };
static const char *intra_predict_4x4_names[12] = { "v", "h", "dc", "ddl", "ddr", "vr", "hd", "vl", "hu", "dcl", "dct", "dc8" };
static const char **intra_predict_8x8_names = intra_predict_4x4_names;
+static const char **intra_predict_8x16c_names = intra_predict_8x8c_names;
#define set_func_name(...) snprintf( func_name, sizeof(func_name), __VA_ARGS__ )
#define TEST_PIXEL( name, align ) \
ok = 1, used_asm = 0; \
- for( int i = 0; i < 7; i++ ) \
+ for( int i = 0; i < 8; i++ ) \
{ \
int res_c, res_asm; \
if( pixel_asm.name[i] != pixel_ref.name[i] ) \
ok = 1; used_asm = 0;
TEST_PIXEL_VAR( PIXEL_16x16 );
+ TEST_PIXEL_VAR( PIXEL_8x16 );
TEST_PIXEL_VAR( PIXEL_8x8 );
report( "pixel var :" );
- ok = 1; used_asm = 0;
- if( pixel_asm.var2_8x8 != pixel_ref.var2_8x8 )
- {
- int res_c, res_asm, ssd_c, ssd_asm;
- set_func_name( "var2_8x8" );
- used_asm = 1;
- res_c = call_c( pixel_c.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_c );
- res_asm = call_a( pixel_asm.var2_8x8, pbuf1, 16, pbuf2, 16, &ssd_asm );
- if( res_c != res_asm || ssd_c != ssd_asm )
- {
- ok = 0;
- fprintf( stderr, "var2_8x8: %d != %d or %d != %d [FAILED]\n", res_c, res_asm, ssd_c, ssd_asm );
- }
+#define TEST_PIXEL_VAR2( i ) \
+ if( pixel_asm.var2[i] != pixel_ref.var2[i] ) \
+ { \
+ int res_c, res_asm, ssd_c, ssd_asm; \
+ set_func_name( "%s_%s", "var2", pixel_names[i] ); \
+ used_asm = 1; \
+ res_c = call_c( pixel_c.var2[i], pbuf1, 16, pbuf2, 16, &ssd_c ); \
+ res_asm = call_a( pixel_asm.var2[i], pbuf1, 16, pbuf2, 16, &ssd_asm ); \
+ if( res_c != res_asm || ssd_c != ssd_asm ) \
+ { \
+ ok = 0; \
+ fprintf( stderr, "var2[%d]: %d != %d or %d != %d [FAILED]\n", i, res_c, res_asm, ssd_c, ssd_asm ); \
+ } \
}
+ ok = 1; used_asm = 0;
+ TEST_PIXEL_VAR2( PIXEL_8x16 );
+ TEST_PIXEL_VAR2( PIXEL_8x8 );
report( "pixel var2 :" );
ok = 1; used_asm = 0;
memcpy( pbuf3, pbuf2, 20*FDEC_STRIDE*sizeof(pixel) );
ok = 1; used_asm = 0;
TEST_INTRA_X3( intra_satd_x3_16x16, 0 );
+ TEST_INTRA_X3( intra_satd_x3_8x16c, 0 );
TEST_INTRA_X3( intra_satd_x3_8x8c, 0 );
TEST_INTRA_X3( intra_sa8d_x3_8x8, 1, edge );
TEST_INTRA_X3( intra_satd_x3_4x4, 0 );
report( "intra satd_x3 :" );
ok = 1; used_asm = 0;
TEST_INTRA_X3( intra_sad_x3_16x16, 0 );
+ TEST_INTRA_X3( intra_sad_x3_8x16c, 0 );
TEST_INTRA_X3( intra_sad_x3_8x8c, 0 );
TEST_INTRA_X3( intra_sad_x3_8x8, 1, edge );
TEST_INTRA_X3( intra_sad_x3_4x4, 0 );
ALIGNED_16( dctcoef dct2[16][16] );
ALIGNED_16( dctcoef dct4[16][16] );
ALIGNED_16( dctcoef dct8[4][64] );
- ALIGNED_16( dctcoef dctdc[2][4] );
+ ALIGNED_16( dctcoef dctdc[2][8] );
x264_t h_buf;
x264_t *h = &h_buf;
TEST_DCT( sub4x4_dct, dct1[0], dct2[0], 16 );
TEST_DCT( sub8x8_dct, dct1, dct2, 16*4 );
TEST_DCT( sub8x8_dct_dc, dctdc[0], dctdc[1], 4 );
+ TEST_DCT( sub8x16_dct_dc, dctdc[0], dctdc[1], 8 );
TEST_DCT( sub16x16_dct, dct1, dct2, 16*16 );
report( "sub_dct4 :" );
TEST_DCTDC( idct4x4dc );
#undef TEST_DCTDC
+#define TEST_DCTDC_CHROMA( name )\
+ ok = 1; used_asm = 0;\
+ if( dct_asm.name != dct_ref.name )\
+ {\
+ set_func_name( #name );\
+ used_asm = 1;\
+ uint16_t *p = (uint16_t*)buf1;\
+ for( int i = 0; i < 16 && ok; i++ )\
+ {\
+ for( int j = 0; j < 8; j++ )\
+ dct1[j][0] = !i ? (j^j>>1^j>>2)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max dc */\
+ : i<8 ? (*p++)&1 ? PIXEL_MAX*16 : -PIXEL_MAX*16 /* max elements */\
+ : ((*p++)&0x1fff)-0x1000; /* general case */\
+ memcpy( dct2, dct1, 8*16 * sizeof(dctcoef) );\
+ call_c1( dct_c.name, dctdc[0], dct1 );\
+ call_a1( dct_asm.name, dctdc[1], dct2 );\
+ if( memcmp( dctdc[0], dctdc[1], 8 * sizeof(dctcoef) ) || memcmp( dct1, dct2, 8*16 * sizeof(dctcoef) ) )\
+ {\
+ ok = 0;\
+ fprintf( stderr, #name " [FAILED]\n" ); \
+ }\
+ }\
+ call_c2( dct_c.name, dctdc[0], dct1 );\
+ call_a2( dct_asm.name, dctdc[1], dct2 );\
+ }\
+ report( #name " :" );
+
+ TEST_DCTDC_CHROMA( dct2x4dc );
+#undef TEST_DCTDC_CHROMA
+
x264_zigzag_function_t zigzag_c[2];
x264_zigzag_function_t zigzag_ref[2];
x264_zigzag_function_t zigzag_asm[2];
#define MC_TEST_AVG( name, weight ) \
{ \
ok = 1, used_asm = 0; \
- for( int i = 0; i < 10; i++ ) \
+ for( int i = 0; i < 12; i++ ) \
{ \
memcpy( pbuf3, pbuf1+320, 320 * sizeof(pixel) ); \
memcpy( pbuf4, pbuf1+320, 320 * sizeof(pixel) ); \
report( "mc offsetsub :" );
ok = 1; used_asm = 0;
- if( mc_a.store_interleave_8x8x2 != mc_ref.store_interleave_8x8x2 )
- {
- set_func_name( "store_interleave_8x8x2" );
- used_asm = 1;
- memset( pbuf3, 0, 64*8 );
- memset( pbuf4, 0, 64*8 );
- call_c( mc_c.store_interleave_8x8x2, pbuf3, 64, pbuf1, pbuf1+16 );
- call_a( mc_a.store_interleave_8x8x2, pbuf4, 64, pbuf1, pbuf1+16 );
- if( memcmp( pbuf3, pbuf4, 64*8 ) )
- ok = 0;
- }
- if( mc_a.load_deinterleave_8x8x2_fenc != mc_ref.load_deinterleave_8x8x2_fenc )
- {
- set_func_name( "load_deinterleave_8x8x2_fenc" );
- used_asm = 1;
- call_c( mc_c.load_deinterleave_8x8x2_fenc, pbuf3, pbuf1, 64 );
- call_a( mc_a.load_deinterleave_8x8x2_fenc, pbuf4, pbuf1, 64 );
- if( memcmp( pbuf3, pbuf4, FENC_STRIDE*8 ) )
- ok = 0;
- }
- if( mc_a.load_deinterleave_8x8x2_fdec != mc_ref.load_deinterleave_8x8x2_fdec )
+ for( int height = 8; height <= 16; height += 8 )
{
- set_func_name( "load_deinterleave_8x8x2_fdec" );
- used_asm = 1;
- call_c( mc_c.load_deinterleave_8x8x2_fdec, pbuf3, pbuf1, 64 );
- call_a( mc_a.load_deinterleave_8x8x2_fdec, pbuf4, pbuf1, 64 );
- if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*8 ) )
- ok = 0;
+ if( mc_a.store_interleave_chroma != mc_ref.store_interleave_chroma )
+ {
+ set_func_name( "store_interleave_chroma" );
+ used_asm = 1;
+ memset( pbuf3, 0, 64*height );
+ memset( pbuf4, 0, 64*height );
+ call_c( mc_c.store_interleave_chroma, pbuf3, 64, pbuf1, pbuf1+16, height );
+ call_a( mc_a.store_interleave_chroma, pbuf4, 64, pbuf1, pbuf1+16, height );
+ if( memcmp( pbuf3, pbuf4, 64*height ) )
+ {
+ ok = 0;
+ fprintf( stderr, "store_interleave_chroma FAILED: h=%d\n", height );
+ break;
+ }
+ }
+ if( mc_a.load_deinterleave_chroma_fenc != mc_ref.load_deinterleave_chroma_fenc )
+ {
+ set_func_name( "load_deinterleave_chroma_fenc" );
+ used_asm = 1;
+ call_c( mc_c.load_deinterleave_chroma_fenc, pbuf3, pbuf1, 64, height );
+ call_a( mc_a.load_deinterleave_chroma_fenc, pbuf4, pbuf1, 64, height );
+ if( memcmp( pbuf3, pbuf4, FENC_STRIDE*height ) )
+ {
+ ok = 0;
+ fprintf( stderr, "load_deinterleave_chroma_fenc FAILED: h=%d\n", height );
+ break;
+ }
+ }
+ if( mc_a.load_deinterleave_chroma_fdec != mc_ref.load_deinterleave_chroma_fdec )
+ {
+ set_func_name( "load_deinterleave_chroma_fdec" );
+ used_asm = 1;
+ call_c( mc_c.load_deinterleave_chroma_fdec, pbuf3, pbuf1, 64, height );
+ call_a( mc_a.load_deinterleave_chroma_fdec, pbuf4, pbuf1, 64, height );
+ if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*height ) )
+ {
+ ok = 0;
+ fprintf( stderr, "load_deinterleave_chroma_fdec FAILED: h=%d\n", height );
+ break;
+ }
+ }
}
report( "store_interleave :" );
TEST_DEBLOCK( deblock_luma[0], 0, tcs[i] );
TEST_DEBLOCK( deblock_luma[1], 1, tcs[i] );
- TEST_DEBLOCK( deblock_chroma[0], 0, tcs[i] );
+ TEST_DEBLOCK( deblock_h_chroma_420, 0, tcs[i] );
+ TEST_DEBLOCK( deblock_h_chroma_422, 0, tcs[i] );
TEST_DEBLOCK( deblock_chroma[1], 1, tcs[i] );
TEST_DEBLOCK( deblock_luma_intra[0], 0 );
TEST_DEBLOCK( deblock_luma_intra[1], 1 );
- TEST_DEBLOCK( deblock_chroma_intra[0], 0 );
+ TEST_DEBLOCK( deblock_h_chroma_420_intra, 0 );
+ TEST_DEBLOCK( deblock_h_chroma_422_intra, 0 );
TEST_DEBLOCK( deblock_chroma_intra[1], 1 );
if( db_a.deblock_strength != db_ref.deblock_strength )
x264_quant_function_t qf_a;
ALIGNED_16( dctcoef dct1[64] );
ALIGNED_16( dctcoef dct2[64] );
+ ALIGNED_16( dctcoef dct3[8][16] );
+ ALIGNED_16( dctcoef dct4[8][16] );
ALIGNED_16( uint8_t cqm_buf[64] );
int ret = 0, ok, used_asm;
int oks[3] = {1,1,1}, used_asms[3] = {0,0,0};
for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
{ \
INIT_QUANT##w(1) \
- call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
+ qf_c.qname( dct1, h->quant##w##_mf[block][qp], h->quant##w##_bias[block][qp] ); \
memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
{ \
for( int i = 0; i < 16; i++ ) \
dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16; \
- call_c1( qf_c.qname, dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
+ qf_c.qname( dct1, h->quant##w##_mf[block][qp][0]>>1, h->quant##w##_bias[block][qp][0]>>1 ); \
memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
call_c1( qf_c.dqname, dct1, h->dequant##w##_mf[block], qp ); \
call_a1( qf_a.dqname, dct2, h->dequant##w##_mf[block], qp ); \
TEST_DEQUANT_DC( quant_4x4_dc, dequant_4x4_dc, CQM_4IY, 4 );
-#define TEST_OPTIMIZE_CHROMA_DC( qname, optname, w ) \
+ if( qf_a.idct_dequant_2x4_dc != qf_ref.idct_dequant_2x4_dc )
+ {
+ set_func_name( "idct_dequant_2x4_dc_%s", i_cqm?"cqm":"flat" );
+ used_asms[1] = 1;
+ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- )
+ {
+ for( int i = 0; i < 8; i++ )
+ dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
+ qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
+ qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
+ call_c( qf_c.idct_dequant_2x4_dc, dct1, dct3, h->dequant4_mf[CQM_4IC], qp+3 );
+ call_a( qf_a.idct_dequant_2x4_dc, dct1, dct4, h->dequant4_mf[CQM_4IC], qp+3 );
+ for( int i = 0; i < 8; i++ )
+ if( dct3[i][0] != dct4[i][0] )
+ {
+ oks[1] = 0;
+ fprintf( stderr, "idct_dequant_2x4_dc (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
+ break;
+ }
+ }
+ }
+
+ if( qf_a.idct_dequant_2x4_dconly != qf_ref.idct_dequant_2x4_dconly )
+ {
+ set_func_name( "idct_dequant_2x4_dconly_%s", i_cqm?"cqm":"flat" );
+ used_asms[1] = 1;
+ for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- )
+ {
+ for( int i = 0; i < 8; i++ )
+ dct1[i] = rand()%(PIXEL_MAX*16*2+1) - PIXEL_MAX*16;
+ qf_c.quant_2x2_dc( &dct1[0], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
+ qf_c.quant_2x2_dc( &dct1[4], h->quant4_mf[CQM_4IC][qp+3][0]>>1, h->quant4_bias[CQM_4IC][qp+3][0]>>1 );
+ memcpy( dct2, dct1, 8*sizeof(dctcoef) );
+ call_c1( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
+ call_a1( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
+ if( memcmp( dct1, dct2, 8*sizeof(dctcoef) ) )
+ {
+ oks[1] = 0;
+ fprintf( stderr, "idct_dequant_2x4_dconly (qp=%d, cqm=%d): [FAILED]\n", qp, i_cqm );
+ break;
+ }
+ call_c2( qf_c.idct_dequant_2x4_dconly, dct1, h->dequant4_mf[CQM_4IC], qp+3 );
+ call_a2( qf_a.idct_dequant_2x4_dconly, dct2, h->dequant4_mf[CQM_4IC], qp+3 );
+ }
+ }
+
+#define TEST_OPTIMIZE_CHROMA_DC( optname, size ) \
if( qf_a.optname != qf_ref.optname ) \
{ \
set_func_name( #optname ); \
used_asms[2] = 1; \
for( int qp = h->param.rc.i_qp_max; qp >= h->param.rc.i_qp_min; qp-- ) \
{ \
- int dmf = h->dequant4_mf[CQM_4IC][qp%6][0] << qp/6; \
+ int qpdc = qp + (size == 8 ? 3 : 0); \
+ int dmf = h->dequant4_mf[CQM_4IC][qpdc%6][0] << qpdc/6; \
if( dmf > 32*64 ) \
continue; \
- for( int i = 16; ; i <<= 1 )\
+ for( int i = 16; ; i <<= 1 ) \
{ \
int res_c, res_asm; \
int max = X264_MIN( i, PIXEL_MAX*16 ); \
- for( int j = 0; j < w*w; j++ ) \
+ for( int j = 0; j < size; j++ ) \
dct1[j] = rand()%(max*2+1) - max; \
- call_c1( qf_c.qname, dct1, h->quant4_mf[CQM_4IC][qp][0]>>1, h->quant4_bias[CQM_4IC][qp][0]>>1 ); \
- memcpy( dct2, dct1, w*w*sizeof(dctcoef) ); \
+ for( int j = 0; j < size; j += 4 ) \
+ qf_c.quant_2x2_dc( &dct1[j], h->quant4_mf[CQM_4IC][qpdc][0]>>1, h->quant4_bias[CQM_4IC][qpdc][0]>>1 ); \
+ memcpy( dct2, dct1, size*sizeof(dctcoef) ); \
res_c = call_c1( qf_c.optname, dct1, dmf ); \
res_asm = call_a1( qf_a.optname, dct2, dmf ); \
- if( res_c != res_asm || memcmp( dct1, dct2, w*w*sizeof(dctcoef) ) ) \
+ if( res_c != res_asm || memcmp( dct1, dct2, size*sizeof(dctcoef) ) ) \
{ \
oks[2] = 0; \
fprintf( stderr, #optname "(qp=%d, res_c=%d, res_asm=%d): [FAILED]\n", qp, res_c, res_asm ); \
} \
}
- TEST_OPTIMIZE_CHROMA_DC( quant_2x2_dc, optimize_chroma_dc, 2 );
+ TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x2_dc, 4 );
+ TEST_OPTIMIZE_CHROMA_DC( optimize_chroma_2x4_dc, 8 );
x264_cqm_delete( h );
}
TEST_DECIMATE( decimate_score15, 4, 1, 7 );
report( "decimate_score :" );
-#define TEST_LAST( last, lastname, w, ac ) \
+#define TEST_LAST( last, lastname, size, ac ) \
if( qf_a.last != qf_ref.last ) \
{ \
set_func_name( #lastname ); \
for( int i = 0; i < 100; i++ ) \
{ \
int nnz = 0; \
- int max = rand() & (w*w-1); \
- memset( dct1, 0, w*w*sizeof(dctcoef) ); \
+ int max = rand() & (size-1); \
+ memset( dct1, 0, size*sizeof(dctcoef) ); \
for( int idx = ac; idx < max; idx++ ) \
nnz |= dct1[idx] = !(rand()&3) + (!(rand()&15))*rand(); \
if( !nnz ) \
}
ok = 1; used_asm = 0;
- TEST_LAST( coeff_last[DCT_CHROMA_DC], coeff_last4, 2, 0 );
- TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 4, 1 );
- TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 4, 0 );
- TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 8, 0 );
+ TEST_LAST( coeff_last4 , coeff_last4, 4, 0 );
+ TEST_LAST( coeff_last8 , coeff_last8, 8, 0 );
+ TEST_LAST( coeff_last[ DCT_LUMA_AC], coeff_last15, 16, 1 );
+ TEST_LAST( coeff_last[ DCT_LUMA_4x4], coeff_last16, 16, 0 );
+ TEST_LAST( coeff_last[ DCT_LUMA_8x8], coeff_last64, 64, 0 );
report( "coeff_last :" );
-#define TEST_LEVELRUN( lastname, name, w, ac ) \
+#define TEST_LEVELRUN( lastname, name, size, ac ) \
if( qf_a.lastname != qf_ref.lastname ) \
{ \
set_func_name( #name ); \
{ \
x264_run_level_t runlevel_c, runlevel_a; \
int nnz = 0; \
- int max = rand() & (w*w-1); \
- memset( dct1, 0, w*w*sizeof(dctcoef) ); \
+ int max = rand() & (size-1); \
+ memset( dct1, 0, size*sizeof(dctcoef) ); \
memcpy( &runlevel_a, buf1+i, sizeof(x264_run_level_t) ); \
memcpy( &runlevel_c, buf1+i, sizeof(x264_run_level_t) ); \
for( int idx = ac; idx < max; idx++ ) \
}
ok = 1; used_asm = 0;
- TEST_LEVELRUN( coeff_level_run[DCT_CHROMA_DC], coeff_level_run4, 2, 0 );
- TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 4, 1 );
- TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 4, 0 );
+ TEST_LEVELRUN( coeff_level_run4 , coeff_level_run4, 4, 0 );
+ TEST_LEVELRUN( coeff_level_run8 , coeff_level_run8, 8, 0 );
+ TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_AC], coeff_level_run15, 16, 1 );
+ TEST_LEVELRUN( coeff_level_run[ DCT_LUMA_4x4], coeff_level_run16, 16, 0 );
report( "coeff_level_run :" );
return ret;
{
x264_predict_t predict_16x16[4+3];
x264_predict_t predict_8x8c[4+3];
+ x264_predict_t predict_8x16c[4+3];
x264_predict8x8_t predict_8x8[9+3];
x264_predict_t predict_4x4[9+3];
x264_predict_8x8_filter_t predict_8x8_filter;
x264_predict_16x16_init( 0, ip_c.predict_16x16 );
x264_predict_8x8c_init( 0, ip_c.predict_8x8c );
+ x264_predict_8x16c_init( 0, ip_c.predict_8x16c );
x264_predict_8x8_init( 0, ip_c.predict_8x8, &ip_c.predict_8x8_filter );
x264_predict_4x4_init( 0, ip_c.predict_4x4 );
x264_predict_16x16_init( cpu_ref, ip_ref.predict_16x16 );
x264_predict_8x8c_init( cpu_ref, ip_ref.predict_8x8c );
+ x264_predict_8x16c_init( cpu_ref, ip_ref.predict_8x16c );
x264_predict_8x8_init( cpu_ref, ip_ref.predict_8x8, &ip_ref.predict_8x8_filter );
x264_predict_4x4_init( cpu_ref, ip_ref.predict_4x4 );
x264_predict_16x16_init( cpu_new, ip_a.predict_16x16 );
x264_predict_8x8c_init( cpu_new, ip_a.predict_8x8c );
+ x264_predict_8x16c_init( cpu_new, ip_a.predict_8x16c );
x264_predict_8x8_init( cpu_new, ip_a.predict_8x8, &ip_a.predict_8x8_filter );
x264_predict_4x4_init( cpu_new, ip_a.predict_4x4 );
ip_c.predict_8x8_filter( fdec+48, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
-#define INTRA_TEST( name, dir, w, align, bench, ... )\
+#define INTRA_TEST( name, dir, w, h, align, bench, ... )\
if( ip_a.name[dir] != ip_ref.name[dir] )\
{\
set_func_name( "intra_%s_%s", #name, intra_##name##_names[dir] );\
for( int k = -1; k < 16; k++ )\
printf( "%2x ", edge[16+k] );\
printf( "\n" );\
- for( int j = 0; j < w; j++ )\
+ for( int j = 0; j < h; j++ )\
{\
printf( "%2x ", edge[14-j] );\
for( int k = 0; k < w; k++ )\
printf( "\n" );\
}\
printf( "\n" );\
- for( int j = 0; j < w; j++ )\
+ for( int j = 0; j < h; j++ )\
{\
printf( " " );\
for( int k = 0; k < w; k++ )\
}
for( int i = 0; i < 12; i++ )
- INTRA_TEST( predict_4x4, i, 4, 4, );
+ INTRA_TEST( predict_4x4, i, 4, 4, 4, );
+ for( int i = 0; i < 7; i++ )
+ INTRA_TEST( predict_8x8c, i, 8, 8, 16, );
for( int i = 0; i < 7; i++ )
- INTRA_TEST( predict_8x8c, i, 8, 16, );
+ INTRA_TEST( predict_8x16c, i, 8, 16, 16, );
for( int i = 0; i < 7; i++ )
- INTRA_TEST( predict_16x16, i, 16, 16, );
+ INTRA_TEST( predict_16x16, i, 16, 16, 16, );
for( int i = 0; i < 12; i++ )
- INTRA_TEST( predict_8x8, i, 8, 8, , edge );
+ INTRA_TEST( predict_8x8, i, 8, 8, 8, , edge );
set_func_name("intra_predict_8x8_filter");
if( ip_a.predict_8x8_filter != ip_ref.predict_8x8_filter )
}
}
-#define EXTREMAL_PLANE(size) \
+#define EXTREMAL_PLANE( w, h ) \
{ \
int max[7]; \
for( int j = 0; j < 7; j++ ) \
max[j] = test ? rand()&PIXEL_MAX : PIXEL_MAX; \
fdec[48-1-FDEC_STRIDE] = (i&1)*max[0]; \
- for( int j = 0; j < size/2; j++ ) \
+ for( int j = 0; j < w/2; j++ ) \
fdec[48+j-FDEC_STRIDE] = (!!(i&2))*max[1]; \
- for( int j = size/2; j < size-1; j++ ) \
+ for( int j = w/2; j < w-1; j++ ) \
fdec[48+j-FDEC_STRIDE] = (!!(i&4))*max[2]; \
- fdec[48+(size-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
- for( int j = 0; j < size/2; j++ ) \
+ fdec[48+(w-1)-FDEC_STRIDE] = (!!(i&8))*max[3]; \
+ for( int j = 0; j < h/2; j++ ) \
fdec[48+j*FDEC_STRIDE-1] = (!!(i&16))*max[4]; \
- for( int j = size/2; j < size-1; j++ ) \
+ for( int j = h/2; j < h-1; j++ ) \
fdec[48+j*FDEC_STRIDE-1] = (!!(i&32))*max[5]; \
- fdec[48+(size-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
+ fdec[48+(h-1)*FDEC_STRIDE-1] = (!!(i&64))*max[6]; \
}
/* Extremal test case for planar prediction. */
for( int test = 0; test < 100 && ok; test++ )
for( int i = 0; i < 128 && ok; i++ )
{
- EXTREMAL_PLANE( 8 );
- INTRA_TEST( predict_8x8c, I_PRED_CHROMA_P, 8, 64, 1 );
- EXTREMAL_PLANE( 16 );
- INTRA_TEST( predict_16x16, I_PRED_16x16_P, 16, 64, 1 );
+ EXTREMAL_PLANE( 8, 8 );
+ INTRA_TEST( predict_8x8c, I_PRED_CHROMA_P, 8, 8, 64, 1 );
+ EXTREMAL_PLANE( 8, 16 );
+ INTRA_TEST( predict_8x16c, I_PRED_CHROMA_P, 8, 16, 64, 1 );
+ EXTREMAL_PLANE( 16, 16 );
+ INTRA_TEST( predict_16x16, I_PRED_16x16_P, 16, 16, 64, 1 );
}
report( "intra pred :" );
return ret;
static const char * const pulldown_names[] = { "none", "22", "32", "64", "double", "triple", "euro", 0 };
static const char * const log_level_names[] = { "none", "error", "warning", "info", "debug", 0 };
-static const char * const output_csp_names[] = { "i420", "i444", "rgb", 0 };
+static const char * const output_csp_names[] = { "i420", "i422", "i444", "rgb", 0 };
typedef struct
{
int csp = info->csp & X264_CSP_MASK;
if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
param->i_csp = X264_CSP_I420;
+ else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) )
+ param->i_csp = X264_CSP_I422;
else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
param->i_csp = X264_CSP_I444;
else if( output_csp == X264_CSP_RGB && (csp < X264_CSP_BGR || csp > X264_CSP_RGB) )
case OPT_OUTPUT_CSP:
FAIL_IF_ERROR( parse_enum_value( optarg, output_csp_names, &output_csp ), "Unknown output csp `%s'\n", optarg )
// correct the parsed value to the libx264 csp value
- output_csp = !output_csp ? X264_CSP_I420 : (output_csp == 1 ? X264_CSP_I444 : X264_CSP_RGB);
+ static const uint8_t output_csp_fix[] = { X264_CSP_I420, X264_CSP_I422, X264_CSP_I444, X264_CSP_RGB };
+ param->i_csp = output_csp = output_csp_fix[output_csp];
break;
default:
generic_option:
#include "x264_config.h"
-#define X264_BUILD 117
+#define X264_BUILD 118
/* x264_t:
* opaque handler for encoder */
#define X264_CSP_I420 0x0001 /* yuv 4:2:0 planar */
#define X264_CSP_YV12 0x0002 /* yvu 4:2:0 planar */
#define X264_CSP_NV12 0x0003 /* yuv 4:2:0, with one y plane and one packed u+v */
-#define X264_CSP_I444 0x0004 /* yuv 4:4:4 planar */
-#define X264_CSP_YV24 0x0005 /* yvu 4:4:4 planar */
-#define X264_CSP_BGR 0x0006 /* packed bgr 24bits */
-#define X264_CSP_BGRA 0x0007 /* packed bgr 32bits */
-#define X264_CSP_RGB 0x0008 /* packed rgb 24bits */
-#define X264_CSP_MAX 0x0009 /* end of list */
+#define X264_CSP_I422 0x0004 /* yuv 4:2:2 planar */
+#define X264_CSP_YV16 0x0005 /* yvu 4:2:2 planar */
+#define X264_CSP_NV16 0x0006 /* yuv 4:2:2, with one y plane and one packed u+v */
+#define X264_CSP_I444 0x0007 /* yuv 4:4:4 planar */
+#define X264_CSP_YV24 0x0008 /* yvu 4:4:4 planar */
+#define X264_CSP_BGR 0x0009 /* packed bgr 24bits */
+#define X264_CSP_BGRA 0x000a /* packed bgr 32bits */
+#define X264_CSP_RGB 0x000b /* packed rgb 24bits */
+#define X264_CSP_MAX 0x000c /* end of list */
#define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */
#define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */
/* Video Properties */
int i_width;
int i_height;
- int i_csp; /* CSP of encoded bitstream, only i420 supported */
+ int i_csp; /* CSP of encoded bitstream */
int i_level_idc;
int i_frame_total; /* number of frames to encode if known, else 0 */
/* x264_param_apply_profile:
* Applies the restrictions of the given profile.
* Currently available profiles are, from most to least restrictive: */
-static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", 0 };
+static const char * const x264_profile_names[] = { "baseline", "main", "high", "high10", "high422", "high444", 0 };
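
Note: with the two new entries, the added profiles can be requested by name through x264_param_apply_profile. A hypothetical usage sketch (error handling elided):

    x264_param_t param;
    x264_param_default_preset( &param, "medium", NULL );
    param.i_csp = X264_CSP_I422;              /* 4:2:2 input */
    if( x264_param_apply_profile( &param, "high422" ) < 0 )
        ; /* settings incompatible with the requested profile */
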
/* (can be NULL, in which case the function will do nothing)
*