Assembly based on code by Henrik Gramner and Loren Merritt.
};
int csp = i_csp & X264_CSP_MASK;
- if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX )
+ if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX || csp == X264_CSP_V210 )
return -1;
x264_picture_init( pic );
pic->img.i_csp = i_csp;
case X264_CSP_NV16:
case X264_CSP_I422:
case X264_CSP_YV16:
+ case X264_CSP_V210:
return X264_CSP_NV16;
case X264_CSP_I444:
case X264_CSP_YV24:
}
#endif
+ if( BIT_DEPTH != 10 && i_csp == X264_CSP_V210 )
+ {
+ x264_log( h, X264_LOG_ERROR, "v210 input is only compatible with bit-depth of 10 bits\n" );
+ return -1;
+ }
+
dst->i_type = src->i_type;
dst->i_qpplus1 = src->i_qpplus1;
dst->i_pts = dst->i_reordered_pts = src->i_pts;
uint8_t *pix[3];
int stride[3];
- if ( i_csp >= X264_CSP_BGR )
+ if( i_csp == X264_CSP_V210 )
+ {
+ stride[0] = src->img.i_stride[0];
+ pix[0] = src->img.plane[0];
+
+ h->mc.plane_copy_deinterleave_v210( dst->plane[0], dst->i_stride[0],
+ dst->plane[1], dst->i_stride[1],
+ (uint32_t *)pix[0], stride[0]/sizeof(uint32_t), h->param.i_width, h->param.i_height );
+ }
+ else if( i_csp >= X264_CSP_BGR )
{
stride[0] = src->img.i_stride[0];
pix[0] = src->img.plane[0];
}
}
+void x264_plane_copy_deinterleave_v210_c( pixel *dsty, intptr_t i_dsty,
+ pixel *dstc, intptr_t i_dstc,
+ uint32_t *src, intptr_t i_src, int w, int h )
+{ /* C reference: unpack h rows of 32-bit words, each carrying three 10-bit
+   * fields (bits 0-9, 10-19 and 20-29), into a luma plane and a chroma plane.
+   *   dsty / i_dsty: luma output and its row stride, in pixels
+   *   dstc / i_dstc: chroma output and its row stride, in pixels; chroma
+   *                  samples are stored in the order they appear in src
+   *   src / i_src:   packed input and its row stride, in uint32_t units
+   *   w:             samples to produce per row in each output plane
+   *   h:             number of rows
+   * Each inner iteration consumes two words and emits 3 samples to each
+   * plane, so a w that is not a multiple of 3 is effectively rounded up and
+   * up to two extra samples are written per row in each plane. */
+ for( int l = 0; l < h; l++ )
+ {
+ pixel *dsty0 = dsty;
+ pixel *dstc0 = dstc;
+ uint32_t *src0 = src;
+
+ for( int n = 0; n < w; n += 3 )
+ {
+ *(dstc0++) = *src0 & 0x03FF; /* first word of the pair: chroma, luma, chroma */
+ *(dsty0++) = ( *src0 >> 10 ) & 0x03FF;
+ *(dstc0++) = ( *src0 >> 20 ) & 0x03FF;
+ src0++;
+ *(dsty0++) = *src0 & 0x03FF; /* second word of the pair: luma, chroma, luma */
+ *(dstc0++) = ( *src0 >> 10 ) & 0x03FF;
+ *(dsty0++) = ( *src0 >> 20 ) & 0x03FF;
+ src0++;
+ }
+
+ dsty += i_dsty; /* step all three pointers to the next row */
+ dstc += i_dstc;
+ src += i_src;
+ }
+}
+
static void store_interleave_chroma( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height )
{
for( int y=0; y<height; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
pf->plane_copy_interleave = x264_plane_copy_interleave_c;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_c;
+ pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_c;
pf->hpel_filter = hpel_filter;
pixel *src, intptr_t i_src, int w, int h );
void (*plane_copy_deinterleave_rgb)( pixel *dsta, intptr_t i_dsta, pixel *dstb, intptr_t i_dstb,
pixel *dstc, intptr_t i_dstc, pixel *src, intptr_t i_src, int pw, int w, int h );
+ void (*plane_copy_deinterleave_v210)( pixel *dsty, intptr_t i_dsty,
+ pixel *dstc, intptr_t i_dstc,
+ uint32_t *src, intptr_t i_src, int w, int h );
void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
intptr_t i_stride, int i_width, int i_height, int16_t *buf );
hpel_shuf: times 2 db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
deinterleave_shuf: times 2 db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
+v210_mask: times 4 dq 0xc00ffc003ff003ff ; per dword pair: bits 0-9 and 20-29 of the first dword, bits 10-19 and 30-31 of the second; pand with it gives the chroma bits, pandn the luma bits
+v210_luma_shuf: times 2 db 1,2,4,5,6,7,9,10,12,13,14,15,12,13,14,15 ; pshufb control applied to the pandn (luma) result
+v210_chroma_shuf: times 2 db 0,1,2,3,5,6,8,9,10,11,13,14,10,11,13,14 ; pshufb control applied to the pand (chroma) result
+; vpermd indices {0,1,2,4,5,7,_,_} merged in the 3 lsb of each dword to save a register
+v210_mult: dw 0x2000,0x7fff,0x0801,0x2000,0x7ffa,0x0800,0x7ffc,0x0800 ; pmulhrsw factors for the luma words (chroma uses a pshufd of this)
+ dw 0x1ffd,0x7fff,0x07ff,0x2000,0x7fff,0x0800,0x7fff,0x0800
+
%if HIGH_BIT_DEPTH
deinterleave_shuf32a: SHUFFLE_MASK_W 0,2,4,6,8,10,12,14
deinterleave_shuf32b: SHUFFLE_MASK_W 1,3,5,7,9,11,13,15
RET
%endmacro ; PLANE_DEINTERLEAVE
+%macro PLANE_DEINTERLEAVE_V210 0 ; SIMD version of x264_plane_copy_deinterleave_v210_c, instantiated once per instruction set
+;-----------------------------------------------------------------------------
+; void x264_plane_copy_deinterleave_v210( uint16_t *dsty, intptr_t i_dsty,
+; uint16_t *dstc, intptr_t i_dstc,
+; uint32_t *src, intptr_t i_src, int w, int h )
+;-----------------------------------------------------------------------------
+%if ARCH_X86_64
+cglobal plane_copy_deinterleave_v210, 8,10,7 ; x86-64: row-start src pointer, negated width and row counter are kept in r8, r9 and r7d
+%define src r8
+%define org_w r9
+%define h r7d
+%else
+cglobal plane_copy_deinterleave_v210, 7,7,7 ; x86-32: the same three values are kept in the r4m/r6m/r7m slots (presumably the stack argument slots - confirm against x86inc)
+%define src r4m
+%define org_w r6m
+%define h dword r7m
+%endif
+ FIX_STRIDES r1, r3, r6d ; NOTE(review): macro defined elsewhere; presumably converts the two pixel strides and w to byte units - confirm
+ shl r5, 2 ; i_src * 4: uint32_t units -> bytes
+ add r0, r6 ; point both destinations at the end of the row ...
+ add r2, r6
+ neg r6 ; ... and index them with a negative offset that counts up to zero
+ mov src, r4 ; remember the row start and the negative width for the next row
+ mov org_w, r6
+ mova m2, [v210_mask]
+ mova m3, [v210_luma_shuf]
+ mova m4, [v210_chroma_shuf]
+ mova m5, [v210_mult] ; also functions as vpermd index for avx2
+ pshufd m6, m5, q1102 ; chroma multipliers are a dword permutation of the luma ones
+
+ALIGN 16
+.loop:
+ movu m1, [r4] ; load mmsize bytes of packed source (unaligned)
+ pandn m0, m2, m1 ; m0 = src & ~mask -> luma bits
+ pand m1, m2 ; m1 = src & mask -> chroma bits
+ pshufb m0, m3 ; gather the bytes of each sample into its own 16-bit word
+ pshufb m1, m4
+ pmulhrsw m0, m5 ; y0 y1 y2 y3 y4 y5 __ __
+ pmulhrsw m1, m6 ; u0 v0 u1 v1 u2 v2 __ __
+%if mmsize == 32
+ vpermd m0, m5, m0 ; avx2: reorder dwords across the two 128-bit lanes using the indices stored in m5
+ vpermd m1, m5, m1
+%endif
+ movu [r0+r6], m0 ; full-vector stores: only the first 3/4 is valid output, the last quarter is scratch
+ movu [r2+r6], m1
+ add r4, mmsize ; source advances by mmsize bytes ...
+ add r6, 3*mmsize/4 ; ... each destination by 3/4 of that
+ jl .loop ; repeat while the offset is still negative (row not finished)
+ add r0, r1 ; next row: step both destinations by their strides
+ add r2, r3
+ add src, r5 ; step the saved row-start source pointer by the byte stride
+ mov r4, src
+ mov r6, org_w ; reload the negative width
+ dec h
+ jg .loop ; until all rows are done
+ RET
+%endmacro ; PLANE_DEINTERLEAVE_V210
+
%if HIGH_BIT_DEPTH
INIT_MMX mmx2
PLANE_INTERLEAVE
INIT_XMM sse2
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
+INIT_XMM ssse3
+PLANE_DEINTERLEAVE_V210
INIT_XMM avx
PLANE_INTERLEAVE
PLANE_DEINTERLEAVE
+PLANE_DEINTERLEAVE_V210
+INIT_YMM avx2
+PLANE_DEINTERLEAVE_V210
%else
INIT_MMX mmx2
PLANE_INTERLEAVE
void x264_plane_copy_deinterleave_avx( uint16_t *dstu, intptr_t i_dstu,
uint16_t *dstv, intptr_t i_dstv,
uint16_t *src, intptr_t i_src, int w, int h );
+/* Assembly v210 unpackers. Arguments match x264_plane_copy_deinterleave_v210_c:
+ * dsty receives the luma samples and dstc the interleaved chroma samples;
+ * i_src is the source row stride in uint32_t units. */
+void x264_plane_copy_deinterleave_v210_ssse3( uint16_t *dsty, intptr_t i_dsty,
+ uint16_t *dstc, intptr_t i_dstc,
+ uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx ( uint16_t *dsty, intptr_t i_dsty,
+ uint16_t *dstc, intptr_t i_dstc,
+ uint32_t *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_v210_avx2 ( uint16_t *dsty, intptr_t i_dsty,
+ uint16_t *dstc, intptr_t i_dstc,
+ uint32_t *src, intptr_t i_src, int w, int h );
void x264_store_interleave_chroma_mmx2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_sse2( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
return;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
+ pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_ssse3;
if( !(cpu&(X264_CPU_SLOW_SHUFFLE|X264_CPU_SLOW_ATOM|X264_CPU_SLOW_PALIGNR)) )
pf->integral_init4v = x264_integral_init4v_ssse3;
pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx;
pf->plane_copy_interleave = x264_plane_copy_interleave_avx;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx;
+ pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx;
pf->store_interleave_chroma = x264_store_interleave_chroma_avx;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_avx;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_xop;
if( cpu&X264_CPU_AVX2 )
+ {
pf->mc_luma = mc_luma_avx2;
+ pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
+ }
#else // !HIGH_BIT_DEPTH
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:0 support\n" );
return -1;
}
- else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_NV16 )
+ else if( CHROMA_FORMAT != CHROMA_422 && i_csp >= X264_CSP_I422 && i_csp <= X264_CSP_V210 )
{
x264_log( h, X264_LOG_ERROR, "not compiled with 4:2:2 support\n" );
return -1;
for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
{
- printf( "%s", x264_cli_csps[i].name );
- if( i+1 < X264_CSP_CLI_MAX )
- printf( ", " );
+ if( x264_cli_csps[i].name )
+ {
+ printf( "%s", x264_cli_csps[i].name );
+ if( i+1 < X264_CSP_CLI_MAX )
+ printf( ", " );
+ }
}
printf( "\n"
" - depth: 8 or 16 bits per pixel [keep current]\n"
if( strlen( str_csp ) == 0 )
csp = info->csp & X264_CSP_MASK;
else
- for( csp = X264_CSP_CLI_MAX-1; x264_cli_csps[csp].name && strcasecmp( x264_cli_csps[csp].name, str_csp ); )
- csp--;
+ for( csp = X264_CSP_CLI_MAX-1; csp > X264_CSP_NONE; csp-- )
+ {
+ if( x264_cli_csps[csp].name && !strcasecmp( x264_cli_csps[csp].name, str_csp ) )
+ break;
+ }
FAIL_IF_ERROR( csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", str_csp );
h->dst_csp = csp;
if( depth == 16 )
int x264_cli_csp_is_invalid( int csp )
{
int csp_mask = csp & X264_CSP_MASK;
- return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX || csp & X264_CSP_OTHER;
+ return csp_mask <= X264_CSP_NONE || csp_mask >= X264_CSP_CLI_MAX ||
+ csp_mask == X264_CSP_V210 || csp & X264_CSP_OTHER;
}
int x264_cli_csp_depth_factor( int csp )
FAIL_IF_ERROR( !info->width || !info->height, "raw input requires a resolution.\n" )
if( opt->colorspace )
{
- for( info->csp = X264_CSP_CLI_MAX-1; x264_cli_csps[info->csp].name && strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ); )
- info->csp--;
+ for( info->csp = X264_CSP_CLI_MAX-1; info->csp > X264_CSP_NONE; info->csp-- )
+ {
+ if( x264_cli_csps[info->csp].name && !strcasecmp( x264_cli_csps[info->csp].name, opt->colorspace ) )
+ break;
+ }
FAIL_IF_ERROR( info->csp == X264_CSP_NONE, "unsupported colorspace `%s'\n", opt->colorspace );
}
else /* default */
}
report( "plane_copy :" );
+ if( mc_a.plane_copy_deinterleave_v210 != mc_ref.plane_copy_deinterleave_v210 )
+ {
+ set_func_name( "plane_copy_deinterleave_v210" );
+ used_asm = 1;
+ for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
+ {
+ int w = (plane_specs[i].w + 1) >> 1;
+ int h = plane_specs[i].h;
+ intptr_t dst_stride = ALIGN( w, 16 );
+ intptr_t src_stride = (w + 47) / 48 * 128 / sizeof(uint32_t);
+ intptr_t offv = dst_stride*h + 32;
+ memset( pbuf3, 0, 0x1000 );
+ memset( pbuf4, 0, 0x1000 );
+ call_c( mc_c.plane_copy_deinterleave_v210, pbuf3, dst_stride, pbuf3+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
+ call_a( mc_a.plane_copy_deinterleave_v210, pbuf4, dst_stride, pbuf4+offv, dst_stride, (uint32_t *)buf1, src_stride, w, h );
+ for( int y = 0; y < h; y++ )
+ if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(uint16_t) ) ||
+ memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w*sizeof(uint16_t) ) )
+ {
+ ok = 0;
+ fprintf( stderr, "plane_copy_deinterleave_v210 FAILED: w=%d h=%d stride=%d\n", w, h, (int)src_stride );
+ break;
+ }
+ }
+ }
+ report( "v210 :" );
+
if( mc_a.hpel_filter != mc_ref.hpel_filter )
{
pixel *srchpel = pbuf1+8+2*64;
printf( INDENT );
for( int i = X264_CSP_NONE+1; i < X264_CSP_CLI_MAX; i++ )
{
- printf( "%s", x264_cli_csps[i].name );
- if( i+1 < X264_CSP_CLI_MAX )
- printf( ", " );
+ if( x264_cli_csps[i].name )
+ {
+ printf( "%s", x264_cli_csps[i].name );
+ if( i+1 < X264_CSP_CLI_MAX )
+ printf( ", " );
+ }
}
#if HAVE_LAVF
printf( "\n" );
int csp = info->csp & X264_CSP_MASK;
if( output_csp == X264_CSP_I420 && (csp < X264_CSP_I420 || csp > X264_CSP_NV12) )
param->i_csp = X264_CSP_I420;
- else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_NV16) )
+ else if( output_csp == X264_CSP_I422 && (csp < X264_CSP_I422 || csp > X264_CSP_V210) )
param->i_csp = X264_CSP_I422;
else if( output_csp == X264_CSP_I444 && (csp < X264_CSP_I444 || csp > X264_CSP_YV24) )
param->i_csp = X264_CSP_I444;
#include "x264_config.h"
-#define X264_BUILD 140
+#define X264_BUILD 141
/* Application developers planning to link against a shared library version of
* libx264 from a Microsoft Visual Studio or similar development environment
#define X264_CSP_I422 0x0004 /* yuv 4:2:2 planar */
#define X264_CSP_YV16 0x0005 /* yvu 4:2:2 planar */
#define X264_CSP_NV16 0x0006 /* yuv 4:2:2, with one y plane and one packed u+v */
-#define X264_CSP_I444 0x0007 /* yuv 4:4:4 planar */
-#define X264_CSP_YV24 0x0008 /* yvu 4:4:4 planar */
-#define X264_CSP_BGR 0x0009 /* packed bgr 24bits */
-#define X264_CSP_BGRA 0x000a /* packed bgr 32bits */
-#define X264_CSP_RGB 0x000b /* packed rgb 24bits */
-#define X264_CSP_MAX 0x000c /* end of list */
+#define X264_CSP_V210 0x0007 /* 10-bit yuv 4:2:2, three samples packed per 32-bit word */
+#define X264_CSP_I444 0x0008 /* yuv 4:4:4 planar */
+#define X264_CSP_YV24 0x0009 /* yvu 4:4:4 planar */
+#define X264_CSP_BGR 0x000a /* packed bgr 24bits */
+#define X264_CSP_BGRA 0x000b /* packed bgr 32bits */
+#define X264_CSP_RGB 0x000c /* packed rgb 24bits */
+#define X264_CSP_MAX 0x000d /* end of list */
#define X264_CSP_VFLIP 0x1000 /* the csp is vertically flipped */
#define X264_CSP_HIGH_DEPTH 0x2000 /* the csp has a depth of 16 bits per pixel component */