~1% faster overall on Conroe, mostly due to improved cache locality.
Also allows improved SIMD on some chroma functions (e.g. deblock).
This change also extends the API to allow direct NV12 input, which should be a bit faster than YV12.
This isn't currently used in x264cli, as swscale does not have fast NV12 conversion routines, but it might be useful for other applications.
Note that this patch disables the chroma SIMD code for PPC and ARM until new versions are written.
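
As an illustration of the extended API (not part of the patch itself), an application that already produces NV12 could feed it directly along these lines. The encoder handle, source buffers, dimensions and timestamp are assumed to exist, and 8-bit input is assumed:

    #include <stdint.h>
    #include <string.h>
    #include "x264.h"

    /* Sketch only: `h`, `src_y`, `src_uv`, `width`, `height` and `pts` come from the caller. */
    static int encode_nv12_frame( x264_t *h, const uint8_t *src_y, const uint8_t *src_uv,
                                  int width, int height, int64_t pts )
    {
        x264_picture_t pic, pic_out;
        x264_nal_t *nal;
        int i_nal;

        if( x264_picture_alloc( &pic, X264_CSP_NV12, width, height ) < 0 )
            return -1;
        /* With NV12, i_plane == 2: plane[0] is Y (stride = width),
         * plane[1] is the interleaved UV plane at half height (stride = width). */
        memcpy( pic.img.plane[0], src_y,  width * height );
        memcpy( pic.img.plane[1], src_uv, width * height / 2 );
        pic.i_pts = pts;

        int frame_size = x264_encoder_encode( h, &nal, &i_nal, &pic, &pic_out );
        x264_picture_clean( &pic );
        return frame_size; /* <0 on error, otherwise encoded size in bytes */
    }
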
pf->offsetsub = x264_mc_offsetsub_wtab_neon;
pf->weight_cache = x264_weight_cache_neon;
- pf->mc_chroma = x264_mc_chroma_neon;
+// pf->mc_chroma = x264_mc_chroma_neon;
pf->mc_luma = mc_luma_neon;
pf->get_ref = get_ref_neon;
pf->hpel_filter = hpel_filter_neon;
****************************************************************************/
int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height )
{
+ int csp = i_csp & X264_CSP_MASK;
+ if( csp <= X264_CSP_NONE || csp >= X264_CSP_MAX )
+ return -1;
x264_picture_init( pic );
pic->img.i_csp = i_csp;
- pic->img.i_plane = 3;
+ pic->img.i_plane = csp == X264_CSP_NV12 ? 2 : 3;
pic->img.plane[0] = x264_malloc( 3 * i_width * i_height / 2 );
if( !pic->img.plane[0] )
return -1;
pic->img.plane[1] = pic->img.plane[0] + i_width * i_height;
- pic->img.plane[2] = pic->img.plane[1] + i_width * i_height / 4;
+ if( csp != X264_CSP_NV12 )
+ pic->img.plane[2] = pic->img.plane[1] + i_width * i_height / 4;
pic->img.i_stride[0] = i_width;
- pic->img.i_stride[1] = i_width / 2;
- pic->img.i_stride[2] = i_width / 2;
+ if( csp == X264_CSP_NV12 )
+ pic->img.i_stride[1] = i_width;
+ else
+ {
+ pic->img.i_stride[1] = i_width / 2;
+ pic->img.i_stride[2] = i_width / 2;
+ }
return 0;
}
ALIGNED_16( uint32_t fenc_satd_cache[32] );
/* pointer over mb of the frame to be compressed */
- pixel *p_fenc[3];
+ pixel *p_fenc[3]; /* y,u,v */
/* pointer to the actual source frame, not a block copy */
- pixel *p_fenc_plane[3];
+ pixel *p_fenc_plane[2]; /* y,uv */
/* pointer over mb of the frame to be reconstructed */
pixel *p_fdec[3];
/* pointer over mb of the references */
int i_fref[2];
- pixel *p_fref[2][32][4+2]; /* last: lN, lH, lV, lHV, cU, cV */
+ pixel *p_fref[2][32][4+1]; /* last: yN, yH, yV, yHV, uv */
pixel *p_fref_w[32]; /* weighted fullpel luma */
uint16_t *p_integral[2][16];
/* Buffers that are allocated per-thread even in sliced threads. */
void *scratch_buffer; /* for any temporary storage that doesn't want repeated malloc */
- pixel *intra_border_backup[2][3]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
+ pixel *intra_border_backup[2][2]; /* bottom pixels of the previous mb row, used for intra prediction after the framebuffer has been deblocked */
uint8_t (*deblock_strength[2])[2][4][4];
/* CPU-dependent functions */
pix += 2*ystride;
continue;
}
- for( int d = 0; d < 2; d++ )
+ for( int d = 0; d < 2; d++, pix += ystride-2 )
+ for( int e = 0; e < 2; e++, pix++ )
{
int p1 = pix[-2*xstride];
int p0 = pix[-1*xstride];
pix[-1*xstride] = x264_clip_pixel( p0 + delta ); /* p0' */
pix[ 0*xstride] = x264_clip_pixel( q0 - delta ); /* q0' */
}
- pix += ystride;
}
}
}
static void deblock_v_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
- deblock_chroma_c( pix, stride, 1, alpha, beta, tc0 );
+ deblock_chroma_c( pix, stride, 2, alpha, beta, tc0 );
}
static void deblock_h_chroma_c( pixel *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
- deblock_chroma_c( pix, 1, stride, alpha, beta, tc0 );
+ deblock_chroma_c( pix, 2, stride, alpha, beta, tc0 );
}
static inline void deblock_luma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
deblock_luma_intra_c( pix, 1, stride, alpha, beta );
}
-static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta )
+static inline void deblock_chroma_intra_c( pixel *pix, int xstride, int ystride, int alpha, int beta, int dir )
{
- for( int d = 0; d < 8; d++ )
+ for( int d = 0; d < (dir?16:8); d++, pix += ystride-2 )
+ for( int e = 0; e < (dir?1:2); e++, pix++ )
{
int p1 = pix[-2*xstride];
int p0 = pix[-1*xstride];
pix[-1*xstride] = (2*p1 + p0 + q1 + 2) >> 2; /* p0' */
pix[ 0*xstride] = (2*q1 + q0 + p1 + 2) >> 2; /* q0' */
}
- pix += ystride;
}
}
static void deblock_v_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
- deblock_chroma_intra_c( pix, stride, 1, alpha, beta );
+ deblock_chroma_intra_c( pix, stride, 2, alpha, beta, 1 );
}
static void deblock_h_chroma_intra_c( pixel *pix, int stride, int alpha, int beta )
{
- deblock_chroma_intra_c( pix, 1, stride, alpha, beta );
+ deblock_chroma_intra_c( pix, 2, stride, alpha, beta, 0 );
}
static void deblock_strength_c( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
}
}
-static inline void deblock_edge( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
+static inline void deblock_edge( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_inter_t pf_inter )
{
int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
tc[2] = (tc0_table(index_a)[bS[2]] << (BIT_DEPTH-8)) + b_chroma;
tc[3] = (tc0_table(index_a)[bS[3]] << (BIT_DEPTH-8)) + b_chroma;
- pf_inter( pix1, i_stride, alpha, beta, tc );
- if( b_chroma )
- pf_inter( pix2, i_stride, alpha, beta, tc );
+ pf_inter( pix, i_stride, alpha, beta, tc );
}
-static inline void deblock_edge_intra( x264_t *h, pixel *pix1, pixel *pix2, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
+static inline void deblock_edge_intra( x264_t *h, pixel *pix, int i_stride, uint8_t bS[4], int i_qp, int b_chroma, x264_deblock_intra_t pf_intra )
{
int index_a = i_qp-QP_BD_OFFSET + h->sh.i_alpha_c0_offset;
int index_b = i_qp-QP_BD_OFFSET + h->sh.i_beta_offset;
if( !alpha || !beta )
return;
- pf_intra( pix1, i_stride, alpha, beta );
- if( b_chroma )
- pf_intra( pix2, i_stride, alpha, beta );
+ pf_intra( pix, i_stride, alpha, beta );
}
void x264_frame_deblock_row( x264_t *h, int mb_y )
uint8_t (*bs)[4][4] = h->deblock_strength[mb_y&b_interlaced][mb_x];
pixel *pixy = h->fdec->plane[0] + 16*mb_y*stridey + 16*mb_x;
- pixel *pixu = h->fdec->plane[1] + 8*mb_y*strideuv + 8*mb_x;
- pixel *pixv = h->fdec->plane[2] + 8*mb_y*strideuv + 8*mb_x;
+ pixel *pixuv = h->fdec->plane[1] + 8*mb_y*strideuv + 16*mb_x;
if( mb_y & b_interlaced )
{
pixy -= 15*stridey;
- pixu -= 7*strideuv;
- pixv -= 7*strideuv;
+ pixuv -= 7*strideuv;
}
int qp = h->mb.qp[mb_xy];
#define FILTER( intra, dir, edge, qp, chroma_qp )\
do\
{\
- deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1), NULL,\
+ deblock_edge##intra( h, pixy + 4*edge*(dir?stride2y:1),\
stride2y, bs[dir][edge], qp, 0,\
h->loopf.deblock_luma##intra[dir] );\
if( !(edge & 1) )\
- deblock_edge##intra( h, pixu + 2*edge*(dir?stride2uv:1), pixv + 2*edge*(dir?stride2uv:1),\
+ deblock_edge##intra( h, pixuv + 2*edge*(dir?stride2uv:2),\
stride2uv, bs[dir][edge], chroma_qp, 1,\
h->loopf.deblock_chroma##intra[dir] );\
} while(0)
}
#if HAVE_MMX
-void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
-void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
-void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
-
void x264_deblock_v_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_sse2( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_h_luma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_v_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_sse2( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_strength_mmxext( uint8_t nnz[X264_SCAN8_SIZE], int8_t ref[2][X264_SCAN8_LUMA_SIZE],
int16_t mv[2][X264_SCAN8_LUMA_SIZE][2], uint8_t bs[2][4][4],
int mvy_limit, int bframe );
#if ARCH_X86
void x264_deblock_h_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_v8_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_v_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
+void x264_deblock_h_chroma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 );
void x264_deblock_h_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
void x264_deblock_v8_luma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_v_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
+void x264_deblock_h_chroma_intra_mmxext( uint8_t *pix, int stride, int alpha, int beta );
+// FIXME this wrapper has a significant cpu cost
static void x264_deblock_v_luma_mmxext( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
{
x264_deblock_v8_luma_mmxext( pix, stride, alpha, beta, tc0 );
if( cpu&X264_CPU_MMXEXT )
{
#if !X264_HIGH_BIT_DEPTH
- pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
- pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
- pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
- pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext;
#if ARCH_X86
pf->deblock_luma[1] = x264_deblock_v_luma_mmxext;
pf->deblock_luma[0] = x264_deblock_h_luma_mmxext;
+ pf->deblock_chroma[1] = x264_deblock_v_chroma_mmxext;
+ pf->deblock_chroma[0] = x264_deblock_h_chroma_mmxext;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_mmxext;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_mmxext;
+ pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_mmxext;
+ pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_mmxext;
#endif
#endif // !X264_HIGH_BIT_DEPTH
pf->deblock_strength = x264_deblock_strength_mmxext;
{
pf->deblock_luma[1] = x264_deblock_v_luma_sse2;
pf->deblock_luma[0] = x264_deblock_h_luma_sse2;
+ pf->deblock_chroma[1] = x264_deblock_v_chroma_sse2;
+ pf->deblock_chroma[0] = x264_deblock_h_chroma_sse2;
pf->deblock_luma_intra[1] = x264_deblock_v_luma_intra_sse2;
pf->deblock_luma_intra[0] = x264_deblock_h_luma_intra_sse2;
+ pf->deblock_chroma_intra[1] = x264_deblock_v_chroma_intra_sse2;
+ pf->deblock_chroma_intra[0] = x264_deblock_h_chroma_intra_sse2;
}
#endif // !X264_HIGH_BIT_DEPTH
}
{
pf->deblock_luma[1] = x264_deblock_v_luma_neon;
pf->deblock_luma[0] = x264_deblock_h_luma_neon;
- pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
- pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
+// pf->deblock_chroma[1] = x264_deblock_v_chroma_neon;
+// pf->deblock_chroma[0] = x264_deblock_h_chroma_neon;
}
#endif
#endif // !X264_HIGH_BIT_DEPTH
i_stride = ALIGN( i_width + 2*PADH, align );
i_lines = h->mb.i_mb_height*16;
- frame->i_plane = 3;
- for( int i = 0; i < 3; i++ )
+ frame->i_plane = 2;
+ for( int i = 0; i < 2; i++ )
{
- frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
- frame->i_width[i] = i_width >> !!i;
- frame->i_lines[i] = i_lines >> !!i;
- }
-
- luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
- chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + 2*i_padv));
- for( int i = 1; i < 3; i++ )
- {
- CHECKED_MALLOC( frame->buffer[i], chroma_plane_size * sizeof(pixel) );
- frame->plane[i] = frame->buffer[i] + (frame->i_stride[i] * i_padv + PADH)/2;
+ frame->i_stride[i] = ALIGN( i_stride, align );
+ frame->i_width[i] = i_width >> i;
+ frame->i_lines[i] = i_lines >> i;
}
for( int i = 0; i < h->param.i_bframe + 2; i++ )
frame->orig = frame;
+ luma_plane_size = (frame->i_stride[0] * (frame->i_lines[0] + 2*i_padv));
+ chroma_plane_size = (frame->i_stride[1] * (frame->i_lines[1] + i_padv));
+
+ CHECKED_MALLOC( frame->buffer[1], chroma_plane_size * sizeof(pixel) );
+ frame->plane[1] = frame->buffer[1] + frame->i_stride[1] * i_padv/2 + PADH;
+
/* all 4 luma planes allocated together, since the cacheline split code
* requires them to be in-phase wrt cacheline alignment. */
if( h->param.analyse.i_subpel_refine && b_fdec )
x264_free( frame );
}
+static int get_plane_ptr( x264_t *h, x264_picture_t *src, uint8_t **pix, int *stride, int plane, int xshift, int yshift )
+{
+ int width = h->param.i_width >> xshift;
+ int height = h->param.i_height >> yshift;
+ *pix = src->img.plane[plane];
+ *stride = src->img.i_stride[plane];
+ if( src->img.i_csp & X264_CSP_VFLIP )
+ {
+ *pix += (height-1) * *stride;
+ *stride = -*stride;
+ }
+ if( width > abs(*stride) )
+ {
+ x264_log( h, X264_LOG_ERROR, "Input picture width (%d) is greater than stride (%d)\n", width, *stride );
+ return -1;
+ }
+ return 0;
+}
+
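+/* Shadow the function with a macro of the same name so that subsequent calls
+ * in this file propagate failure to the caller automatically. */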
+#define get_plane_ptr(...) do{ if( get_plane_ptr(__VA_ARGS__) < 0 ) return -1; }while(0)
+
int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
{
int i_csp = src->img.i_csp & X264_CSP_MASK;
dst->param = src->param;
dst->i_pic_struct = src->i_pic_struct;
- for( int i = 0; i < 3; i++ )
+ uint8_t *pix[3];
+ int stride[3];
+ get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
+ h->mc.plane_copy( dst->plane[0], dst->i_stride[0], pix[0], stride[0],
+ h->param.i_width, h->param.i_height );
+ if( i_csp == X264_CSP_NV12 )
{
- int s = (i_csp == X264_CSP_YV12 && i) ? i^3 : i;
- uint8_t *plane = src->img.plane[s];
- int stride = src->img.i_stride[s];
- int width = h->param.i_width >> !!i;
- int height = h->param.i_height >> !!i;
- if( src->img.i_csp & X264_CSP_VFLIP )
- {
- plane += (height-1)*stride;
- stride = -stride;
- }
- if( width > abs(stride) )
- {
- x264_log( h, X264_LOG_ERROR, "Input picture width is greater than stride\n" );
- return -1;
- }
- h->mc.plane_copy( dst->plane[i], dst->i_stride[i], plane, stride, width, height );
+ get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, 1 );
+ h->mc.plane_copy( dst->plane[1], dst->i_stride[1], pix[1], stride[1],
+ h->param.i_width, h->param.i_height>>1 );
+ }
+ else
+ {
+ get_plane_ptr( h, src, &pix[1], &stride[1], i_csp==X264_CSP_I420 ? 1 : 2, 1, 1 );
+ get_plane_ptr( h, src, &pix[2], &stride[2], i_csp==X264_CSP_I420 ? 2 : 1, 1, 1 );
+ h->mc.plane_copy_interleave( dst->plane[1], dst->i_stride[1],
+ pix[1], stride[1], pix[2], stride[2],
+ h->param.i_width>>1, h->param.i_height>>1 );
}
return 0;
}
-static void ALWAYS_INLINE pixel_memset( pixel *dst, int value, int size )
+static void ALWAYS_INLINE pixel_memset( pixel *dst, pixel *src, int len, int size )
{
- for( int i = 0; i < size; i++ )
- dst[i] = value;
+ uint8_t *dstp = (uint8_t*)dst;
+ if( size == 1 )
+ memset( dst, *src, len );
+ else if( size == 2 )
+ {
+ int v = M16( src );
+ for( int i = 0; i < len; i++ )
+ M16( dstp+i*2 ) = v;
+ }
+ else if( size == 4 )
+ {
+ int v = M32( src );
+ for( int i = 0; i < len; i++ )
+ M32( dstp+i*4 ) = v;
+ }
}
-static void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom )
+static void plane_expand_border( pixel *pix, int i_stride, int i_width, int i_height, int i_padh, int i_padv, int b_pad_top, int b_pad_bottom, int b_chroma )
{
#define PPIXEL(x, y) ( pix + (x) + (y)*i_stride )
for( int y = 0; y < i_height; y++ )
{
/* left band */
- pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y)[0], i_padh );
+ pixel_memset( PPIXEL(-i_padh, y), PPIXEL(0, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
/* right band */
- pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1, y)[0], i_padh );
+ pixel_memset( PPIXEL(i_width, y), PPIXEL(i_width-1-b_chroma, y), i_padh>>b_chroma, sizeof(pixel)<<b_chroma );
}
/* upper band */
if( b_pad_top )
for( int i = 0; i < frame->i_plane; i++ )
{
int stride = frame->i_stride[i];
- int width = 16*h->mb.i_mb_width >> !!i;
+ int width = 16*h->sps->i_mb_width;
int height = (b_end ? 16*(h->mb.i_mb_height - mb_y) >> h->sh.b_mbaff : 16) >> !!i;
- int padh = PADH >> !!i;
+ int padh = PADH;
int padv = PADV >> !!i;
// buffer: 2 chroma, 3 luma (rounded to 4) because deblocking goes beyond the top of the mb
pixel *pix = frame->plane[i] + X264_MAX(0, (16*mb_y-4)*stride >> !!i);
height += 4 >> (!!i + h->sh.b_mbaff);
if( h->sh.b_mbaff )
{
- plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
- plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
+ plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, i );
+ plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, i );
}
else
{
- plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
+ plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, i );
}
}
}
pixel *pix = frame->filtered[i] + (16*mb_y - (8 << h->sh.b_mbaff)) * stride - 4;
if( h->sh.b_mbaff )
{
- plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end );
- plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end );
+ plane_expand_border( pix, stride*2, width, height, padh, padv, b_start, b_end, 0 );
+ plane_expand_border( pix+stride, stride*2, width, height, padh, padv, b_start, b_end, 0 );
}
else
- plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end );
+ plane_expand_border( pix, stride, width, height, padh, padv, b_start, b_end, 0 );
}
}
void x264_frame_expand_border_lowres( x264_frame_t *frame )
{
for( int i = 0; i < 4; i++ )
- plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1 );
+ plane_expand_border( frame->lowres[i], frame->i_stride_lowres, frame->i_width_lowres, frame->i_lines_lowres, PADH, PADV, 1, 1, 0 );
}
void x264_frame_expand_border_mod16( x264_t *h, x264_frame_t *frame )
{
for( int i = 0; i < frame->i_plane; i++ )
{
- int i_subsample = i ? 1 : 0;
- int i_width = h->param.i_width >> i_subsample;
- int i_height = h->param.i_height >> i_subsample;
- int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width) >> i_subsample;
- int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> i_subsample;
+ int i_width = h->param.i_width;
+ int i_height = h->param.i_height >> !!i;
+ int i_padx = (h->mb.i_mb_width * 16 - h->param.i_width);
+ int i_pady = (h->mb.i_mb_height * 16 - h->param.i_height) >> !!i;
if( i_padx )
{
for( int y = 0; y < i_height; y++ )
- {
- pixel value = frame->plane[i][y*frame->i_stride[i] + i_width - 1];
- pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width], value, i_padx );
- }
+ pixel_memset( &frame->plane[i][y*frame->i_stride[i] + i_width],
+ &frame->plane[i][y*frame->i_stride[i] + i_width - 1-i],
+ i_padx>>i, sizeof(pixel)<<i );
}
if( i_pady )
{
/* YUV buffer */
int i_plane;
- int i_stride[3];
- int i_width[3];
- int i_lines[3];
+ int i_stride[2];
+ int i_width[2];
+ int i_lines[2];
int i_stride_lowres;
int i_width_lowres;
int i_lines_lowres;
- pixel *plane[3];
+ pixel *plane[2];
pixel *filtered[4]; /* plane[0], H, V, HV */
pixel *lowres[4]; /* half-size copy of input frame: Orig, H, V, HV */
uint16_t *integral;
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
+ &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mb.pic.p_fref[0][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
h->sh.weight[i_ref][1].weightfn[width>>1]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->sh.weight[i_ref][1], height*2 );
-
- h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- h->mb.pic.p_fref[0][i_ref][5], h->mb.pic.i_stride[2],
- mvx, mvy, 2*width, 2*height );
-
if( h->sh.weight[i_ref][2].weightfn )
h->sh.weight[i_ref][2].weightfn[width>>1]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
&h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
if( h->mb.b_interlaced & i_ref )
mvy += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
+ h->mc.mc_chroma( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x],
+ &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
h->mb.pic.p_fref[1][i_ref][4], h->mb.pic.i_stride[1],
mvx, mvy, 2*width, 2*height );
-
- h->mc.mc_chroma( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE,
- h->mb.pic.p_fref[1][i_ref][5], h->mb.pic.i_stride[2],
- mvx, mvy, 2*width, 2*height );
}
static NOINLINE void x264_mb_mc_01xywh( x264_t *h, int x, int y, int width, int height )
if( h->mb.b_interlaced & i_ref1 )
mvy1 += (h->mb.i_mb_y & 1)*4 - 2;
- h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
+ h->mc.mc_chroma( tmp0, tmp0+8, 16, h->mb.pic.p_fref[0][i_ref0][4], h->mb.pic.i_stride[1],
mvx0, mvy0, 2*width, 2*height );
- h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
+ h->mc.mc_chroma( tmp1, tmp1+8, 16, h->mb.pic.p_fref[1][i_ref1][4], h->mb.pic.i_stride[1],
mvx1, mvy1, 2*width, 2*height );
h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[1][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
- h->mc.mc_chroma( tmp0, 16, h->mb.pic.p_fref[0][i_ref0][5], h->mb.pic.i_stride[2],
- mvx0, mvy0, 2*width, 2*height );
- h->mc.mc_chroma( tmp1, 16, h->mb.pic.p_fref[1][i_ref1][5], h->mb.pic.i_stride[2],
- mvx1, mvy1, 2*width, 2*height );
- h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0, 16, tmp1, 16, weight );
+ h->mc.avg[i_mode+3]( &h->mb.pic.p_fdec[2][2*y*FDEC_STRIDE+2*x], FDEC_STRIDE, tmp0+8, 16, tmp1+8, 16, weight );
}
void x264_mb_mc_8x8( x264_t *h, int i8 )
if( !b_lookahead )
for( int i = 0; i <= h->param.b_interlaced; i++ )
{
- for( int j = 0; j < 3; j++ )
+ for( int j = 0; j < 2; j++ )
{
/* shouldn't really be initialized, just silences a valgrind false-positive in predict_8x8_filter_mmx */
- CHECKED_MALLOCZERO( h->intra_border_backup[i][j], ((h->mb.i_mb_width*16+32)>>!!j) * sizeof(pixel) );
- h->intra_border_backup[i][j] += 8;
+ CHECKED_MALLOCZERO( h->intra_border_backup[i][j], (h->sps->i_mb_width*16+32) * sizeof(pixel) );
+ h->intra_border_backup[i][j] += 16;
}
CHECKED_MALLOC( h->deblock_strength[i], sizeof(**h->deblock_strength) * h->mb.i_mb_width );
}
for( int i = 0; i <= h->param.b_interlaced; i++ )
{
x264_free( h->deblock_strength[i] );
- for( int j = 0; j < 3; j++ )
- x264_free( h->intra_border_backup[i][j] - 8 );
+ for( int j = 0; j < 2; j++ )
+ x264_free( h->intra_border_backup[i][j] - 16 );
}
x264_free( h->scratch_buffer );
}
{
int stride_y = fenc->i_stride[0];
int stride_uv = fenc->i_stride[1];
- int off_y = 16 * (i_mb_x + i_mb_y * stride_y);
- int off_uv = 8 * (i_mb_x + i_mb_y * stride_uv);
+ int off_y = 16 * i_mb_x + 16 * i_mb_y * stride_y;
+ int off_uv = 16 * i_mb_x + 8 * i_mb_y * stride_uv;
h->mc.prefetch_fenc( fenc->plane[0]+off_y, stride_y,
- fenc->plane[1+(i_mb_x&1)]+off_uv, stride_uv, i_mb_x );
+ fenc->plane[1]+off_uv, stride_uv, i_mb_x );
}
-static NOINLINE void copy_column8( pixel *dst, pixel *src )
+NOINLINE void x264_copy_column8( pixel *dst, pixel *src )
{
// input pointers are offset by 4 rows because that's faster (smaller instruction size on x86)
for( int i = -4; i < 4; i++ )
static void ALWAYS_INLINE x264_macroblock_load_pic_pointers( x264_t *h, int mb_x, int mb_y, int i, int b_interlaced )
{
- const int w = (i == 0 ? 16 : 8);
- const int i_stride = h->fdec->i_stride[!!i];
- const int i_stride2 = i_stride << b_interlaced;
- const int i_pix_offset = b_interlaced
- ? w * (mb_x + (mb_y&~1) * i_stride) + (mb_y&1) * i_stride
- : w * (mb_x + mb_y * i_stride);
- const pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
- const pixel *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16>>!!i];
+ int w = (i ? 8 : 16);
+ int i_stride = h->fdec->i_stride[i];
+ int i_stride2 = i_stride << b_interlaced;
+ int i_pix_offset = b_interlaced
+ ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
+ : 16 * mb_x + w * mb_y * i_stride;
+ pixel *plane_fdec = &h->fdec->plane[i][i_pix_offset];
+ pixel *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16];
int ref_pix_offset[2] = { i_pix_offset, i_pix_offset };
x264_frame_t **fref[2] = { h->fref0, h->fref1 };
if( b_interlaced )
ref_pix_offset[1] += (1-2*(mb_y&1)) * i_stride;
h->mb.pic.i_stride[i] = i_stride2;
h->mb.pic.p_fenc_plane[i] = &h->fenc->plane[i][i_pix_offset];
- h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( h->mb.pic.p_fenc[i], FENC_STRIDE,
- h->mb.pic.p_fenc_plane[i], i_stride2, w );
- memcpy( &h->mb.pic.p_fdec[i][-1-FDEC_STRIDE], intra_fdec-1, (w*3/2+1) * sizeof(pixel) );
+ if( i )
+ {
+ h->mc.load_deinterleave_8x8x2_fenc( h->mb.pic.p_fenc[1], h->mb.pic.p_fenc_plane[1], i_stride2 );
+ memcpy( h->mb.pic.p_fdec[1]-FDEC_STRIDE, intra_fdec, 8*sizeof(pixel) );
+ memcpy( h->mb.pic.p_fdec[2]-FDEC_STRIDE, intra_fdec+8, 8*sizeof(pixel) );
+ }
+ else
+ {
+ h->mc.copy[PIXEL_16x16]( h->mb.pic.p_fenc[0], FENC_STRIDE, h->mb.pic.p_fenc_plane[0], i_stride2, 16 );
+ memcpy( h->mb.pic.p_fdec[0]-FDEC_STRIDE, intra_fdec, 24*sizeof(pixel) );
+ }
if( b_interlaced )
+ {
for( int j = 0; j < w; j++ )
- h->mb.pic.p_fdec[i][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
+ if( i )
+ {
+ h->mb.pic.p_fdec[1][-1+j*FDEC_STRIDE] = plane_fdec[-2+j*i_stride2];
+ h->mb.pic.p_fdec[2][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
+ }
+ else
+ h->mb.pic.p_fdec[0][-1+j*FDEC_STRIDE] = plane_fdec[-1+j*i_stride2];
+ }
for( int j = 0; j < h->mb.pic.i_fref[0]; j++ )
{
- h->mb.pic.p_fref[0][j][i==0 ? 0:i+3] = &fref[0][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]];
- if( i == 0 )
+ h->mb.pic.p_fref[0][j][i?4:0] = &fref[0][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]];
+ if( !i )
{
for( int k = 1; k < 4; k++ )
h->mb.pic.p_fref[0][j][k] = &fref[0][j >> b_interlaced]->filtered[k][ref_pix_offset[j&1]];
if( h->sh.i_type == SLICE_TYPE_B )
for( int j = 0; j < h->mb.pic.i_fref[1]; j++ )
{
- h->mb.pic.p_fref[1][j][i==0 ? 0:i+3] = &fref[1][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]];
- if( i == 0 )
+ h->mb.pic.p_fref[1][j][i?4:0] = &fref[1][j >> b_interlaced]->plane[i][ref_pix_offset[j&1]];
+ if( !i )
for( int k = 1; k < 4; k++ )
h->mb.pic.p_fref[1][j][k] = &fref[1][j >> b_interlaced]->filtered[k][ref_pix_offset[j&1]];
}
if( !h->mb.b_interlaced )
{
- copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
- copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
- copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
- copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
+ x264_copy_column8( h->mb.pic.p_fdec[0]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+ 4*FDEC_STRIDE );
+ x264_copy_column8( h->mb.pic.p_fdec[0]-1+12*FDEC_STRIDE, h->mb.pic.p_fdec[0]+15+12*FDEC_STRIDE );
+ x264_copy_column8( h->mb.pic.p_fdec[1]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+ 7+ 4*FDEC_STRIDE );
+ x264_copy_column8( h->mb.pic.p_fdec[2]-1+ 4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+ 7+ 4*FDEC_STRIDE );
x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 0 );
x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 0 );
- x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 0 );
}
else
{
x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 0, 1 );
x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 1, 1 );
- x264_macroblock_load_pic_pointers( h, mb_x, mb_y, 2, 1 );
}
if( h->fdec->integral )
}
}
-static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int i )
+static void ALWAYS_INLINE twiddle_topleft_pixel( pixel *dst, pixel *src, int b_interlaced )
+{
+ // We update intra_border_backup in-place, so the topleft neighbor will no longer
+ // exist there when load_pic_pointers wants it. Move it within p_fdec instead.
+ if( b_interlaced )
+ {
+ dst[0] = dst[-1];
+ dst[-1] = src[0];
+ }
+ else
+ dst[0] = src[0];
+}
+
+static void ALWAYS_INLINE x264_macroblock_store_pic( x264_t *h, int mb_x, int mb_y, int i, int b_interlaced )
{
int w = i ? 8 : 16;
- int i_stride = h->fdec->i_stride[!!i];
- int i_stride2 = i_stride << h->mb.b_interlaced;
- int i_pix_offset = h->mb.b_interlaced
- ? w * (h->mb.i_mb_x + (h->mb.i_mb_y&~1) * i_stride) + (h->mb.i_mb_y&1) * i_stride
- : w * (h->mb.i_mb_x + h->mb.i_mb_y * i_stride);
- h->mc.copy[i?PIXEL_8x8:PIXEL_16x16]( &h->fdec->plane[i][i_pix_offset], i_stride2,
- h->mb.pic.p_fdec[i], FDEC_STRIDE, w );
+ int i_stride = h->fdec->i_stride[i];
+ int i_stride2 = i_stride << b_interlaced;
+ int i_pix_offset = b_interlaced
+ ? 16 * mb_x + w * (mb_y&~1) * i_stride + (mb_y&1) * i_stride
+ : 16 * mb_x + w * mb_y * i_stride;
+ pixel *intra_fdec = &h->intra_border_backup[mb_y & h->sh.b_mbaff][i][mb_x*16];
+ if( i )
+ {
+ h->mc.store_interleave_8x8x2( &h->fdec->plane[1][i_pix_offset], i_stride2, h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2] );
+ memcpy( intra_fdec, h->mb.pic.p_fdec[1]+FDEC_STRIDE*7, 8*sizeof(pixel) );
+ memcpy( intra_fdec+8, h->mb.pic.p_fdec[2]+FDEC_STRIDE*7, 8*sizeof(pixel) );
+ twiddle_topleft_pixel( h->mb.pic.p_fdec[1]-FDEC_STRIDE-1, h->mb.pic.p_fdec[1]-FDEC_STRIDE+7, b_interlaced );
+ twiddle_topleft_pixel( h->mb.pic.p_fdec[2]-FDEC_STRIDE-1, h->mb.pic.p_fdec[2]-FDEC_STRIDE+7, b_interlaced );
+ }
+ else
+ {
+ h->mc.copy[PIXEL_16x16]( &h->fdec->plane[0][i_pix_offset], i_stride2, h->mb.pic.p_fdec[0], FDEC_STRIDE, 16 );
+ memcpy( intra_fdec, h->mb.pic.p_fdec[0]+FDEC_STRIDE*15, 16*sizeof(pixel) );
+ twiddle_topleft_pixel( h->mb.pic.p_fdec[0]-FDEC_STRIDE-1, h->mb.pic.p_fdec[0]-FDEC_STRIDE+15, b_interlaced );
+ }
}
void x264_macroblock_cache_save( x264_t *h )
int8_t *i4x4 = h->mb.intra4x4_pred_mode[i_mb_xy];
uint8_t *nnz = h->mb.non_zero_count[i_mb_xy];
- x264_macroblock_store_pic( h, 0 );
- x264_macroblock_store_pic( h, 1 );
- x264_macroblock_store_pic( h, 2 );
+ if( h->mb.b_interlaced )
+ {
+ x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 1 );
+ x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 1 );
+ }
+ else
+ {
+ x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 0, 0 );
+ x264_macroblock_store_pic( h, h->mb.i_mb_x, h->mb.i_mb_y, 1, 0 );
+ }
x264_prefetch_fenc( h, h->fdec, h->mb.i_mb_x, h->mb.i_mb_y );
void x264_prefetch_fenc( x264_t *h, x264_frame_t *fenc, int i_mb_x, int i_mb_y );
+void x264_copy_column8( pixel *dst, pixel *src );
+
/* x264_mb_predict_mv_16x16:
* set mvp with predicted mv for D_16x16 block
* h->mb. need only valid values from other blocks */
}
/* full chroma mc (i.e. down to 1/8-pel precision) */
-static void mc_chroma( pixel *dst, int i_dst_stride,
+static void mc_chroma( pixel *dstu, pixel *dstv, int i_dst_stride,
pixel *src, int i_src_stride,
int mvx, int mvy,
int i_width, int i_height )
int cC = (8-d8x)*d8y;
int cD = d8x *d8y;
- src += (mvy >> 3) * i_src_stride + (mvx >> 3);
+ src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
srcp = &src[i_src_stride];
for( int y = 0; y < i_height; y++ )
{
for( int x = 0; x < i_width; x++ )
- dst[x] = ( cA*src[x] + cB*src[x+1] + cC*srcp[x] + cD*srcp[x+1] + 32 ) >> 6;
- dst += i_dst_stride;
+ {
+ dstu[x] = ( cA*src[2*x] + cB*src[2*x+2] +
+ cC*srcp[2*x] + cD*srcp[2*x+2] + 32 ) >> 6;
+ dstv[x] = ( cA*src[2*x+1] + cB*src[2*x+3] +
+ cC*srcp[2*x+1] + cD*srcp[2*x+3] + 32 ) >> 6;
+ }
+ dstu += i_dst_stride;
+ dstv += i_dst_stride;
src = srcp;
srcp += i_src_stride;
}
MC_COPY( 4 )
void x264_plane_copy_c( pixel *dst, int i_dst,
- uint8_t *src, int i_src, int w, int h)
+ uint8_t *src, int i_src, int w, int h )
{
while( h-- )
{
}
}
+void x264_plane_copy_interleave_c( pixel *dst, int i_dst,
+ uint8_t *srcu, int i_srcu,
+ uint8_t *srcv, int i_srcv, int w, int h )
+{
+ for( int y=0; y<h; y++, dst+=i_dst, srcu+=i_srcu, srcv+=i_srcv )
+ for( int x=0; x<w; x++ )
+ {
+ dst[2*x] = srcu[x] << (BIT_DEPTH-8);
+ dst[2*x+1] = srcv[x] << (BIT_DEPTH-8);
+ }
+}
+
+void x264_plane_copy_deinterleave_c( pixel *dstu, int i_dstu,
+ pixel *dstv, int i_dstv,
+ pixel *src, int i_src, int w, int h )
+{
+ for( int y=0; y<h; y++, dstu+=i_dstu, dstv+=i_dstv, src+=i_src )
+ for( int x=0; x<w; x++ )
+ {
+ dstu[x] = src[2*x];
+ dstv[x] = src[2*x+1];
+ }
+}
+
+static void store_interleave_8x8x2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv )
+{
+ for( int y=0; y<8; y++, dst+=i_dst, srcu+=FDEC_STRIDE, srcv+=FDEC_STRIDE )
+ for( int x=0; x<8; x++ )
+ {
+ dst[2*x] = srcu[x];
+ dst[2*x+1] = srcv[x];
+ }
+}
+
+static void load_deinterleave_8x8x2_fenc( pixel *dst, pixel *src, int i_src )
+{
+ x264_plane_copy_deinterleave_c( dst, FENC_STRIDE, dst+FENC_STRIDE/2, FENC_STRIDE, src, i_src, 8, 8 );
+}
+
+static void load_deinterleave_8x8x2_fdec( pixel *dst, pixel *src, int i_src )
+{
+ x264_plane_copy_deinterleave_c( dst, FDEC_STRIDE, dst+FDEC_STRIDE/2, FDEC_STRIDE, src, i_src, 8, 8 );
+}
+
static void prefetch_fenc_null( pixel *pix_y, int stride_y,
pixel *pix_uv, int stride_uv, int mb_x )
{}
pf->copy[PIXEL_8x8] = mc_copy_w8;
pf->copy[PIXEL_4x4] = mc_copy_w4;
+ pf->store_interleave_8x8x2 = store_interleave_8x8x2;
+ pf->load_deinterleave_8x8x2_fenc = load_deinterleave_8x8x2_fenc;
+ pf->load_deinterleave_8x8x2_fdec = load_deinterleave_8x8x2_fdec;
+
pf->plane_copy = x264_plane_copy_c;
+ pf->plane_copy_interleave = x264_plane_copy_interleave_c;
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_c;
+
pf->hpel_filter = hpel_filter;
pf->prefetch_fenc = prefetch_fenc_null;
/* mc_chroma may write up to 2 bytes of garbage to the right of dst,
* so it must be run from left to right. */
- void (*mc_chroma)(pixel *dst, int i_dst, pixel *src, int i_src,
+ void (*mc_chroma)(pixel *dstu, pixel *dstv, int i_dst, pixel *src, int i_src,
int mvx, int mvy,
int i_width, int i_height );
void (*copy[7])( pixel *dst, int, pixel *src, int, int i_height );
void (*copy_16x16_unaligned)( pixel *dst, int, pixel *src, int, int i_height );
+ void (*store_interleave_8x8x2)( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
+ void (*load_deinterleave_8x8x2_fenc)( pixel *dst, pixel *src, int i_src );
+ void (*load_deinterleave_8x8x2_fdec)( pixel *dst, pixel *src, int i_src );
+
void (*plane_copy)( pixel *dst, int i_dst,
- uint8_t *src, int i_src, int w, int h);
+ uint8_t *src, int i_src, int w, int h );
+ void (*plane_copy_interleave)( pixel *dst, int i_dst,
+ uint8_t *srcu, int i_srcu,
+ uint8_t *srcv, int i_srcv, int w, int h );
+ void (*plane_copy_deinterleave)( pixel *dstu, int i_dstu,
+ pixel *dstv, int i_dstv,
+ pixel *src, int i_src, int w, int h );
void (*hpel_filter)( pixel *dsth, pixel *dstv, pixel *dstc, pixel *src,
int i_stride, int i_width, int i_height, dctcoef *buf );
PIXEL_SSD_C( x264_pixel_ssd_4x8, 4, 8 )
PIXEL_SSD_C( x264_pixel_ssd_4x4, 4, 4 )
-int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height )
+uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height )
{
- int64_t i_ssd = 0;
+ uint64_t i_ssd = 0;
int y;
int align = !(((intptr_t)pix1 | (intptr_t)pix2 | i_pix1 | i_pix2) & 15);
return i_ssd;
}
+static uint64_t pixel_ssd_nv12_core( pixel *pixuv1, int stride1, pixel *pixuv2, int stride2, int width, int height )
+{
+ uint32_t ssd_u=0, ssd_v=0;
+ for( int y = 0; y < height; y++, pixuv1+=stride1, pixuv2+=stride2 )
+ for( int x = 0; x < width; x++ )
+ {
+ int du = pixuv1[2*x] - pixuv2[2*x];
+ int dv = pixuv1[2*x+1] - pixuv2[2*x+1];
+ ssd_u += du*du;
+ ssd_v += dv*dv;
+ }
+ return ssd_u + ((uint64_t)ssd_v<<32);
+}
+
+// SSD in uint32 (i.e. packing two into uint64) can potentially overflow on
+// image widths >= 11008 (or 6604 if interlaced), since this is called on blocks
+// of height up to 12 (resp 20). Though it will probably take significantly more
+// than that at sane distortion levels.
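+// (Worked numbers for the bound above: the worst case per sample is 255^2 = 65025,
+// and 65025 * 12 rows * 5504 UV pairs per row =~ 2^32; an 11008-pixel-wide frame
+// has 5504 UV pairs per chroma row. The interlaced bound follows the same way
+// from 20 rows and 6604/2 = 3302 pairs.)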
+uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height )
+{
+ uint64_t ssd = pf->ssd_nv12_core( pix1, i_pix1, pix2, i_pix2, i_width&~7, i_height );
+ if( i_width&7 )
+ ssd += pixel_ssd_nv12_core( pix1+(i_width&~7), i_pix1, pix2+(i_width&~7), i_pix2, i_width&7, i_height );
+ return ssd;
+}
/****************************************************************************
* pixel_var_wxh
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8;
+ pixf->ssd_nv12_core = pixel_ssd_nv12_core;
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
pixf->var2_8x8 = pixel_var2_8x8;
INIT_ADS( _mmxext );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_mmxext;
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_mmxext;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_mmxext;
#if ARCH_X86
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_mmxext;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_mmxext;
INIT5( ssd, _sse2slow );
INIT2_NAME( sad_aligned, sad, _sse2_aligned );
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_sse2;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_sse2;
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
uint64_t (*var[4])( pixel *pix, int stride );
uint64_t (*hadamard_ac[4])( pixel *pix, int stride );
+ uint64_t (*ssd_nv12_core)( pixel *pixuv1, int stride1,
+ pixel *pixuv2, int stride2, int width, int height );
void (*ssim_4x4x2_core)( const pixel *pix1, int stride1,
const pixel *pix2, int stride2, int sums[2][4] );
float (*ssim_end4)( int sum0[5][4], int sum1[5][4], int width );
} x264_pixel_function_t;
void x264_pixel_init( int cpu, x264_pixel_function_t *pixf );
-int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height );
+uint64_t x264_pixel_ssd_nv12( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height );
+uint64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height );
float x264_pixel_ssim_wxh( x264_pixel_function_t *pf, pixel *pix1, int i_pix1, pixel *pix2, int i_pix2, int i_width, int i_height, void *buf );
#endif
#if !X264_HIGH_BIT_DEPTH
pf->mc_luma = mc_luma_altivec;
pf->get_ref = get_ref_altivec;
- pf->mc_chroma = mc_chroma_altivec;
+// pf->mc_chroma = mc_chroma_altivec;
pf->copy_16x16_unaligned = x264_mc_copy_w16_altivec;
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_aligned_altivec;
[base], [base+stride], [base+stride*2], [base3], \
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
-; in: 8 rows of 4 bytes in %1..%8
+%define PASS8ROWS(base, base3, stride, stride3, offset) \
+ PASS8ROWS(base+offset, base3+offset, stride, stride3)
+
+; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
-%macro TRANSPOSE4x8_LOAD 8
- movd m0, %1
- movd m2, %2
- movd m1, %3
- movd m3, %4
- punpcklbw m0, m2
- punpcklbw m1, m3
- movq m2, m0
- punpcklwd m0, m1
- punpckhwd m2, m1
-
- movd m4, %5
- movd m6, %6
- movd m5, %7
- movd m7, %8
- punpcklbw m4, m6
- punpcklbw m5, m7
- movq m6, m4
- punpcklwd m4, m5
- punpckhwd m6, m5
-
- movq m1, m0
- movq m3, m2
- punpckldq m0, m4
- punpckhdq m1, m4
- punpckldq m2, m6
- punpckhdq m3, m6
+%macro TRANSPOSE4x8_LOAD 11
+ movh m0, %4
+ movh m2, %5
+ movh m1, %6
+ movh m3, %7
+ punpckl%1 m0, m2
+ punpckl%1 m1, m3
+ mova m2, m0
+ punpckl%2 m0, m1
+ punpckh%2 m2, m1
+
+ movh m4, %8
+ movh m6, %9
+ movh m5, %10
+ movh m7, %11
+ punpckl%1 m4, m6
+ punpckl%1 m5, m7
+ mova m6, m4
+ punpckl%2 m4, m5
+ punpckh%2 m6, m5
+
+ mova m1, m0
+ mova m3, m2
+ punpckl%3 m0, m4
+ punpckh%3 m1, m4
+ punpckl%3 m2, m6
+ punpckh%3 m3, m6
%endmacro
; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
-%macro TRANSPOSE8x4_STORE 8
- movq m4, m0
- movq m5, m1
- movq m6, m2
+%macro TRANSPOSE8x4B_STORE 8
+ mova m4, m0
+ mova m5, m1
+ mova m6, m2
punpckhdq m4, m4
punpckhdq m5, m5
punpckhdq m6, m6
punpcklbw m0, m1
punpcklbw m2, m3
- movq m1, m0
+ mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
- movd %1, m0
+ movh %1, m0
punpckhdq m0, m0
- movd %2, m0
- movd %3, m1
+ movh %2, m0
+ movh %3, m1
punpckhdq m1, m1
- movd %4, m1
+ movh %4, m1
punpckhdq m3, m3
punpcklbw m4, m5
punpcklbw m6, m3
- movq m5, m4
+ mova m5, m4
punpcklwd m4, m6
punpckhwd m5, m6
- movd %5, m4
+ movh %5, m4
punpckhdq m4, m4
- movd %6, m4
- movd %7, m5
+ movh %6, m4
+ movh %7, m5
punpckhdq m5, m5
- movd %8, m5
+ movh %8, m5
+%endmacro
+
+%macro TRANSPOSE4x8B_LOAD 8
+ TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
+%endmacro
+
+%macro TRANSPOSE4x8W_LOAD 8
+%if mmsize==16
+ TRANSPOSE4x8_LOAD wd, dq, qdq, %1, %2, %3, %4, %5, %6, %7, %8
+%else
+ SWAP 1, 4, 2, 3
+ mova m0, [t5]
+ mova m1, [t5+r1]
+ mova m2, [t5+r1*2]
+ mova m3, [t5+t6]
+ TRANSPOSE4x4W 0, 1, 2, 3, 4
+%endif
+%endmacro
+
+%macro TRANSPOSE8x2W_STORE 8
+ mova m0, m1
+ punpcklwd m1, m2
+ punpckhwd m0, m2
+%if mmsize==8
+ movd %1, m1
+ movd %3, m0
+ psrlq m1, 32
+ psrlq m0, 32
+ movd %2, m1
+ movd %4, m0
+%else
+ movd %1, m1
+ movd %5, m0
+ psrldq m1, 4
+ psrldq m0, 4
+ movd %2, m1
+ movd %6, m0
+ psrldq m1, 4
+ psrldq m0, 4
+ movd %3, m1
+ movd %7, m0
+ psrldq m1, 4
+ psrldq m0, 4
+ movd %4, m1
+ movd %8, m0
+%endif
%endmacro
%macro SBUTTERFLY3 4
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
+ RESET_MM_PERMUTATION
movq m0, %1
movq m1, %2
movq m2, %3
movq m4, %5
movq m5, %6
movq m6, %7
- SBUTTERFLY3 bw, m0, m1, m7
- SBUTTERFLY3 bw, m2, m3, m1
- SBUTTERFLY3 bw, m4, m5, m3
- movq [%9+0x10], m1
- SBUTTERFLY3 bw, m6, %8, m5
- SBUTTERFLY3 wd, m0, m2, m1
- SBUTTERFLY3 wd, m4, m6, m2
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
+ SBUTTERFLY bw, 4, 5, 7
+ movq [%9+0x10], m3
+ SBUTTERFLY3 bw, m6, %8, m7
+ SBUTTERFLY wd, 0, 2, 3
+ SBUTTERFLY wd, 4, 6, 3
punpckhdq m0, m4
movq [%9+0x00], m0
- SBUTTERFLY3 wd, m7, [%9+0x10], m6
- SBUTTERFLY3 wd, m3, m5, m4
- SBUTTERFLY3 dq, m7, m3, m0
- SBUTTERFLY3 dq, m1, m2, m5
- punpckldq m6, m4
- movq [%9+0x10], m1
- movq [%9+0x20], m5
- movq [%9+0x30], m7
- movq [%9+0x40], m0
- movq [%9+0x50], m6
+ SBUTTERFLY3 wd, m1, [%9+0x10], m3
+ SBUTTERFLY wd, 5, 7, 0
+ SBUTTERFLY dq, 1, 5, 0
+ SBUTTERFLY dq, 2, 6, 0
+ punpckldq m3, m7
+ movq [%9+0x10], m2
+ movq [%9+0x20], m6
+ movq [%9+0x30], m1
+ movq [%9+0x40], m5
+ movq [%9+0x50], m3
+ RESET_MM_PERMUTATION
%endmacro
; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
+ RESET_MM_PERMUTATION
movq m0, %1
movq m1, %2
movq m2, %3
movq m4, %5
movq m5, %6
movq m6, %7
- SBUTTERFLY3 bw, m0, m1, m7
- SBUTTERFLY3 bw, m2, m3, m1
- SBUTTERFLY3 bw, m4, m5, m3
- SBUTTERFLY3 bw, m6, %8, m5
- movq %9, m3
- SBUTTERFLY3 wd, m0, m2, m3
- SBUTTERFLY3 wd, m4, m6, m2
- SBUTTERFLY3 wd, m7, m1, m6
- movq %11, m2
- movq m2, %9
- SBUTTERFLY3 wd, m2, m5, m1
- SBUTTERFLY3 dq, m0, m4, m5
- SBUTTERFLY3 dq, m7, m2, m4
+ SBUTTERFLY bw, 0, 1, 7
+ SBUTTERFLY bw, 2, 3, 7
+ SBUTTERFLY bw, 4, 5, 7
+ SBUTTERFLY3 bw, m6, %8, m7
+ movq %9, m5
+ SBUTTERFLY wd, 0, 2, 5
+ SBUTTERFLY wd, 4, 6, 5
+ SBUTTERFLY wd, 1, 3, 5
+ movq %11, m6
+ movq m6, %9
+ SBUTTERFLY wd, 6, 7, 5
+ SBUTTERFLY dq, 0, 4, 5
+ SBUTTERFLY dq, 1, 6, 5
movq %9, m0
- movq %10, m5
- movq %13, m7
- movq %14, m4
- SBUTTERFLY3 dq, m3, %11, m0
- SBUTTERFLY3 dq, m6, m1, m5
- movq %11, m3
+ movq %10, m4
+ movq %13, m1
+ movq %14, m6
+ SBUTTERFLY3 dq, m2, %11, m0
+ SBUTTERFLY dq, 3, 7, 4
+ movq %11, m2
movq %12, m0
- movq %15, m6
- movq %16, m5
+ movq %15, m3
+ movq %16, m7
+ RESET_MM_PERMUTATION
%endmacro
; out: %4 = |%1-%2|>%3
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
- TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
shl r10, 3
sub r6, r10
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
- TRANSPOSE8x4_STORE PASS8ROWS(r6, r5, r10, r11)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r10, r11)
%ifdef WIN64
add rsp, 0x98
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
- TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
lea r0, [r0+r3*8]
lea r1, [r1+r3*8]
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
- TRANSPOSE8x4_STORE PASS8ROWS(r0, r1, r3, r4)
+ TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
ADD esp, pad
RET
-INIT_MMX
-
%macro CHROMA_V_START 0
dec r2d ; alpha-1
dec r3d ; beta-1
mov t5, r0
sub t5, r1
sub t5, r1
+%if mmsize==8
+ mov dword r0m, 2
+.skip_prologue:
+%endif
%endmacro
%macro CHROMA_H_START 0
dec r2d
dec r3d
- sub r0, 2
+ sub r0, 4
lea t6, [r1*3]
mov t5, r0
add r0, t6
+%if mmsize==8
+ mov dword r0m, 2
+.skip_prologue:
+%endif
+%endmacro
+
+%macro CHROMA_V_LOOP 1
+%if mmsize==8
+ add r0, 8
+ add t5, 8
+%if %1
+ add r4, 2
+%endif
+ dec dword r0m
+ jg .skip_prologue
+%endif
+%endmacro
+
+%macro CHROMA_H_LOOP 1
+%if mmsize==8
+ lea r0, [r0+r1*4]
+ lea t5, [t5+r1*4]
+%if %1
+ add r4, 2
+%endif
+ dec dword r0m
+ jg .skip_prologue
+%endif
%endmacro
%define t5 r5
%define t6 r6
+%macro DEBLOCK_CHROMA 1
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_mmxext, 5,6
+cglobal deblock_v_chroma_%1, 5,6,8
CHROMA_V_START
- movq m0, [t5]
- movq m1, [t5+r1]
- movq m2, [r0]
- movq m3, [r0+r1]
- call chroma_inter_body_mmxext
- movq [t5+r1], m1
- movq [r0], m2
+ mova m0, [t5]
+ mova m1, [t5+r1]
+ mova m2, [r0]
+ mova m3, [r0+r1]
+ call chroma_inter_body_%1
+ mova [t5+r1], m1
+ mova [r0], m2
+ CHROMA_V_LOOP 1
RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_mmxext, 5,7
-%ifdef ARCH_X86_64
- %define buf0 [rsp-24]
- %define buf1 [rsp-16]
-%else
- %define buf0 r0m
- %define buf1 r2m
-%endif
+cglobal deblock_h_chroma_%1, 5,7,8
CHROMA_H_START
- TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
- movq buf0, m0
- movq buf1, m3
- call chroma_inter_body_mmxext
- movq m0, buf0
- movq m3, buf1
- TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+ call chroma_inter_body_%1
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ CHROMA_H_LOOP 1
RET
ALIGN 16
-chroma_inter_body_mmxext:
+RESET_MM_PERMUTATION
+chroma_inter_body_%1:
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
+ punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
ret
+%endmacro ; DEBLOCK_CHROMA
+INIT_XMM
+DEBLOCK_CHROMA sse2
+%ifndef ARCH_X86_64
+INIT_MMX
+DEBLOCK_CHROMA mmxext
+%endif
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
- movq m4, %1
+ mova m4, %1
pxor m4, %3
pand m4, [pb_1] ; m4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, m4
- pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
+ pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
%define t5 r4
%define t6 r5
+%macro DEBLOCK_CHROMA_INTRA 1
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_v_chroma_intra_mmxext, 4,5
+cglobal deblock_v_chroma_intra_%1, 4,5,8
CHROMA_V_START
- movq m0, [t5]
- movq m1, [t5+r1]
- movq m2, [r0]
- movq m3, [r0+r1]
- call chroma_intra_body_mmxext
- movq [t5+r1], m1
- movq [r0], m2
+ mova m0, [t5]
+ mova m1, [t5+r1]
+ mova m2, [r0]
+ mova m3, [r0+r1]
+ call chroma_intra_body_%1
+ mova [t5+r1], m1
+ mova [r0], m2
+ CHROMA_V_LOOP 0
RET
;-----------------------------------------------------------------------------
; void deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
-cglobal deblock_h_chroma_intra_mmxext, 4,6
+cglobal deblock_h_chroma_intra_%1, 4,6,8
CHROMA_H_START
- TRANSPOSE4x8_LOAD PASS8ROWS(t5, r0, r1, t6)
- call chroma_intra_body_mmxext
- TRANSPOSE8x4_STORE PASS8ROWS(t5, r0, r1, t6)
+ TRANSPOSE4x8W_LOAD PASS8ROWS(t5, r0, r1, t6)
+ call chroma_intra_body_%1
+ TRANSPOSE8x2W_STORE PASS8ROWS(t5, r0, r1, t6, 2)
+ CHROMA_H_LOOP 0
RET
ALIGN 16
-chroma_intra_body_mmxext:
+RESET_MM_PERMUTATION
+chroma_intra_body_%1:
LOAD_MASK r2d, r3d
- movq m5, m1
- movq m6, m2
+ mova m5, m1
+ mova m6, m2
CHROMA_INTRA_P0 m1, m0, m3
CHROMA_INTRA_P0 m2, m3, m0
psubb m1, m5
paddb m1, m5
paddb m2, m6
ret
+%endmacro ; DEBLOCK_CHROMA_INTRA
+
+INIT_XMM
+DEBLOCK_CHROMA_INTRA sse2
+%ifndef ARCH_X86_64
+INIT_MMX
+DEBLOCK_CHROMA_INTRA mmxext
+%endif
+
+
;-----------------------------------------------------------------------------
; static void deblock_strength( uint8_t nnz[48], int8_t ref[2][40], int16_t mv[2][40][2],
SECTION_RODATA 32
-ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
+ch_shuf: db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9
+ch_shuf_adj: times 8 db 0
+ times 8 db 2
+ times 8 db 4
+ times 8 db 6
SECTION .text
-cextern pw_1
cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64
+cextern pw_00ff
cextern sw_64
;=============================================================================
;-----------------------------------------------------------------------------
%ifdef ARCH_X86_64
cglobal prefetch_fenc_mmxext, 5,5
+ and r4d, 3
mov eax, r4d
- and eax, 3
- imul eax, r1d
- lea r0, [r0+rax*4+64]
+ imul r4d, r1d
+ lea r0, [r0+r4*4+64]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
prefetcht0 [r0]
prefetcht0 [r0+r1]
- and r4d, 6
- imul r4d, r3d
- lea r2, [r2+r4+64]
+ imul eax, r3d
+ lea r2, [r2+rax*2+64]
prefetcht0 [r2]
prefetcht0 [r2+r3]
RET
%else
-cglobal prefetch_fenc_mmxext
- mov r2, [esp+20]
- mov r1, [esp+8]
- mov r0, [esp+4]
+cglobal prefetch_fenc_mmxext, 0,3
+ mov r2, r4m
+ mov r1, r1m
+ mov r0, r0m
and r2, 3
imul r2, r1
lea r0, [r0+r2*4+64]
prefetcht0 [r0]
prefetcht0 [r0+r1]
- mov r2, [esp+20]
- mov r1, [esp+16]
- mov r0, [esp+12]
- and r2, 6
+ mov r2, r4m
+ mov r1, r3m
+ mov r0, r2m
+ and r2, 3
imul r2, r1
- lea r0, [r0+r2+64]
+ lea r0, [r0+r2*2+64]
prefetcht0 [r0]
prefetcht0 [r0+r1]
ret
; chroma MC
;=============================================================================
- %define t0 rax
%ifdef ARCH_X86_64
- %define t1 r10
+ DECLARE_REG_TMP 10,11,6
%else
- %define t1 r1
+ DECLARE_REG_TMP 0,1,2
%endif
%macro MC_CHROMA_START 0
- movifnidn r2, r2mp
- movifnidn r3d, r3m
+ movifnidn r3, r3mp
movifnidn r4d, r4m
movifnidn r5d, r5m
- mov t0d, r5d
- mov t1d, r4d
+ movifnidn t2d, r6m
+ mov t0d, t2d
+ mov t1d, r5d
sar t0d, 3
sar t1d, 3
- imul t0d, r3d
- add t0d, t1d
+ imul t0d, r4d
+ lea t0d, [t0+t1*2]
movsxdifnidn t0, t0d
- add r2, t0 ; src += (dx>>3) + (dy>>3) * src_stride
+ add r3, t0 ; src += (dx>>3) + (dy>>3) * src_stride
+%endmacro
+
+%macro UNPACK_UNALIGNED_MEM 3
+ punpcklwd %1, %3
+%endmacro
+
+%macro UNPACK_UNALIGNED_LOAD 3
+ movh %2, %3
+ punpcklwd %1, %2
%endmacro
;-----------------------------------------------------------------------------
-; void mc_chroma( uint8_t *dst, int dst_stride,
+; void mc_chroma( uint8_t *dstu, uint8_t *dstv, int dst_stride,
; uint8_t *src, int src_stride,
; int dx, int dy,
; int width, int height )
;-----------------------------------------------------------------------------
-%macro MC_CHROMA 1-2 0
-cglobal mc_chroma_%1
-%if mmsize == 16
- cmp dword r6m, 4
- jle mc_chroma_mmxext
-%endif
- PROLOGUE 0,6,%2
+%macro MC_CHROMA 1
+cglobal mc_chroma_%1, 0,6
MC_CHROMA_START
- pxor m3, m3
- and r4d, 7 ; dx &= 7
+ and r5d, 7
+%ifdef ARCH_X86_64
jz .mc1dy
- and r5d, 7 ; dy &= 7
- jz .mc1dx
-
- movd m5, r4d
- movd m6, r5d
- SPLATW m5, m5 ; m5 = dx
- SPLATW m6, m6 ; m6 = dy
-
- mova m4, [pw_8]
- mova m0, m4
- psubw m4, m5 ; m4 = 8-dx
- psubw m0, m6 ; m0 = 8-dy
-
- mova m7, m5
- pmullw m5, m0 ; m5 = dx*(8-dy) = cB
- pmullw m7, m6 ; m7 = dx*dy = cD
- pmullw m6, m4 ; m6 = (8-dx)*dy = cC
- pmullw m4, m0 ; m4 = (8-dx)*(8-dy) = cA
-
- mov r4d, r7m
+%endif
+ and t2d, 7
%ifdef ARCH_X86_64
- mov r10, r0
- mov r11, r2
+ jz .mc1dx
+%endif
+ shl r5d, 16
+ add t2d, r5d
+ mov t0d, t2d
+ shl t2d, 8
+ sub t2d, t0d
+ add t2d, 0x80008 ; (x<<24) + ((8-x)<<16) + (y<<8) + (8-y)
+ cmp dword r7m, 4
+%if mmsize==8
+.skip_prologue:
%else
- mov r0, r0mp
- mov r1, r1m
- mov r5, r2
+ jl mc_chroma_mmxext %+ .skip_prologue
+ WIN64_SPILL_XMM 9
%endif
-
-.loop2d:
- movh m1, [r2+r3]
- movh m0, [r2]
- punpcklbw m1, m3 ; 00 px1 | 00 px2 | 00 px3 | 00 px4
- punpcklbw m0, m3
- pmullw m1, m6 ; 2nd line * cC
- pmullw m0, m4 ; 1st line * cA
- paddw m0, m1 ; m0 <- result
-
- movh m2, [r2+1]
- movh m1, [r2+r3+1]
- punpcklbw m2, m3
- punpcklbw m1, m3
-
- paddw m0, [pw_32]
-
- pmullw m2, m5 ; line * cB
- pmullw m1, m7 ; line * cD
+ movd m5, t2d
+ movifnidn r0, r0mp
+ movifnidn r1, r1mp
+ movifnidn r2d, r2m
+ movifnidn r5d, r8m
+ pxor m6, m6
+ punpcklbw m5, m6
+%if mmsize==8
+ pshufw m7, m5, 0xee
+ pshufw m6, m5, 0x00
+ pshufw m5, m5, 0x55
+ jge .width4
+%else
+%ifdef WIN64
+ cmp dword r7m, 4 ; flags were clobbered by WIN64_SPILL_XMM
+%endif
+ pshufd m7, m5, 0x55
+ punpcklwd m5, m5
+ pshufd m6, m5, 0x00
+ pshufd m5, m5, 0x55
+ jg .width8
+%endif
+ movu m0, [r3]
+ UNPACK_UNALIGNED m0, m1, [r3+2]
+ mova m1, m0
+ pand m0, [pw_00ff]
+ psrlw m1, 8
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+ packssdw m0, m1
+ SWAP m3, m0
+ALIGN 4
+.loop2:
+ movu m0, [r3+r4]
+ UNPACK_UNALIGNED m0, m1, [r3+r4+2]
+ pmullw m3, m6
+ mova m1, m0
+ pand m0, [pw_00ff]
+ psrlw m1, 8
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+ mova m2, [pw_32]
+ packssdw m0, m1
+ paddw m2, m3
+ mova m3, m0
+ pmullw m0, m5
paddw m0, m2
- paddw m0, m1
psrlw m0, 6
+ packuswb m0, m0
+ movd [r0], m0
+%if mmsize==8
+ psrlq m0, 16
+%else
+ psrldq m0, 4
+%endif
+ movd [r1], m0
+ add r3, r4
+ add r0, r2
+ add r1, r2
+ dec r5d
+ jg .loop2
+ REP_RET
- packuswb m0, m3 ; 00 00 00 00 px1 px2 px3 px4
- movh [r0], m0
-
- add r2, r3
- add r0, r1 ; dst_stride
- dec r4d
- jnz .loop2d
-
-%if mmsize == 8
- sub dword r6m, 8
- jnz .finish ; width != 8 so assume 4
+%if mmsize==8
+.width4:
%ifdef ARCH_X86_64
- lea r0, [r10+4] ; dst
- lea r2, [r11+4] ; src
+ mov t0, r0
+ mov t1, r1
+ mov t2, r3
+ %define multy0 [rsp-8]
+ mova multy0, m5
%else
- mov r0, r0mp
- lea r2, [r5+4]
- add r0, 4
+ mov r3m, r3
+ %define multy0 r4m
+ mova multy0, m5
%endif
- mov r4d, r7m ; height
- jmp .loop2d
%else
+.width8:
+%ifdef ARCH_X86_64
+ %define multy0 m8
+ SWAP m8, m5
+%else
+ %define multy0 r0m
+ mova multy0, m5
+%endif
+%endif
+.loopx:
+ movu m0, [r3]
+ movu m1, [r3+mmsize/2]
+ UNPACK_UNALIGNED m0, m2, [r3+2]
+ UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
+ mova m2, m0
+ mova m3, m1
+ pand m0, [pw_00ff]
+ pand m1, [pw_00ff]
+ psrlw m2, 8
+ psrlw m3, 8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ pmaddwd m1, m7
+ pmaddwd m3, m7
+ packssdw m0, m2
+ packssdw m1, m3
+ SWAP m4, m0
+ SWAP m5, m1
+ add r3, r4
+ALIGN 4
+.loop4:
+ movu m0, [r3]
+ movu m1, [r3+mmsize/2]
+ UNPACK_UNALIGNED m0, m2, [r3+2]
+ UNPACK_UNALIGNED m1, m3, [r3+2+mmsize/2]
+ mova m2, m0
+ mova m3, m1
+ pand m0, [pw_00ff]
+ pand m1, [pw_00ff]
+ psrlw m2, 8
+ psrlw m3, 8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ pmaddwd m1, m7
+ pmaddwd m3, m7
+ packssdw m0, m2
+ packssdw m1, m3
+ pmullw m4, m6
+ pmullw m5, m6
+ mova m2, [pw_32]
+ mova m3, m2
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, m0
+ mova m5, m1
+ pmullw m0, multy0
+ pmullw m1, multy0
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 6
+ psrlw m1, 6
+ packuswb m0, m1
+%if mmsize==8
+ pshufw m1, m0, 0x8
+ pshufw m0, m0, 0xd
+ movd [r0], m1
+ movd [r1], m0
+%else
+ pshufd m0, m0, 0xd8
+ movq [r0], m0
+ movhps [r1], m0
+%endif
+ add r3, r4
+ add r0, r2
+ add r1, r2
+ dec r5d
+ jg .loop4
+%if mmsize!=8
REP_RET
-%endif ; mmsize
+%else
+ sub dword r7m, 4
+ jg .width8
+ REP_RET
+.width8:
+%ifdef ARCH_X86_64
+ lea r3, [t2+8]
+ lea r0, [t0+4]
+ lea r1, [t1+4]
+%else
+ mov r3, r3m
+ mov r0, r0m
+ mov r1, r1m
+ add r3, 8
+ add r0, 4
+ add r1, 4
+%endif
+ mov r5d, r8m
+ jmp .loopx
+%endif
+%ifdef ARCH_X86_64 ; too many regs for x86_32
+ RESET_MM_PERMUTATION
+%ifdef WIN64
+%if xmm_regs_used > 6
+ %assign stack_offset stack_offset-(xmm_regs_used-6)*16-16
+ %assign xmm_regs_used 6
+%endif
+%endif
.mc1dy:
- and r5d, 7
- movd m6, r5d
- mov r5, r3 ; pel_offset = dx ? 1 : src_stride
+ and t2d, 7
+ movd m5, t2d
+ mov r6d, r4d ; pel_offset = dx ? 2 : src_stride
jmp .mc1d
.mc1dx:
- movd m6, r4d
- mov r5d, 1
+ movd m5, r5d
+ mov r6d, 2
.mc1d:
- mova m5, [pw_8]
- SPLATW m6, m6
- mova m7, [pw_4]
- psubw m5, m6
- movifnidn r0, r0mp
- movifnidn r1d, r1m
- mov r4d, r7m
-%if mmsize == 8
- cmp dword r6m, 8
- je .loop1d_w8
+ mova m4, [pw_8]
+ SPLATW m5, m5
+ psubw m4, m5
+ movifnidn r0, r0mp
+ movifnidn r1, r1mp
+ movifnidn r2d, r2m
+ movifnidn r5d, r8m
+ cmp dword r7m, 4
+ jg .mc1d_w8
+ mov r10, r2
+ mov r11, r4
+%if mmsize!=8
+ shr r5d, 1
%endif
-
.loop1d_w4:
- movh m0, [r2+r5]
- movh m1, [r2]
- punpcklbw m0, m3
- punpcklbw m1, m3
- pmullw m0, m6
- pmullw m1, m5
- paddw m0, m7
- paddw m0, m1
- psrlw m0, 3
- packuswb m0, m3
- movh [r0], m0
- add r2, r3
- add r0, r1
- dec r4d
- jnz .loop1d_w4
-.finish:
- REP_RET
-
-%if mmsize == 8
-.loop1d_w8:
- movu m0, [r2+r5]
- mova m1, [r2]
+ movq m0, [r3]
+ movq m1, [r3+r6]
+%if mmsize!=8
+ add r3, r11
+ movhps m0, [r3]
+ movhps m1, [r3+r6]
+%endif
mova m2, m0
- mova m4, m1
- punpcklbw m0, m3
- punpcklbw m1, m3
- punpckhbw m2, m3
- punpckhbw m4, m3
- pmullw m0, m6
+ mova m3, m1
+ pand m0, [pw_00ff]
+ pand m1, [pw_00ff]
+ psrlw m2, 8
+ psrlw m3, 8
+ pmullw m0, m4
pmullw m1, m5
- pmullw m2, m6
- pmullw m4, m5
- paddw m0, m7
- paddw m2, m7
+ pmullw m2, m4
+ pmullw m3, m5
+ paddw m0, [pw_4]
+ paddw m2, [pw_4]
paddw m0, m1
- paddw m2, m4
+ paddw m2, m3
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
- mova [r0], m0
- add r2, r3
- add r0, r1
- dec r4d
- jnz .loop1d_w8
+%if mmsize==8
+ xchg r4, r11
+ xchg r2, r10
+ movd [r0], m0
+ psrlq m0, 32
+ movd [r1], m0
+%else
+ movhlps m1, m0
+ movd [r0], m0
+ movd [r1], m1
+ add r0, r10
+ add r1, r10
+ psrldq m0, 4
+ psrldq m1, 4
+ movd [r0], m0
+ movd [r1], m1
+%endif
+ add r3, r4
+ add r0, r2
+ add r1, r2
+ dec r5d
+ jg .loop1d_w4
REP_RET
-%endif ; mmsize
+.mc1d_w8:
+ sub r2, 4
+ sub r4, 8
+ mov r10, 4
+ mov r11, 8
+%if mmsize==8
+ shl r5d, 1
+%endif
+ jmp .loop1d_w4
+%endif ; ARCH_X86_64
%endmacro ; MC_CHROMA
-INIT_MMX
-MC_CHROMA mmxext
-INIT_XMM
-MC_CHROMA sse2, 8
-%macro MC_CHROMA_SSSE3 2
-INIT_MMX
-cglobal mc_chroma_ssse3%1, 0,6,%2
+%macro MC_CHROMA_SSSE3 0-1
+INIT_XMM
+cglobal mc_chroma_ssse3%1, 0,6,9
MC_CHROMA_START
- and r4d, 7
and r5d, 7
- mov t0d, r4d
+ and t2d, 7
+ mov t0d, r5d
shl t0d, 8
- sub t0d, r4d
- mov r4d, 8
+ sub t0d, r5d
+ mov r5d, 8
add t0d, 8
- sub r4d, r5d
- imul r5d, t0d ; (x*255+8)*y
- imul r4d, t0d ; (x*255+8)*(8-y)
- cmp dword r6m, 4
- jg .width8
- mova m5, [pw_32]
- movd m6, r5d
- movd m7, r4d
- movifnidn r0, r0mp
- movifnidn r1d, r1m
- movifnidn r4d, r7m
- SPLATW m6, m6
- SPLATW m7, m7
- mov r5, r2
- and r2, ~3
- and r5, 3
+ sub r5d, t2d
+ imul t2d, t0d ; (x*255+8)*y
+ imul r5d, t0d ; (x*255+8)*(8-y)
+ movd m6, t2d
+ movd m7, r5d
+%ifidn %1, _cache64
+ mov t0d, r3d
+ and t0d, 7
%ifdef PIC
- lea r11, [ch_shuffle]
- movu m5, [r11 + r5*2]
+ lea t1, [ch_shuf_adj]
+ movddup m5, [t1 + t0*4]
+%else
+ movddup m5, [ch_shuf_adj + t0*4]
+%endif
+ paddb m5, [ch_shuf]
+ and r3, ~7
%else
- movu m5, [ch_shuffle + r5*2]
+ mova m5, [ch_shuf]
%endif
- movu m0, [r2]
+ movifnidn r0, r0mp
+ movifnidn r1, r1mp
+ movifnidn r2d, r2m
+ movifnidn r5d, r8m
+ SPLATW m6, m6
+ SPLATW m7, m7
+ cmp dword r7m, 4
+ jg .width8
+ movu m0, [r3]
pshufb m0, m5
.loop4:
- movu m1, [r2+r3]
+ movu m1, [r3+r4]
pshufb m1, m5
- movu m3, [r2+2*r3]
+ movu m3, [r3+r4*2]
pshufb m3, m5
- lea r2, [r2+2*r3]
mova m2, m1
mova m4, m3
pmaddubsw m0, m7
mova m0, m4
psrlw m1, 6
psrlw m3, 6
- packuswb m1, m1
- packuswb m3, m3
- movh [r0], m1
- movh [r0+r1], m3
- sub r4d, 2
- lea r0, [r0+2*r1]
+ packuswb m1, m3
+ movhlps m3, m1
+ movd [r0], m1
+ movd [r0+r2], m3
+ psrldq m1, 4
+ psrldq m3, 4
+ movd [r1], m1
+ movd [r1+r2], m3
+ lea r3, [r3+r4*2]
+ lea r0, [r0+r2*2]
+ lea r1, [r1+r2*2]
+ sub r5d, 2
jg .loop4
REP_RET
-INIT_XMM
.width8:
- movd m6, r5d
- movd m7, r4d
- movifnidn r0, r0mp
- movifnidn r1d, r1m
- movifnidn r4d, r7m
- SPLATW m6, m6
- SPLATW m7, m7
-%ifidn %1, _cache64
- mov r5, r2
- and r5, 0x3f
- cmp r5, 0x38
- jge .split
-%endif
- mova m5, [pw_32]
- movh m0, [r2]
- movh m1, [r2+1]
- punpcklbw m0, m1
-.loop8:
- movh m1, [r2+1*r3]
- movh m2, [r2+1*r3+1]
- movh m3, [r2+2*r3]
- movh m4, [r2+2*r3+1]
- punpcklbw m1, m2
- punpcklbw m3, m4
- lea r2, [r2+2*r3]
- mova m2, m1
- mova m4, m3
- pmaddubsw m0, m7
- pmaddubsw m1, m6
- pmaddubsw m2, m7
- pmaddubsw m3, m6
- paddw m0, m5
- paddw m2, m5
- paddw m1, m0
- paddw m3, m2
- mova m0, m4
- psrlw m1, 6
- psrlw m3, 6
- packuswb m1, m3
- movh [r0], m1
- movhps [r0+r1], m1
- sub r4d, 2
- lea r0, [r0+2*r1]
- jg .loop8
- REP_RET
-%ifidn %1, _cache64
-.split:
- and r2, ~7
- and r5, 7
-%ifdef PIC
- lea r11, [ch_shuffle]
- movu m5, [r11 + r5*2]
-%else
- movu m5, [ch_shuffle + r5*2]
-%endif
- movu m0, [r2]
+ movu m0, [r3]
pshufb m0, m5
+ movu m1, [r3+8]
+ pshufb m1, m5
%ifdef ARCH_X86_64
- mova m8, [pw_32]
- %define round m8
+ SWAP m8, m6
+ %define mult1 m8
%else
- %define round [pw_32]
+ mova r0m, m6
+ %define mult1 r0m
%endif
-.splitloop8:
- movu m1, [r2+r3]
- pshufb m1, m5
- movu m3, [r2+2*r3]
+.loop8:
+ movu m2, [r3+r4]
+ pshufb m2, m5
+ movu m3, [r3+r4+8]
pshufb m3, m5
- lea r2, [r2+2*r3]
- mova m2, m1
- mova m4, m3
+ mova m4, m2
+ mova m6, m3
pmaddubsw m0, m7
- pmaddubsw m1, m6
- pmaddubsw m2, m7
- pmaddubsw m3, m6
- paddw m0, round
- paddw m2, round
- paddw m1, m0
- paddw m3, m2
- mova m0, m4
+ pmaddubsw m1, m7
+ pmaddubsw m2, mult1
+ pmaddubsw m3, mult1
+ paddw m0, [pw_32]
+ paddw m1, [pw_32]
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 6
psrlw m1, 6
+ packuswb m0, m1
+ pshufd m0, m0, 0xd8
+ movq [r0], m0
+ movhps [r1], m0
+
+ movu m2, [r3+r4*2]
+ pshufb m2, m5
+ movu m3, [r3+r4*2+8]
+ pshufb m3, m5
+ mova m0, m2
+ mova m1, m3
+ pmaddubsw m4, m7
+ pmaddubsw m6, m7
+ pmaddubsw m2, mult1
+ pmaddubsw m3, mult1
+ paddw m4, [pw_32]
+ paddw m6, [pw_32]
+ paddw m2, m4
+ paddw m3, m6
+ psrlw m2, 6
psrlw m3, 6
- packuswb m1, m3
- movh [r0], m1
- movhps [r0+r1], m1
- sub r4d, 2
- lea r0, [r0+2*r1]
- jg .splitloop8
+ packuswb m2, m3
+ pshufd m2, m2, 0xd8
+ movq [r0+r2], m2
+ movhps [r1+r2], m2
+ lea r3, [r3+r4*2]
+ lea r0, [r0+r2*2]
+ lea r1, [r1+r2*2]
+ sub r5d, 2
+ jg .loop8
REP_RET
-%endif
-; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size
%endmacro
-MC_CHROMA_SSSE3 , 8
-MC_CHROMA_SSSE3 _cache64, 9
+INIT_MMX
+%define UNPACK_UNALIGNED UNPACK_UNALIGNED_MEM
+MC_CHROMA mmxext
+INIT_XMM
+MC_CHROMA sse2_misalign
+%define UNPACK_UNALIGNED UNPACK_UNALIGNED_LOAD
+MC_CHROMA sse2
+MC_CHROMA_SSSE3
+MC_CHROMA_SSSE3 _cache64
filt_mul20: times 16 db 20
filt_mul15: times 8 db 1, -5
filt_mul51: times 8 db -5, 1
-hpel_shuf: db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
+hpel_shuf: db 0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15
+deinterleave_shuf: db 0,2,4,6,8,10,12,14,1,3,5,7,9,11,13,15
SECTION .text
cextern pw_1
cextern pw_16
cextern pw_32
-cextern pd_128
+cextern pw_00ff
cextern pw_3fff
+cextern pd_128
%macro LOAD_ADD 4
movh %4, %3
mova [r2+r4*2], m1
mova [r2+r4*2+mmsize], m4
FILT_PACK m1, m4, 5, m7
- movnt [r0+r4], m1
+ movnta [r0+r4], m1
add r1, mmsize
add r5, mmsize
add r4, mmsize
RET
+%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
+ movq m0, [%2]
+%if mmsize==16
+%if %4
+ punpcklbw m0, [%3]
+%else
+ movq m1, [%3]
+ punpcklbw m0, m1
+%endif
+ mov%5a [%1], m0
+%else
+ movq m1, [%3]
+ mova m2, m0
+ punpcklbw m0, m1
+ punpckhbw m2, m1
+ mov%5a [%1], m0
+ mov%5a [%1+8], m2
+%endif
+%endmacro
+
+%macro DEINTERLEAVE 6 ; dstu, dstv, src, dstv==dstu+8, cpu, shuffle constant
+%if mmsize==16
+ mova m0, [%3]
+%ifidn %5, ssse3
+ pshufb m0, %6
+%else
+ mova m1, m0
+ pand m0, %6
+ psrlw m1, 8
+ packuswb m0, m1
+%endif
+%if %4
+ mova [%1], m0
+%else
+ movq [%1], m0
+ movhps [%2], m0
+%endif
+%else
+ mova m0, [%3]
+ mova m1, [%3+8]
+ mova m2, m0
+ mova m3, m1
+ pand m0, %6
+ pand m1, %6
+ psrlw m2, 8
+ psrlw m3, 8
+ packuswb m0, m1
+ packuswb m2, m3
+ mova [%1], m0
+ mova [%2], m2
+%endif
+%endmacro
+
+%macro PLANE_INTERLEAVE 1
+;-----------------------------------------------------------------------------
+; void plane_copy_interleave_core( uint8_t *dst, int i_dst,
+; uint8_t *srcu, int i_srcu,
+; uint8_t *srcv, int i_srcv, int w, int h )
+;-----------------------------------------------------------------------------
+; assumes i_dst and w are multiples of 16, and i_dst>2*w
+cglobal plane_copy_interleave_core_%1, 6,7
+ mov r6d, r6m
+ movsxdifnidn r1, r1d
+ movsxdifnidn r3, r3d
+ movsxdifnidn r5, r5d
+ lea r0, [r0+r6*2]
+ add r2, r6
+ add r4, r6
+%ifdef ARCH_X86_64
+ DECLARE_REG_TMP 10,11
+%else
+ DECLARE_REG_TMP 1,3
+%endif
+ mov t0d, r7m
+ mov t1d, r1d
+ shr t1d, 1
+ sub t1d, r6d
+.loopy:
+ mov r6d, r6m
+ neg r6
+.prefetch:
+ prefetchnta [r2+r6]
+ prefetchnta [r4+r6]
+ add r6, 64
+ jl .prefetch
+ mov r6d, r6m
+ neg r6
+.loopx:
+ INTERLEAVE r0+r6*2, r2+r6, r4+r6, 0, nt
+ INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, 0, nt
+ add r6, 16
+ jl .loopx
+.pad:
+%if mmsize==8
+ movntq [r0+r6*2], m0
+ movntq [r0+r6*2+8], m0
+ movntq [r0+r6*2+16], m0
+ movntq [r0+r6*2+24], m0
+%else
+ movntdq [r0+r6*2], m0
+ movntdq [r0+r6*2+16], m0
+%endif
+ add r6, 16
+ cmp r6, t1
+ jl .pad
+ add r0, r1mp
+ add r2, r3mp
+ add r4, r5
+ dec t0d
+ jg .loopy
+ sfence
+ emms
+ RET
+
+;-----------------------------------------------------------------------------
+; void store_interleave_8x8x2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv )
+;-----------------------------------------------------------------------------
+cglobal store_interleave_8x8x2_%1, 4,5
+ mov r4d, 4
+.loop:
+ INTERLEAVE r0, r2, r3, 1
+ INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, 1
+ add r2, FDEC_STRIDE*2
+ add r3, FDEC_STRIDE*2
+ lea r0, [r0+r1*2]
+ dec r4d
+ jg .loop
+ REP_RET
+%endmacro ; PLANE_INTERLEAVE
+
+%macro DEINTERLEAVE_START 1
+%ifidn %1, ssse3
+ mova m4, [deinterleave_shuf]
+%else
+ mova m4, [pw_00ff]
+%endif
+%endmacro
+
+%macro PLANE_DEINTERLEAVE 1
+;-----------------------------------------------------------------------------
+; void plane_copy_deinterleave( uint8_t *dstu, int i_dstu,
+; uint8_t *dstv, int i_dstv,
+; uint8_t *src, int i_src, int w, int h )
+;-----------------------------------------------------------------------------
+cglobal plane_copy_deinterleave_%1, 6,7
+ DEINTERLEAVE_START %1
+ mov r6d, r6m
+ movsxdifnidn r1, r1d
+ movsxdifnidn r3, r3d
+ movsxdifnidn r5, r5d
+ add r0, r6
+ add r2, r6
+ lea r4, [r4+r6*2]
+.loopy:
+ mov r6d, r6m
+ neg r6
+.loopx:
+ DEINTERLEAVE r0+r6, r2+r6, r4+r6*2, 0, %1, m4
+ DEINTERLEAVE r0+r6+8, r2+r6+8, r4+r6*2+16, 0, %1, m4
+ add r6, 16
+ jl .loopx
+ add r0, r1
+ add r2, r3
+ add r4, r5
+ dec dword r7m
+ jg .loopy
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void load_deinterleave_8x8x2_fenc( uint8_t *dst, uint8_t *src, int i_src )
+;-----------------------------------------------------------------------------
+cglobal load_deinterleave_8x8x2_fenc_%1, 3,4
+ DEINTERLEAVE_START %1
+ mov r3d, 4
+.loop:
+ DEINTERLEAVE r0, r0+FENC_STRIDE/2, r1, 1, %1, m4
+ DEINTERLEAVE r0+FENC_STRIDE, r0+FENC_STRIDE*3/2, r1+r2, 1, %1, m4
+ add r0, FENC_STRIDE*2
+ lea r1, [r1+r2*2]
+ dec r3d
+ jg .loop
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void load_deinterleave_8x8x2_fdec( uint8_t *dst, uint8_t *src, int i_src )
+;-----------------------------------------------------------------------------
+cglobal load_deinterleave_8x8x2_fdec_%1, 3,4
+ DEINTERLEAVE_START %1
+ mov r3d, 4
+.loop:
+ DEINTERLEAVE r0, r0+FDEC_STRIDE/2, r1, 0, %1, m4
+ DEINTERLEAVE r0+FDEC_STRIDE, r0+FDEC_STRIDE*3/2, r1+r2, 0, %1, m4
+ add r0, FDEC_STRIDE*2
+ lea r1, [r1+r2*2]
+ dec r3d
+ jg .loop
+ REP_RET
+%endmacro ; PLANE_DEINTERLEAVE
+
+INIT_MMX
+PLANE_INTERLEAVE mmxext
+PLANE_DEINTERLEAVE mmx
+INIT_XMM
+PLANE_INTERLEAVE sse2
+PLANE_DEINTERLEAVE sse2
+PLANE_DEINTERLEAVE ssse3
+
; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size or a size less than 64.
void x264_mc_copy_w16_aligned_sse2( uint8_t *, int, uint8_t *, int, int );
void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
void x264_prefetch_ref_mmxext( uint8_t *, int, int );
-void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
-void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
-void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
-void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
- uint8_t *dst, int i_dst_stride,
- int dx, int dy, int i_width, int i_height );
void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
-void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h);
+void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h );
+void x264_plane_copy_interleave_core_mmxext( uint8_t *dst, int i_dst,
+ uint8_t *srcu, int i_srcu,
+ uint8_t *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_interleave_core_sse2( uint8_t *dst, int i_dst,
+ uint8_t *srcu, int i_srcu,
+ uint8_t *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_interleave_c( uint8_t *dst, int i_dst,
+ uint8_t *srcu, int i_srcu,
+ uint8_t *srcv, int i_srcv, int w, int h );
+void x264_plane_copy_deinterleave_mmx( uint8_t *dstu, int i_dstu,
+ uint8_t *dstv, int i_dstv,
+ uint8_t *src, int i_src, int w, int h );
+void x264_plane_copy_deinterleave_sse2( uint8_t *dstu, int i_dstu,
+ uint8_t *dstv, int i_dstv,
+ uint8_t *src, int i_src, int w, int h );
+void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
+ uint8_t *dstv, int i_dstv,
+ uint8_t *src, int i_src, int w, int h );
+void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv );
+void x264_store_interleave_8x8x2_sse2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv );
+void x264_load_deinterleave_8x8x2_fenc_mmx( uint8_t *dst, uint8_t *src, int i_src );
+void x264_load_deinterleave_8x8x2_fenc_sse2( uint8_t *dst, uint8_t *src, int i_src );
+void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src );
+void x264_load_deinterleave_8x8x2_fdec_mmx( uint8_t *dst, uint8_t *src, int i_src );
+void x264_load_deinterleave_8x8x2_fdec_sse2( uint8_t *dst, uint8_t *src, int i_src );
+void x264_load_deinterleave_8x8x2_fdec_ssse3( uint8_t *dst, uint8_t *src, int i_src );
void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
void x264_memzero_aligned_mmx( void * dst, int n );
void x264_integral_init4v_ssse3( uint16_t *sum8, uint16_t *sum4, int stride );
void x264_mbtree_propagate_cost_sse2( int *dst, uint16_t *propagate_in, uint16_t *intra_costs,
uint16_t *inter_costs, uint16_t *inv_qscales, int len );
+
+#define MC_CHROMA(cpu)\
+void x264_mc_chroma_##cpu( uint8_t *dstu, uint8_t *dstv, int i_dst,\
+ uint8_t *src, int i_src,\
+ int dx, int dy, int i_width, int i_height );
+MC_CHROMA(mmxext)
+MC_CHROMA(sse2)
+MC_CHROMA(sse2_misalign)
+MC_CHROMA(ssse3)
+MC_CHROMA(ssse3_cache64)
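+
+For reference, a hedged C sketch of the interpolation the new interleaved-source signature performs: 1/8-pel bilinear chroma MC reading packed U+V pairs from a single NV12 plane and writing separate U and V blocks. The function name below is illustrative, not the project's C implementation.
+
+static void mc_chroma_nv12_ref( uint8_t *dstu, uint8_t *dstv, int i_dst,
+                                uint8_t *src, int i_src,
+                                int mvx, int mvy, int width, int height )
+{
+    int d8x = mvx & 7, d8y = mvy & 7;
+    int cA = (8-d8x)*(8-d8y), cB = d8x*(8-d8y);
+    int cC = (8-d8x)*d8y,     cD = d8x*d8y;
+    src += (mvy >> 3) * i_src + (mvx >> 3) * 2;   /* x offset doubled: U,V are interleaved */
+    for( int y = 0; y < height; y++, dstu += i_dst, dstv += i_dst, src += i_src )
+        for( int x = 0; x < width; x++ )
+        {
+            dstu[x] = ( cA*src[2*x]   + cB*src[2*x+2]
+                      + cC*src[2*x+i_src]   + cD*src[2*x+2+i_src]   + 32 ) >> 6;
+            dstv[x] = ( cA*src[2*x+1] + cB*src[2*x+3]
+                      + cC*src[2*x+1+i_src] + cD*src[2*x+3+i_src] + 32 ) >> 6;
+        }
+}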
+
#define LOWRES(cpu)\
void x264_frame_init_lowres_core_##cpu( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,\
int src_stride, int dst_stride, int width, int height );
#endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
-static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h)
+static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h )
{
if( w < 256 ) { // tiny resolutions don't want non-temporal hints; the exact threshold is unknown.
x264_plane_copy_c( dst, i_dst, src, i_src, w, h );
- } else if(i_src > 0) {
+ } else if( !(w&15) ) {
+ x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, w, h );
+ } else if( i_src > 0 ) {
// have to use plain memcpy on the last line (in memory order) to avoid overreading src
x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, (w+15)&~15, h-1 );
memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w );
x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+15)&~15, h-1 );
}
}
+
+#define PLANE_INTERLEAVE(cpu) \
+static void x264_plane_copy_interleave_##cpu( uint8_t *dst, int i_dst,\
+ uint8_t *srcu, int i_srcu,\
+ uint8_t *srcv, int i_srcv, int w, int h )\
+{\
+ if( !(w&15) ) {\
+ x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
+ } else if( w < 16 || (i_srcu ^ i_srcv) ) {\
+ x264_plane_copy_interleave_c( dst, i_dst, srcu, i_srcu, srcv, i_srcv, w, h );\
+ } else if( i_srcu > 0 ) {\
+ x264_plane_copy_interleave_core_##cpu( dst, i_dst, srcu, i_srcu, srcv, i_srcv, (w+15)&~15, h-1 );\
+ x264_plane_copy_interleave_c( dst+i_dst*(h-1), 0, srcu+i_srcu*(h-1), 0, srcv+i_srcv*(h-1), 0, w, 1 );\
+ } else {\
+ x264_plane_copy_interleave_c( dst, 0, srcu, 0, srcv, 0, w, 1 );\
+ x264_plane_copy_interleave_core_##cpu( dst+i_dst, i_dst, srcu+i_srcu, i_srcu, srcv+i_srcv, i_srcv, (w+15)&~15, h-1 );\
+ }\
+}
+
+PLANE_INTERLEAVE(mmxext)
+PLANE_INTERLEAVE(sse2)
#endif // !X264_HIGH_BIT_DEPTH
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_mmxext;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_mmxext;
+ pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext;
+ pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_mmx;
+ pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx;
+
pf->plane_copy = x264_plane_copy_mmxext;
+ pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext;
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
+
pf->hpel_filter = x264_hpel_filter_mmxext;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
if( cpu&X264_CPU_SSE_MISALIGN )
pf->hpel_filter = x264_hpel_filter_sse2_misalign;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_sse2;
- pf->mc_chroma = x264_mc_chroma_sse2;
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ pf->mc_chroma = x264_mc_chroma_sse2;
if( cpu&X264_CPU_SSE2_IS_FAST )
{
+ pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2; // FIXME sse2fast? sse2medium?
+ pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_sse2;
+ pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_sse2;
+ pf->plane_copy_interleave = x264_plane_copy_interleave_sse2;
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_sse2;
pf->mc_luma = mc_luma_sse2;
pf->get_ref = get_ref_sse2;
if( cpu&X264_CPU_CACHELINE_64 )
pf->get_ref = get_ref_cache64_sse2;
}
if( cpu&X264_CPU_SSE_MISALIGN )
+ {
pf->get_ref = get_ref_sse2_misalign;
+ pf->mc_chroma = x264_mc_chroma_sse2_misalign;
+ }
}
if( !(cpu&X264_CPU_SSSE3) )
pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_ssse3;
pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_ssse3;
+ pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_ssse3;
+ pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_ssse3;
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_ssse3;
+
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
- pf->mc_chroma = x264_mc_chroma_ssse3;
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ pf->mc_chroma = x264_mc_chroma_ssse3;
+
if( cpu&X264_CPU_CACHELINE_64 )
{
- pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
pf->mc_luma = mc_luma_cache64_ssse3;
pf->get_ref = get_ref_cache64_ssse3;
times 4 db 1, -1
mask_10: times 4 dw 0, -1
mask_1100: times 2 dd 0, -1
+deinterleave_shuf: db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
SECTION .text
SSD 4, 8, ssse3
%assign function_align 16
+;-----------------------------------------------------------------------------
+; uint64_t pixel_ssd_nv12_core( uint8_t *pixuv1, int stride1, uint8_t *pixuv2, int stride2, int width, int height )
+;-----------------------------------------------------------------------------
+%macro SSD_NV12 1-2 0
+cglobal pixel_ssd_nv12_core_%1, 6,7
+ shl r4d, 1
+ add r0, r4
+ add r2, r4
+ pxor m3, m3
+ pxor m4, m4
+ mova m5, [pw_00ff]
+.loopy:
+ mov r6, r4
+ neg r6
+.loopx:
+ mova m0, [r0+r6]
+ mova m1, [r2+r6]
+ psubusb m0, m1
+ psubusb m1, [r0+r6]
+ por m0, m1
+ mova m2, m0
+ pand m0, m5
+ psrlw m2, 8
+ pmaddwd m0, m0
+ pmaddwd m2, m2
+ paddd m3, m0
+ paddd m4, m2
+ add r6, mmsize
+ jl .loopx
+ add r0, r1
+ add r2, r3
+ dec r5d
+ jg .loopy
+ HADDD m3, m0
+ HADDD m4, m0
+ movd eax, m3
+ movd edx, m4
+%ifdef ARCH_X86_64
+ shl rdx, 32
+ add rax, rdx
+%endif
+ RET
+%endmacro ; SSD_NV12
+
+INIT_MMX
+SSD_NV12 mmxext
+INIT_XMM
+SSD_NV12 sse2
+
;=============================================================================
; variance
;=============================================================================
add r6, 4*%1
sub r0d, 4*%1
jg .loop
-%ifdef WIN64
- RESTORE_XMM rsp
-%endif
+ WIN64_RESTORE_XMM rsp
jmp ads_mvs
%endmacro
void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
+uint64_t x264_pixel_ssd_nv12_core_mmxext( uint8_t *pixuv1, int stride1,
+ uint8_t *pixuv2, int stride2, int width, int height );
+uint64_t x264_pixel_ssd_nv12_core_sse2( uint8_t *pixuv1, int stride1,
+ uint8_t *pixuv2, int stride2, int width, int height );
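+
+As the encoder's use of the return value and the checkasm test suggest, the packed convention here is: low 32 bits hold the SSD of the even (U) bytes, high 32 bits the SSD of the odd (V) bytes, with width counted in chroma pixels (U/V pairs). A minimal C sketch of that convention, with an illustrative name rather than the project's reference implementation:
+
+static uint64_t ssd_nv12_ref( uint8_t *pixuv1, int stride1,
+                              uint8_t *pixuv2, int stride2,
+                              int width, int height )
+{
+    uint32_t ssd_u = 0, ssd_v = 0;
+    for( int y = 0; y < height; y++, pixuv1 += stride1, pixuv2 += stride2 )
+        for( int x = 0; x < width; x++ )
+        {
+            int du = pixuv1[2*x]   - pixuv2[2*x];
+            int dv = pixuv1[2*x+1] - pixuv2[2*x+1];
+            ssd_u += du*du;
+            ssd_v += dv*dv;
+        }
+    return ssd_u + ((uint64_t)ssd_v << 32);
+}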
void x264_pixel_ssim_4x4x2_core_mmxext( const uint8_t *pix1, int stride1,
const uint8_t *pix2, int stride2, int sums[2][4] );
void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
ASSERT %2 >= %1
%assign regs_used %2
ASSERT regs_used <= 7
- %assign xmm_regs_used %3
- ASSERT xmm_regs_used <= 16
%if regs_used > 4
push r4
push r5
%assign stack_offset stack_offset+16
%endif
+ WIN64_SPILL_XMM %3
+ LOAD_IF_USED 4, %1
+ LOAD_IF_USED 5, %1
+ LOAD_IF_USED 6, %1
+ DEFINE_ARGS %4
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16
%if xmm_regs_used > 6
sub rsp, (xmm_regs_used-6)*16+16
%assign stack_offset stack_offset+(xmm_regs_used-6)*16+16
movdqa [rsp + (%%i-6)*16+8], xmm %+ %%i
%endrep
%endif
- LOAD_IF_USED 4, %1
- LOAD_IF_USED 5, %1
- LOAD_IF_USED 6, %1
- DEFINE_ARGS %4
%endmacro
-%macro RESTORE_XMM_INTERNAL 1
+%macro WIN64_RESTORE_XMM_INTERNAL 1
%if xmm_regs_used > 6
%assign %%i xmm_regs_used
%rep (xmm_regs_used-6)
%endif
%endmacro
-%macro RESTORE_XMM 1
- RESTORE_XMM_INTERNAL %1
+%macro WIN64_RESTORE_XMM 1
+ WIN64_RESTORE_XMM_INTERNAL %1
%assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
%assign xmm_regs_used 0
%endmacro
%macro RET 0
- RESTORE_XMM_INTERNAL rsp
+ WIN64_RESTORE_XMM_INTERNAL rsp
%if regs_used > 4
pop r5
pop r4
%endif ;======================================================================
+%ifndef WIN64
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
;=============================================================================
%define mova movq
%define movu movq
%define movh movd
- %define movnt movntq
+ %define movnta movntq
%assign %%i 0
%rep 8
CAT_XDEFINE m, %%i, mm %+ %%i
%define mova movdqa
%define movu movdqu
%define movh movq
- %define movnt movntdq
+ %define movnta movntdq
%assign %%i 0
%rep num_mmregs
CAT_XDEFINE m, %%i, xmm %+ %%i
(m)->p_fref[1] = &(src)[1][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[2] = &(src)[2][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->p_fref[3] = &(src)[3][(xoff)+(yoff)*(m)->i_stride[0]]; \
- (m)->p_fref[4] = &(src)[4][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
- (m)->p_fref[5] = &(src)[5][((xoff)>>1)+((yoff)>>1)*(m)->i_stride[1]]; \
+ (m)->p_fref[4] = &(src)[4][(xoff)+((yoff)>>1)*(m)->i_stride[1]]; \
(m)->integral = &h->mb.pic.p_integral[list][ref][(xoff)+(yoff)*(m)->i_stride[0]]; \
(m)->weight = weight_none; \
(m)->i_ref = ref;
ALIGNED_ARRAY_8( pixel, pix1,[16*8] );
pixel *pix2 = pix1+8;
const int i_stride = h->mb.pic.i_stride[1];
- const int or = 4*(i8x8&1) + 2*(i8x8&2)*i_stride;
+ const int or = 8*(i8x8&1) + 2*(i8x8&2)*i_stride;
const int oe = 4*(i8x8&1) + 2*(i8x8&2)*FENC_STRIDE;
const int i_ref = a->l0.me8x8[i8x8].i_ref;
const int mvy_offset = h->mb.b_interlaced & i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
x264_weight_t *weight = h->sh.weight[i_ref];
+ // FIXME weight can be done on 4x4 blocks even if mc is smaller
#define CHROMA4x4MC( width, height, me, x, y ) \
- h->mc.mc_chroma( &pix1[x+y*16], 16, &p_fref[4][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
+ h->mc.mc_chroma( &pix1[x+y*16], &pix2[x+y*16], 16, &p_fref[4][or+x*2+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
if( weight[1].weightfn ) \
weight[1].weightfn[width>>2]( &pix1[x+y*16], 16, &pix1[x+y*16], 16, &weight[1], height ); \
- h->mc.mc_chroma( &pix2[x+y*16], 16, &p_fref[5][or+x+y*i_stride], i_stride, (me).mv[0], (me).mv[1]+mvy_offset, width, height ); \
if( weight[2].weightfn ) \
weight[2].weightfn[width>>2]( &pix2[x+y*16], 16, &pix2[x+y*16], 16, &weight[2], height );
FILE *f = fopen( h->param.psz_dump_yuv, "r+b" );
if( !f )
return;
- int bytes_per_pixel = (BIT_DEPTH+7)/8;
/* Write the frame in display order */
- fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2 * bytes_per_pixel, SEEK_SET );
- for( int i = 0; i < h->fdec->i_plane; i++ )
- for( int y = 0; y < h->param.i_height >> !!i; y++ )
- for( int j = 0; j < h->param.i_width >> !!i; j++ )
- fwrite( &h->fdec->plane[i][y*h->fdec->i_stride[i]]+j, bytes_per_pixel, 1, f );
+ fseek( f, (uint64_t)h->fdec->i_frame * h->param.i_height * h->param.i_width * 3/2 * sizeof(pixel), SEEK_SET );
+ for( int y = 0; y < h->param.i_height; y++ )
+ fwrite( &h->fdec->plane[0][y*h->fdec->i_stride[0]], sizeof(pixel), h->param.i_width, f );
+ int cw = h->param.i_width>>1;
+ int ch = h->param.i_height>>1;
+ pixel *planeu = x264_malloc( cw*ch*2*sizeof(pixel) );
+ pixel *planev = planeu + cw*ch;
+ h->mc.plane_copy_deinterleave( planeu, cw, planev, cw, h->fdec->plane[1], h->fdec->i_stride[1], cw, ch );
+ fwrite( planeu, 1, cw*ch*2*sizeof(pixel), f );
+ x264_free( planeu );
fclose( f );
}
int i_csp = h->param.i_csp & X264_CSP_MASK;
if( i_csp <= X264_CSP_NONE || i_csp >= X264_CSP_MAX )
{
- x264_log( h, X264_LOG_ERROR, "invalid CSP\n" );
+ x264_log( h, X264_LOG_ERROR, "invalid CSP (only I420/YV12/NV12 supported)\n" );
return -1;
}
if( min_y < h->i_threadslice_start )
return;
- if( !b_end && b_inloop )
- for( int j = 0; j <= h->sh.b_mbaff; j++ )
- for( int i = 0; i < 3; i++ )
- {
- memcpy( h->intra_border_backup[j][i],
- h->fdec->plane[i] + ((mb_y*16 >> !!i) + j - 1 - h->sh.b_mbaff) * h->fdec->i_stride[i],
- (h->mb.i_mb_width*16 >> !!i) * sizeof(pixel) );
- }
-
if( b_deblock )
for( int y = min_y; y < max_y; y += (1 << h->sh.b_mbaff) )
x264_frame_deblock_row( h, y );
if( b_measure_quality )
{
if( h->param.analyse.b_psnr )
- for( int i = 0; i < 3; i++ )
- h->stat.frame.i_ssd[i] +=
- x264_pixel_ssd_wxh( &h->pixf,
- h->fdec->plane[i] + (min_y>>!!i) * h->fdec->i_stride[i], h->fdec->i_stride[i],
- h->fenc->plane[i] + (min_y>>!!i) * h->fenc->i_stride[i], h->fenc->i_stride[i],
- h->param.i_width >> !!i, (max_y-min_y) >> !!i );
+ {
+ uint64_t ssd_y = x264_pixel_ssd_wxh( &h->pixf,
+ h->fdec->plane[0] + min_y * h->fdec->i_stride[0], h->fdec->i_stride[0],
+ h->fenc->plane[0] + min_y * h->fenc->i_stride[0], h->fenc->i_stride[0],
+ h->param.i_width, max_y-min_y );
+ uint64_t ssd_uv = x264_pixel_ssd_nv12( &h->pixf,
+ h->fdec->plane[1] + (min_y>>1) * h->fdec->i_stride[1], h->fdec->i_stride[1],
+ h->fenc->plane[1] + (min_y>>1) * h->fenc->i_stride[1], h->fenc->i_stride[1],
+ h->param.i_width>>1, (max_y-min_y)>>1 );
+ h->stat.frame.i_ssd[0] += ssd_y;
+ h->stat.frame.i_ssd[1] += (uint32_t)ssd_uv;
+ h->stat.frame.i_ssd[2] += ssd_uv>>32;
+ }
if( h->param.analyse.b_ssim )
{
if( pic_out->i_pts < pic_out->i_dts )
x264_log( h, X264_LOG_WARNING, "invalid DTS: PTS is less than DTS\n" );
+ pic_out->img.i_csp = X264_CSP_NV12;
pic_out->img.i_plane = h->fdec->i_plane;
- for( int i = 0; i < 3; i++ )
+ for( int i = 0; i < 2; i++ )
{
pic_out->img.i_stride[i] = h->fdec->i_stride[i];
// FIXME This breaks the API when pixel != uint8_t.
/* Special case for mv0, which is (of course) very common in P-skip mode. */
if( mvx | mvy )
- {
- h->mc.mc_chroma( h->mb.pic.p_fdec[1], FDEC_STRIDE,
+ h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
mvx, mvy, 8, 8 );
- h->mc.mc_chroma( h->mb.pic.p_fdec[2], FDEC_STRIDE,
- h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2],
- mvx, mvy, 8, 8 );
- }
else
- {
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1], 8 );
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fref[0][0][5], h->mb.pic.i_stride[2], 8 );
- }
+ h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
if( h->sh.weight[0][1].weightfn )
h->sh.weight[0][1].weightfn[8>>2]( h->mb.pic.p_fdec[1], FDEC_STRIDE,
h->mb.pic.p_fdec[1], FDEC_STRIDE,
&h->sh.weight[0][1], 8 );
-
if( h->sh.weight[0][2].weightfn )
h->sh.weight[0][2].weightfn[8>>2]( h->mb.pic.p_fdec[2], FDEC_STRIDE,
h->mb.pic.p_fdec[2], FDEC_STRIDE,
* Intra prediction for predictive lossless mode.
*****************************************************************************/
-/* Note that these functions take a shortcut (mc.copy instead of actual pixel prediction) which assumes
- * that the edge pixels of the reconstructed frame are the same as that of the source frame. This means
- * they will only work correctly if the neighboring blocks are losslessly coded. In practice, this means
- * lossless mode cannot be mixed with lossy mode within a frame. */
-/* This can be resolved by explicitly copying the edge pixels after doing the mc.copy, but this doesn't
- * need to be done unless we decide to allow mixing lossless and lossy compression. */
-
void x264_predict_lossless_8x8_chroma( x264_t *h, int i_mode )
{
- int stride = h->fenc->i_stride[1] << h->mb.b_interlaced;
if( i_mode == I_PRED_CHROMA_V )
{
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-stride, stride, 8 );
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-stride, stride, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-FENC_STRIDE, FENC_STRIDE, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-FENC_STRIDE, FENC_STRIDE, 8 );
+ memcpy( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[1]-FDEC_STRIDE, 8*sizeof(pixel) );
+ memcpy( h->mb.pic.p_fdec[2], h->mb.pic.p_fdec[2]-FDEC_STRIDE, 8*sizeof(pixel) );
}
else if( i_mode == I_PRED_CHROMA_H )
{
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc_plane[1]-1, stride, 8 );
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc_plane[2]-1, stride, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1], FDEC_STRIDE, h->mb.pic.p_fenc[1]-1, FENC_STRIDE, 8 );
+ h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[2], FDEC_STRIDE, h->mb.pic.p_fenc[2]-1, FENC_STRIDE, 8 );
+ x264_copy_column8( h->mb.pic.p_fdec[1]+4*FDEC_STRIDE, h->mb.pic.p_fdec[1]+4*FDEC_STRIDE-1 );
+ x264_copy_column8( h->mb.pic.p_fdec[2]+4*FDEC_STRIDE, h->mb.pic.p_fdec[2]+4*FDEC_STRIDE-1 );
}
else
{
i_qp = h->mb.i_chroma_qp;
thresh = (x264_lambda2_tab[i_qp] + 32) >> 6;
+ if( !b_bidir )
+ {
+ /* Special case for mv0, which is (of course) very common in P-skip mode. */
+ if( M32( mvp ) )
+ h->mc.mc_chroma( h->mb.pic.p_fdec[1], h->mb.pic.p_fdec[2], FDEC_STRIDE,
+ h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1],
+ mvp[0], mvp[1], 8, 8 );
+ else
+ h->mc.load_deinterleave_8x8x2_fdec( h->mb.pic.p_fdec[1], h->mb.pic.p_fref[0][0][4], h->mb.pic.i_stride[1] );
+ }
+
for( int ch = 0; ch < 2; ch++ )
{
pixel *p_src = h->mb.pic.p_fenc[1+ch];
pixel *p_dst = h->mb.pic.p_fdec[1+ch];
- if( !b_bidir )
- {
- /* Special case for mv0, which is (of course) very common in P-skip mode. */
- if( M32( mvp ) )
- {
- h->mc.mc_chroma( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
- h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch],
- mvp[0], mvp[1], 8, 8 );
- }
- else
- h->mc.copy[PIXEL_8x8]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE, h->mb.pic.p_fref[0][0][4+ch], h->mb.pic.i_stride[1+ch], 8 );
-
- if( h->sh.weight[0][1+ch].weightfn )
- h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
- h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
- &h->sh.weight[0][1+ch], 8 );
- }
+ if( !b_bidir && h->sh.weight[0][1+ch].weightfn )
+ h->sh.weight[0][1+ch].weightfn[8>>2]( h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+ h->mb.pic.p_fdec[1+ch], FDEC_STRIDE,
+ &h->sh.weight[0][1+ch], 8 );
/* there is almost never a termination during chroma, but we can't avoid the check entirely */
/* so instead we check SSD and skip the actual check if the score is low enough. */
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
if( b_chroma_me && cost < bcost ) \
{ \
- h->mc.mc_chroma( pix, 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+ h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
if( m->weight[1].weightfn ) \
- m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \
+ m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \
&m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 8 ); \
+ cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
if( cost < bcost ) \
{ \
- h->mc.mc_chroma( pix, 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
if( m->weight[2].weightfn ) \
- m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \
+ m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \
&m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix, 8 ); \
+ cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
} \
} \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \
stride[list][i] = bw;\
src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none );\
if( rd )\
- {\
- h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
- h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
- }\
+ h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
}
#define SATD_THRESH 17/16
uint64_t cost; \
M32( cache_mv ) = pack16to32_mask(mx,my); \
if( m->i_pixel <= PIXEL_8x8 ) \
- { \
- h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
- h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
- } \
+ h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
} \
+ rce->misc_bits;
}
+static ALWAYS_INLINE uint32_t ac_energy_var( uint64_t sum_ssd, int shift, x264_frame_t *frame, int i )
+{
+ uint32_t sum = sum_ssd;
+ uint32_t ssd = sum_ssd >> 32;
+ frame->i_pixel_sum[i] += sum;
+ frame->i_pixel_ssd[i] += ssd;
+ return ssd - ((uint64_t)sum * sum >> shift);
+}
+
static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x264_frame_t *frame, int i )
{
int w = i ? 8 : 16;
- int shift = i ? 6 : 8;
int stride = frame->i_stride[i];
int offset = h->mb.b_interlaced
- ? w * (mb_x + (mb_y&~1) * stride) + (mb_y&1) * stride
- : w * (mb_x + mb_y * stride);
- int pix = i ? PIXEL_8x8 : PIXEL_16x16;
+ ? 16 * mb_x + w * (mb_y&~1) * stride + (mb_y&1) * stride
+ : 16 * mb_x + w * mb_y * stride;
stride <<= h->mb.b_interlaced;
- uint64_t res = h->pixf.var[pix]( frame->plane[i] + offset, stride );
- uint32_t sum = (uint32_t)res;
- uint32_t ssd = res >> 32;
- frame->i_pixel_sum[i] += sum;
- frame->i_pixel_ssd[i] += ssd;
- return ssd - ((uint64_t)sum * sum >> shift);
+ if( i )
+ {
+ ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*8] );
+ h->mc.load_deinterleave_8x8x2_fenc( pix, frame->plane[1] + offset, stride );
+ return ac_energy_var( h->pixf.var[PIXEL_8x8]( pix, FENC_STRIDE ), 6, frame, i )
+ + ac_energy_var( h->pixf.var[PIXEL_8x8]( pix+FENC_STRIDE/2, FENC_STRIDE ), 6, frame, i );
+ }
+ else
+ return ac_energy_var( h->pixf.var[PIXEL_16x16]( frame->plane[0] + offset, stride ), 8, frame, i );
}
// Find the total AC energy of the block in all planes.
* sure no reordering goes on. */
uint32_t var = ac_energy_plane( h, mb_x, mb_y, frame, 0 );
var += ac_energy_plane( h, mb_x, mb_y, frame, 1 );
- var += ac_energy_plane( h, mb_x, mb_y, frame, 2 );
x264_emms();
return var;
}
case X264_CSP_I420: return PIX_FMT_YUV420P;
case X264_CSP_I422: return PIX_FMT_YUV422P;
case X264_CSP_I444: return PIX_FMT_YUV444P;
+ case X264_CSP_NV12: return PIX_FMT_NV12;
case X264_CSP_YV12: return PIX_FMT_YUV420P; /* specially handled via swapping chroma */
case X264_CSP_BGR: return PIX_FMT_BGR24;
case X264_CSP_BGRA: return PIX_FMT_BGRA;
case PIX_FMT_ABGR:
case PIX_FMT_BGRA:
return X264_CSP_BGRA;
+ case PIX_FMT_NV12:
+ case PIX_FMT_NV21:
+ return X264_CSP_NV12;
default:
return X264_CSP_I420;
}
[X264_CSP_I422] = { "i422", 3, { 1, .5, .5 }, { 1, 1, 1 }, 2, 1 },
[X264_CSP_I444] = { "i444", 3, { 1, 1, 1 }, { 1, 1, 1 }, 1, 1 },
[X264_CSP_YV12] = { "yv12", 3, { 1, .5, .5 }, { 1, .5, .5 }, 2, 2 },
+ [X264_CSP_NV12] = { "nv12", 2, { 1, 1 }, { 1, .5 }, 2, 2 },
[X264_CSP_BGR] = { "bgr", 1, { 3 }, { 1 }, 1, 1 },
[X264_CSP_BGRA] = { "bgra", 1, { 4 }, { 1 }, 1, 1 }
};
TEST_INTRA_MBCMP( intra_sad_x3_4x4 , predict_4x4 , sad [PIXEL_4x4] , 0 );
report( "intra sad_x3 :" );
+ ok = 1; used_asm = 0;
+ if( pixel_asm.ssd_nv12_core != pixel_ref.ssd_nv12_core )
+ {
+ used_asm = 1;
+ set_func_name( "ssd_nv12" );
+ uint64_t res_c = pixel_c.ssd_nv12_core( pbuf1, 368, pbuf2, 368, 360, 8 );
+ uint64_t res_a = pixel_asm.ssd_nv12_core( pbuf1, 368, pbuf2, 368, 360, 8 );
+ if( res_c != res_a )
+ {
+ ok = 0;
+ fprintf( stderr, "ssd_nv12: %u,%u != %u,%u\n",
+ (uint32_t)res_c, (uint32_t)(res_c>>32),
+ (uint32_t)res_a, (uint32_t)(res_a>>32) );
+ }
+ call_c( pixel_c.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8 );
+ call_a( pixel_asm.ssd_nv12_core, pbuf1, 368, pbuf2, 368, 360, 8 );
+ }
+ report( "ssd_nv12 :" );
+
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
pixel_asm.ssim_end4 != pixel_ref.ssim_end4 )
{
used_asm = 1; \
for( int i = 0; i < 1024; i++ ) \
pbuf3[i] = pbuf4[i] = 0xCD; \
- call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
- call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
+ call_c( mc_c.mc_chroma, dst1, dst1+8, 16, src, 64, dx, dy, w, h ); \
+ call_a( mc_a.mc_chroma, dst2, dst2+8, 16, src, 64, dx, dy, w, h ); \
/* mc_chroma width=2 may write garbage to the right of dst. ignore that. */ \
for( int j = 0; j < h; j++ ) \
- for( int i = w; i < 4; i++ ) \
+ for( int i = w; i < 8; i++ ) \
+ { \
+ dst2[i+j*16+8] = dst1[i+j*16+8]; \
dst2[i+j*16] = dst1[i+j*16]; \
+ } \
if( memcmp( pbuf3, pbuf4, 1024 * sizeof(pixel) ) ) \
{ \
fprintf( stderr, "mc_chroma[mv(%d,%d) %2dx%-2d] [FAILED]\n", dx, dy, w, h ); \
}
report( "mc offsetsub :" );
+ ok = 1; used_asm = 0;
+ if( mc_a.store_interleave_8x8x2 != mc_ref.store_interleave_8x8x2 )
+ {
+ set_func_name( "store_interleave_8x8x2" );
+ used_asm = 1;
+ memset( pbuf3, 0, 64*8 );
+ memset( pbuf4, 0, 64*8 );
+ call_c( mc_c.store_interleave_8x8x2, pbuf3, 64, pbuf1, pbuf1+16 );
+ call_a( mc_a.store_interleave_8x8x2, pbuf4, 64, pbuf1, pbuf1+16 );
+ if( memcmp( pbuf3, pbuf4, 64*8 ) )
+ ok = 0;
+ }
+ if( mc_a.load_deinterleave_8x8x2_fenc != mc_ref.load_deinterleave_8x8x2_fenc )
+ {
+ set_func_name( "load_deinterleave_8x8x2_fenc" );
+ used_asm = 1;
+ call_c( mc_c.load_deinterleave_8x8x2_fenc, pbuf3, pbuf1, 64 );
+ call_a( mc_a.load_deinterleave_8x8x2_fenc, pbuf4, pbuf1, 64 );
+ if( memcmp( pbuf3, pbuf4, FENC_STRIDE*8 ) )
+ ok = 0;
+ }
+ if( mc_a.load_deinterleave_8x8x2_fdec != mc_ref.load_deinterleave_8x8x2_fdec )
+ {
+ set_func_name( "load_deinterleave_8x8x2_fdec" );
+ used_asm = 1;
+ call_c( mc_c.load_deinterleave_8x8x2_fdec, pbuf3, pbuf1, 64 );
+ call_a( mc_a.load_deinterleave_8x8x2_fdec, pbuf4, pbuf1, 64 );
+ if( memcmp( pbuf3, pbuf4, FDEC_STRIDE*8 ) )
+ ok = 0;
+ }
+ report( "store_interleave :" );
+
+ struct plane_spec {
+ int w, h, src_stride;
+ } plane_specs[] = { {2,2,2}, {8,6,8}, {20,31,24}, {32,8,40}, {256,10,272}, {504,7,505}, {528,6,528}, {256,10,-256}, {263,9,-264}, {1904,1,0} };
+ ok = 1; used_asm = 0;
+ if( mc_a.plane_copy != mc_ref.plane_copy )
+ {
+ set_func_name( "plane_copy" );
+ used_asm = 1;
+ for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
+ {
+ int w = plane_specs[i].w;
+ int h = plane_specs[i].h;
+ int src_stride = plane_specs[i].src_stride;
+ int dst_stride = (w + 127) & ~63;
+ assert( dst_stride * h <= 0x1000 );
+ uint8_t *src1 = buf1 + X264_MAX(0, -src_stride) * (h-1);
+ memset( pbuf3, 0, 0x1000*sizeof(pixel) );
+ memset( pbuf4, 0, 0x1000*sizeof(pixel) );
+ call_c( mc_c.plane_copy, pbuf3, dst_stride, src1, src_stride, w, h );
+ call_a( mc_a.plane_copy, pbuf4, dst_stride, src1, src_stride, w, h );
+ for( int y = 0; y < h; y++ )
+ if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w*sizeof(pixel) ) )
+ {
+ ok = 0;
+ fprintf( stderr, "plane_copy FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+ break;
+ }
+ }
+ }
+
+ if( mc_a.plane_copy_interleave != mc_ref.plane_copy_interleave )
+ {
+ set_func_name( "plane_copy_interleave" );
+ used_asm = 1;
+ for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
+ {
+ int w = (plane_specs[i].w + 1) >> 1;
+ int h = plane_specs[i].h;
+ int src_stride = (plane_specs[i].src_stride + 1) >> 1;
+ int dst_stride = (2*w + 127) & ~63;
+ assert( dst_stride * h <= 0x1000 );
+ uint8_t *src1 = buf1 + X264_MAX(0, -src_stride) * (h-1);
+ memset( pbuf3, 0, 0x1000*sizeof(pixel) );
+ memset( pbuf4, 0, 0x1000*sizeof(pixel) );
+ call_c( mc_c.plane_copy_interleave, pbuf3, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
+ call_a( mc_a.plane_copy_interleave, pbuf4, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );
+ for( int y = 0; y < h; y++ )
+ if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, 2*w*sizeof(pixel) ) )
+ {
+ ok = 0;
+ fprintf( stderr, "plane_copy_interleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+ break;
+ }
+ }
+ }
+
+ if( mc_a.plane_copy_deinterleave != mc_ref.plane_copy_deinterleave )
+ {
+ set_func_name( "plane_copy_deinterleave" );
+ used_asm = 1;
+ for( int i = 0; i < sizeof(plane_specs)/sizeof(*plane_specs); i++ )
+ {
+ int w = (plane_specs[i].w + 1) >> 1;
+ int h = plane_specs[i].h;
+ int dst_stride = w;
+ int src_stride = (2*w + 127) & ~63;
+ int offv = (dst_stride*h + 31) & ~15;
+ memset( pbuf3, 0, 0x1000 );
+ memset( pbuf4, 0, 0x1000 );
+ call_c( mc_c.plane_copy_deinterleave, pbuf3, dst_stride, pbuf3+offv, dst_stride, pbuf1, src_stride, w, h );
+ call_a( mc_a.plane_copy_deinterleave, pbuf4, dst_stride, pbuf4+offv, dst_stride, pbuf1, src_stride, w, h );
+ for( int y = 0; y < h; y++ )
+ if( memcmp( pbuf3+y*dst_stride, pbuf4+y*dst_stride, w ) ||
+ memcmp( pbuf3+y*dst_stride+offv, pbuf4+y*dst_stride+offv, w ) )
+ {
+ ok = 0;
+ fprintf( stderr, "plane_copy_deinterleave FAILED: w=%d h=%d stride=%d\n", w, h, src_stride );
+ break;
+ }
+ }
+ }
+ report( "plane_copy :" );
+
if( mc_a.hpel_filter != mc_ref.hpel_filter )
{
pixel *srchpel = pbuf1+8+2*64;
#include <stdarg.h>
-#define X264_BUILD 103
+#define X264_BUILD 104
/* x264_t:
* opaque handler for encoder */
static const char * const x264_nal_hrd_names[] = { "none", "vbr", "cbr", 0 };
static const char * const x264_open_gop_names[] = { "none", "normal", "bluray", 0 };
-/* Colorspace type
- * legacy only; nothing other than I420 is really supported. */
+/* Colorspace type */
#define X264_CSP_MASK 0x00ff /* */
#define X264_CSP_NONE 0x0000 /* Invalid mode */
#define X264_CSP_I420 0x0001 /* yuv 4:2:0 planar */
#define X264_CSP_YV12 0x0002 /* yvu 4:2:0 planar */
-#define X264_CSP_MAX 0x0003 /* end of list */
+#define X264_CSP_NV12 0x0003 /* yuv 4:2:0, with one y plane and one packed u+v */
+#define X264_CSP_MAX 0x0004 /* end of list */
#define X264_CSP_VFLIP 0x1000 /* */
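
A minimal usage sketch for the new X264_CSP_NV12 input path, assuming application-owned buffers (function and buffer names are illustrative): the luma plane is width x height, and the second plane holds interleaved U+V at width x height/2, so both strides equal width.

static void setup_nv12_picture( x264_picture_t *pic, uint8_t *luma_buf,
                                uint8_t *chroma_buf, int width )
{
    x264_picture_init( pic );
    pic->img.i_csp       = X264_CSP_NV12;
    pic->img.i_plane     = 2;          /* Y plane + packed U+V plane */
    pic->img.i_stride[0] = width;
    pic->img.i_stride[1] = width;
    pic->img.plane[0]    = luma_buf;
    pic->img.plane[1]    = chroma_buf;
}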
/* Slice type */
/* x264_picture_alloc:
* alloc data for a picture. You must call x264_picture_clean on it.
- * returns 0 on success, or -1 on malloc failure. */
+ * returns 0 on success, or -1 on malloc failure or invalid colorspace. */
int x264_picture_alloc( x264_picture_t *pic, int i_csp, int i_width, int i_height );
/* x264_picture_clean: