/*****************************************************************************
* me.c: motion estimation
*****************************************************************************
- * Copyright (C) 2003-2010 x264 project
+ * Copyright (C) 2003-2012 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
{0,0,2,2},
{0,0,4,10},
{0,0,4,10},
+ {0,0,4,10},
{0,0,4,10}};
/* (x-1)%6 */
#if 0
/* plain old exhaustive search */
for( int my = min_y; my <= max_y; my++ )
- for( int mx = min_x; mx <= max_x; mx++ )
+ for( int mx = min_x; mx < min_x + width; mx++ )
COST_MV( mx, my );
#else
/* successive elimination by comparing DC before a full SAD,
* because sum(abs(diff)) >= abs(diff(sum)). */
uint16_t *sums_base = m->integral;
- /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
- * this is not a problem because it is not used for any SSE instructions. */
- ALIGNED_16( static pixel zero[8*FENC_STRIDE] );
+ ALIGNED_16( static pixel zero[8*FENC_STRIDE] ) = {0};
ALIGNED_ARRAY_16( int, enc_dc,[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
int16_t *xs = h->scratch_buffer;
int xn;
- uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+ uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
if( b_refine_qpel || (dir^1) != odir ) \
{ \
int stride = 16; \
- pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
+ pixel *src = h->mc.get_ref( pix, &stride, &m->p_fref[0], m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
if( b_chroma_me && cost < bcost ) \
{ \
- h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
- if( m->weight[1].weightfn ) \
- m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 16, pix, 16, \
- &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
- if( cost < bcost ) \
+ if( CHROMA444 ) \
{ \
- if( m->weight[2].weightfn ) \
- m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix+8, 16, pix+8, 16, \
- &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
+ stride = 16; \
+ src = h->mc.get_ref( pix, &stride, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
+ cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[1], FENC_STRIDE, src, stride ); \
+ if( cost < bcost ) \
+ { \
+ stride = 16; \
+ src = h->mc.get_ref( pix, &stride, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
+ cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[2], FENC_STRIDE, src, stride ); \
+ } \
+ } \
+ else \
+ { \
+ h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], \
+ mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
+ if( m->weight[1].weightfn ) \
+ m->weight[1].weightfn[bw>>3]( pix, 16, pix, 16, &m->weight[1], bh>>chroma_v_shift ); \
+ cost += h->pixf.mbcmp[chromapix]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
+ if( cost < bcost ) \
+ { \
+ if( m->weight[2].weightfn ) \
+ m->weight[2].weightfn[bw>>3]( pix+8, 16, pix+8, 16, &m->weight[2], bh>>chroma_v_shift ); \
+ cost += h->pixf.mbcmp[chromapix]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
+ } \
} \
} \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const int i_pixel = m->i_pixel;
- const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
- const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444);
+ int chromapix = h->luma2chroma_pixel[i_pixel];
+ int chroma_v_shift = CHROMA_V_SHIFT;
+ int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
int i = 4 + 3*dx + dy;\
int mvx = bm##list##x+dx;\
int mvy = bm##list##y+dy;\
- stride[list][i] = bw;\
- src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none );\
+ stride[0][list][i] = bw;\
+ src[0][list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[0][list][i], &m->p_fref[0],\
+ m->i_stride[0], mvx, mvy, bw, bh, x264_weight_none );\
if( rd )\
- h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+ {\
+ if( CHROMA444 )\
+ {\
+ stride[1][list][i] = bw;\
+ src[1][list][i] = h->mc.get_ref( pixu_buf[list][i], &stride[1][list][i], &m->p_fref[4],\
+ m->i_stride[1], mvx, mvy, bw, bh, x264_weight_none );\
+ stride[2][list][i] = bw;\
+ src[2][list][i] = h->mc.get_ref( pixv_buf[list][i], &stride[2][list][i], &m->p_fref[8],\
+ m->i_stride[2], mvx, mvy, bw, bh, x264_weight_none );\
+ }\
+ else\
+ h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\
+ mvx, 2*(mvy+mv##list##y_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift );\
+ }\
}
-#define SATD_THRESH 17/16
+/* Bound that a SATD cost is compared against before the RD cost is evaluated:
+ * cost plus one sixteenth of cost (cost>>4).
+ * The argument is parenthesized so compound expressions expand correctly. */
+#define SATD_THRESH(cost) ((cost)+((cost)>>4))
/* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this
* other than making its iteration count not a compile-time constant. */
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
ALIGNED_ARRAY_16( pixel, pixy_buf,[2],[9][16*16] );
- ALIGNED_ARRAY_8( pixel, pixu_buf,[2],[9][8*8] );
- ALIGNED_ARRAY_8( pixel, pixv_buf,[2],[9][8*8] );
- pixel *src[2][9];
+ ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] );
+ ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] );
+ pixel *src[3][2][9];
+ int chromapix = h->luma2chroma_pixel[i_pixel];
+ int chroma_v_shift = CHROMA_V_SHIFT;
+ int chroma_x = (8 >> CHROMA_H_SHIFT) * x;
+ int chroma_y = (8 >> chroma_v_shift) * y;
pixel *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
- pixel *pixu = &h->mb.pic.p_fdec[1][4*x + 4*y*FDEC_STRIDE];
- pixel *pixv = &h->mb.pic.p_fdec[2][4*x + 4*y*FDEC_STRIDE];
+ pixel *pixu = &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE];
+ pixel *pixv = &h->mb.pic.p_fdec[2][chroma_x + chroma_y*FDEC_STRIDE];
int ref0 = h->mb.cache.ref[0][s8];
int ref1 = h->mb.cache.ref[1][s8];
- const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- int stride[2][9];
+ const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int stride[3][2][9];
int bm0x = m0->mv[0];
int bm0y = m0->mv[1];
int bm1x = m1->mv[0];
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
- static const int8_t dia4d[33][4] =
+ ALIGNED_4( static const int8_t dia4d[33][4] ) =
{
{0,0,0,0},
{0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
int i0 = 4 + 3*dia4d[j][0] + dia4d[j][1];
int i1 = 4 + 3*dia4d[j][2] + dia4d[j][3];
visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));
- h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src[0][i0], stride[0][i0], src[1][i1], stride[1][i1], i_weight );
+ h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src[0][0][i0], stride[0][0][i0], src[0][1][i1], stride[0][1][i1], i_weight );
int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE )
+ p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y];
if( rd )
{
- if( cost < bcost * SATD_THRESH )
+ if( cost < SATD_THRESH(bcost) )
{
bcost = X264_MIN( cost, bcost );
M32( cache0_mv ) = pack16to32_mask(m0x,m0y);
M32( cache1_mv ) = pack16to32_mask(m1x,m1y);
- h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
- h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+ if( CHROMA444 )
+ {
+ h->mc.avg[i_pixel]( pixu, FDEC_STRIDE, src[1][0][i0], stride[1][0][i0], src[1][1][i1], stride[1][1][i1], i_weight );
+ h->mc.avg[i_pixel]( pixv, FDEC_STRIDE, src[2][0][i0], stride[2][0][i0], src[2][1][i1], stride[2][1][i1], i_weight );
+ }
+ else
+ {
+ h->mc.avg[chromapix]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
+ h->mc.avg[chromapix]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+ }
uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
COPY2_IF_LT( bcostrd, costrd, bestj, j );
}
#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \
{ \
- if( satd <= bsatd * SATD_THRESH ) \
+ if( satd <= SATD_THRESH(bsatd) ) \
{ \
uint64_t cost; \
M32( cache_mv ) = pack16to32_mask(mx,my); \
- if( m->i_pixel <= PIXEL_8x8 ) \
- h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 ); \
+ if( CHROMA444 ) \
+ { \
+ h->mc.mc_luma( pixu, FDEC_STRIDE, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
+ h->mc.mc_luma( pixv, FDEC_STRIDE, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
+ } \
+ else if( m->i_pixel <= PIXEL_8x8 ) \
+ { \
+ h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], \
+ mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
+ if( m->weight[1].weightfn ) \
+ m->weight[1].weightfn[bw>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, &m->weight[1], bh>>chroma_v_shift ); \
+ if( m->weight[2].weightfn ) \
+ m->weight[2].weightfn[bw>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, &m->weight[2], bh>>chroma_v_shift ); \
+ } \
cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
} \
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
const int i_pixel = m->i_pixel;
- const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int chroma_v_shift = CHROMA_V_SHIFT;
+ int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
uint64_t bcost = COST_MAX64;
int bmx = m->mv[0];
int bmy = m->mv[1];
int omx, omy, pmx, pmy;
- unsigned bsatd;
- int satd;
+ int satd, bsatd;
int dir = -2;
int i8 = i4>>2;
uint16_t amvd;
pixel *pix = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
- pixel *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
- pixel *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+ pixel *pixu, *pixv;
+ if( CHROMA444 )
+ {
+ pixu = &h->mb.pic.p_fdec[1][block_idx_xy_fdec[i4]];
+ pixv = &h->mb.pic.p_fdec[2][block_idx_xy_fdec[i4]];
+ }
+ else
+ {
+ pixu = &h->mb.pic.p_fdec[1][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
+ pixv = &h->mb.pic.p_fdec[2][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
+ }
h->mb.b_skip_mc = 1;
m->mv[0] = bmx;
m->mv[1] = bmy;
x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
- amvd = pack8to16( X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33) );
+ amvd = pack8to16( X264_MIN(abs(bmx - m->mvp[0]),66), X264_MIN(abs(bmy - m->mvp[1]),66) );
x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
h->mb.b_skip_mc = 0;
}