/*****************************************************************************
- * me.c: h264 encoder library (Motion Estimation)
+ * me.c: motion estimation
*****************************************************************************
- * Copyright (C) 2003-2008 x264 project
+ * Copyright (C) 2003-2012 x264 project
*
* Authors: Loren Merritt <lorenm@u.washington.edu>
* Laurent Aimar <fenrir@via.ecp.fr>
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
*****************************************************************************/
#include "common/common.h"
{0,0,2,2},
{0,0,4,10},
{0,0,4,10},
+ {0,0,4,10},
{0,0,4,10}};
/* (x-1)%6 */
static const uint8_t mod6m1[8] = {5,0,1,2,3,4,5,0};
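/* e.g. mod6m1[0] == 5 and mod6m1[7] == 0: after dir is nudged by +/-1 (giving -1..6),
 * mod6m1[dir+1] wraps it back into 0..5 without a real modulo. */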
/* radius 2 hexagon. repeated entries are to avoid having to compute mod6 every time. */
-static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
-static const int square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};
+static const int8_t hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
+static const int8_t square1[9][2] = {{0,0}, {0,-1}, {0,1}, {-1,0}, {1,0}, {-1,-1}, {-1,1}, {1,-1}, {1,1}};
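+/* entries 6-7 of hex2 duplicate entries 0-1 so hex2[dir+0..dir+2] never indexes out of bounds for dir in 0..5. */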
static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
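/* get_ref() either returns a pointer straight into a cached half-pel plane (updating stride2)
 * or, when interpolation or weighting is needed, writes into pix and returns that. */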
#define COST_MV_HPEL( mx, my ) \
{ \
int stride2 = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
+ pixel *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \
#define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\
{\
- uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
+ pixel *pix_base = p_fref_w + bmx + bmy*stride;\
h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
#define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\
{\
- uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\
+ pixel *pix_base = p_fref_w + bmx + bmy*stride;\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
#define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\
{\
- uint8_t *pix_base = p_fref_w + omx + omy*stride;\
+ pixel *pix_base = p_fref_w + omx + omy*stride;\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
pix_base + (m0x) + (m0y)*stride,\
pix_base + (m1x) + (m1y)*stride,\
int bmx, bmy, bcost;
int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
int omx, omy, pmx, pmy;
- uint8_t *p_fenc = m->p_fenc[0];
- uint8_t *p_fref_w = m->p_fref_w;
- ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
+ pixel *p_fenc = m->p_fenc[0];
+ pixel *p_fref_w = m->p_fref_w;
+ ALIGNED_ARRAY_16( pixel, pix,[16*16] );
int costs[16];
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
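/* p_cost_mv is indexed by mv-minus-predictor; offsetting the pointers by -mvp lets them
 * be indexed with absolute mv coordinates below. */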
+ uint32_t pmv;
bmx = x264_clip3( m->mvp[0], mv_x_min_qpel, mv_x_max_qpel );
bmy = x264_clip3( m->mvp[1], mv_y_min_qpel, mv_y_max_qpel );
pmx = ( bmx + 2 ) >> 2;
/* try extra predictors if provided */
if( h->mb.i_subpel_refine >= 3 )
{
- uint32_t bmv = pack16to32_mask(bmx,bmy);
+ pmv = pack16to32_mask(bmx,bmy);
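+        /* packing both components into one dword lets each extra predictor be rejected as a duplicate with a single 32-bit compare. */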
if( i_mvc )
COST_MV_HPEL( bmx, bmy );
for( int i = 0; i < i_mvc; i++ )
{
- if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) )
+ if( M32( mvc[i] ) && (pmv != M32( mvc[i] )) )
{
int mx = x264_clip3( mvc[i][0], mv_x_min_qpel, mv_x_max_qpel );
int my = x264_clip3( mvc[i][1], mv_y_min_qpel, mv_y_max_qpel );
* sensible to omit the cost of the MV from the rounded MVP to avoid unfairly
* biasing against use of the predicted motion vector. */
bcost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[bmy*stride+bmx], stride );
- for( int i = 0; i < i_mvc; i++ )
+ pmv = pack16to32_mask( bmx, bmy );
+ if( i_mvc > 0 )
{
- int mx = (mvc[i][0] + 2) >> 2;
- int my = (mvc[i][1] + 2) >> 2;
- if( (mx | my) && ((mx-bmx) | (my-bmy)) )
+ ALIGNED_ARRAY_8( int16_t, mvc_fpel,[16],[2] );
+ x264_predictor_roundclip( mvc_fpel, mvc, i_mvc, mv_x_min, mv_x_max, mv_y_min, mv_y_max );
+ bcost <<= 4;
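+        /* carry a 1-based candidate index in the low 4 bits so a single COPY1_IF_LT tracks best cost and winner together; 0 means the MVP stayed best. */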
+ for( int i = 1; i <= i_mvc; i++ )
+ {
+ if( M32( mvc_fpel[i-1] ) && (pmv != M32( mvc[i-1] )) )
+ {
+ int mx = mvc_fpel[i-1][0];
+ int my = mvc_fpel[i-1][1];
+ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, &p_fref_w[my*stride+mx], stride ) + BITS_MVD( mx, my );
+ cost = (cost << 4) + i;
+ COPY1_IF_LT( bcost, cost );
+ }
+ }
+ if( bcost&15 )
{
- mx = x264_clip3( mx, mv_x_min, mv_x_max );
- my = x264_clip3( my, mv_y_min, mv_y_max );
- COST_MV( mx, my );
+ bmx = mvc_fpel[(bcost&15)-1][0];
+ bmy = mvc_fpel[(bcost&15)-1][1];
}
+ bcost >>= 4;
}
}
- COST_MV( 0, 0 );
+
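+    /* pmv == 0 means the rounded MVP is already (0,0); it was measured above (deliberately without the mv-bits penalty), so don't test it again. */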
+ if( pmv )
+ COST_MV( 0, 0 );
switch( h->mb.i_me_method )
{
{
/* diamond search, radius 1 */
bcost <<= 4;
- int i = 0;
+ int i = i_me_range;
do
{
COST_MV_X4_DIR( 0,-1, 0,1, -1,0, 1,0, costs );
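/* the low 4 bits of bcost hold the winning step as two 2-bit two's-complement fields
 * (negated dx in bits 3:2, negated dy in bits 1:0); shifting left then arithmetic
 * shifting right extracts each field with sign. */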
bmx -= (bcost<<28)>>30;
bmy -= (bcost<<30)>>30;
bcost &= ~15;
- if( !CHECK_MVRANGE(bmx, bmy) )
- break;
- } while( ++i < i_me_range );
+ } while( --i && CHECK_MVRANGE(bmx, bmy) );
bcost >>= 4;
break;
}
case X264_ME_HEX:
{
me_hex2:
/* hexagon search, radius 2 */
#if 0
bmy += hex2[dir+1][1];
/* half hexagon, not overlapping the previous iteration */
- for( int i = 1; i < i_me_range>>1 && CHECK_MVRANGE(bmx, bmy); i++ )
+ for( int i = (i_me_range>>1) - 1; i > 0 && CHECK_MVRANGE(bmx, bmy); i-- )
{
COST_MV_X3_DIR( hex2[dir+0][0], hex2[dir+0][1],
hex2[dir+1][0], hex2[dir+1][1],
/* Uneven-cross Multi-Hexagon-grid Search
* as in JM, except with different early termination */
- static const int x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };
+ static const uint8_t x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };
int ucost1, ucost2;
int cross_start = 1;
/* range multipliers based on casual inspection of some statistics of
* average distance between current predictor and final mv found by ESA.
* these have not been tuned much by actual encoding. */
- static const int range_mul[4][4] =
+ static const uint8_t range_mul[4][4] =
{
{ 3, 3, 4, 4 },
{ 3, 4, 4, 4 },
: mvd < 20*denom ? 1
: mvd < 40*denom ? 2 : 3;
- i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] / 4;
+ i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] >> 2;
}
/* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
int i = 1;
do
{
- static const int hex4[16][2] = {
+ static const int8_t hex4[16][2] = {
{ 0,-4}, { 0, 4}, {-2,-3}, { 2,-3},
{-4,-2}, { 4,-2}, {-4,-1}, { 4,-1},
{-4, 0}, { 4, 0}, {-4, 1}, { 4, 1},
else
{
int dir = 0;
- uint8_t *pix_base = p_fref_w + omx + (omy-4*i)*stride;
+ pixel *pix_base = p_fref_w + omx + (omy-4*i)*stride;
int dy = i*stride;
#define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\
h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\
#if 0
/* plain old exhaustive search */
for( int my = min_y; my <= max_y; my++ )
- for( int mx = min_x; mx <= max_x; mx++ )
+ for( int mx = min_x; mx < min_x + width; mx++ )
COST_MV( mx, my );
#else
/* successive elimination by comparing DC before a full SAD,
* because sum(abs(diff)) >= abs(diff(sum)). */
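/* e.g. for one 8x8 subblock, |sum(enc) - sum(ref)| <= SAD(enc,ref) by the triangle
 * inequality, so DC sums taken from the integral plane give a cheap lower bound that
 * rejects most candidates before any full SAD is computed. */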
uint16_t *sums_base = m->integral;
- /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
- * this is not a problem because it is not used for any SSE instructions. */
- ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
+ ALIGNED_16( static pixel zero[8*FENC_STRIDE] ) = {0};
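+        /* a SAD against this all-zero block is just the block's pixel sum, i.e. its DC, computed below for the four enc_dc subblocks. */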
ALIGNED_ARRAY_16( int, enc_dc,[4] );
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
int16_t *xs = h->scratch_buffer;
int xn;
- uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+ uint16_t *cost_fpel_mvx = h->cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
if( h->mb.i_me_method == X264_ME_TESA )
{
// ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
- mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
+ mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15) + 4);
int nmvsad = 0, limit;
int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
continue;
bsad -= ycost;
xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
- cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
+ cost_fpel_mvx+min_x, xs, width, bsad * 17 >> 4 );
for( i = 0; i < xn-2; i += 3 )
{
- uint8_t *ref = p_fref_w+min_x+my*stride;
+ pixel *ref = p_fref_w+min_x+my*stride;
int sads[3];
h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
for( int j = 0; j < 3; j++ )
bsad += ycost;
}
- limit = i_me_range / 2;
+ limit = i_me_range >> 1;
sad_thresh = bsad*sad_thresh>>3;
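/* progressively tighten the SAD threshold until at most ~2*limit candidates survive for the SATD stage. */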
while( nmvsad > limit*2 && sad_thresh > bsad )
{
if( WORD_SIZE == 8 && sizeof(mvsad_t) == 8 )
{
uint64_t mvsad = M64( &mvsads[i] ) = M64( &mvsads[j] );
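/* one 64-bit move copies a whole {sad, mv} entry; the sad field lands in the low half
 * on little-endian hosts, hence the shift on big-endian. */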
-#ifdef WORDS_BIGENDIAN
+#if WORDS_BIGENDIAN
mvsad >>= 32;
#endif
sad = mvsad;
#define COST_MV_SAD( mx, my ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
+ pixel *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
if( b_refine_qpel || (dir^1) != odir ) \
{ \
int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
+ pixel *src = h->mc.get_ref( pix, &stride, &m->p_fref[0], m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
+ p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
if( b_chroma_me && cost < bcost ) \
{ \
- h->mc.mc_chroma( pix, 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \
- if( m->weight[1].weightfn ) \
- m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \
- &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix, 8 ); \
- if( cost < bcost ) \
+ if( CHROMA444 ) \
{ \
- h->mc.mc_chroma( pix, 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \
- if( m->weight[2].weightfn ) \
- m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix, 8, pix, 8, \
- &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \
- cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix, 8 ); \
+ stride = 16; \
+ src = h->mc.get_ref( pix, &stride, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
+ cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[1], FENC_STRIDE, src, stride ); \
+ if( cost < bcost ) \
+ { \
+ stride = 16; \
+ src = h->mc.get_ref( pix, &stride, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
+ cost += h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[2], FENC_STRIDE, src, stride ); \
+ } \
+ } \
+ else \
+ { \
+ h->mc.mc_chroma( pix, pix+8, 16, m->p_fref[4], m->i_stride[1], \
+ mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
+ if( m->weight[1].weightfn ) \
+ m->weight[1].weightfn[bw>>3]( pix, 16, pix, 16, &m->weight[1], bh>>chroma_v_shift ); \
+ cost += h->pixf.mbcmp[chromapix]( m->p_fenc[1], FENC_STRIDE, pix, 16 ); \
+ if( cost < bcost ) \
+ { \
+ if( m->weight[2].weightfn ) \
+ m->weight[2].weightfn[bw>>3]( pix+8, 16, pix+8, 16, &m->weight[2], bh>>chroma_v_shift ); \
+ cost += h->pixf.mbcmp[chromapix]( m->p_fenc[2], FENC_STRIDE, pix+8, 16 ); \
+ } \
} \
} \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \
const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const int i_pixel = m->i_pixel;
- const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
- const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int b_chroma_me = h->mb.b_chroma_me && (i_pixel <= PIXEL_8x8 || CHROMA444);
+ int chromapix = h->luma2chroma_pixel[i_pixel];
+ int chroma_v_shift = CHROMA_V_SHIFT;
+ int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
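+    /* odd ref indices are opposite-parity fields: those take the standard +/-2 vertical chroma mv adjustment, and only when chroma is vertically subsampled (chroma_v_shift). */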
- ALIGNED_ARRAY_16( uint8_t, pix,[64*18] ); // really 17x17x2, but round up for alignment
+ ALIGNED_ARRAY_16( pixel, pix,[64*18] ); // really 17x17x2, but round up for alignment
int bmx = m->mv[0];
int bmy = m->mv[1];
int omx = bmx, omy = bmy;
int costs[4];
int stride = 64; // candidates are either all hpel or all qpel, so one stride is enough
- uint8_t *src0, *src1, *src2, *src3;
+ pixel *src0, *src1, *src2, *src3;
src0 = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] );
src2 = h->mc.get_ref( pix+32, &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] );
src1 = src0 + stride;
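/* src0's (bw)x(bh+1) fetch covers both vertical half-pel candidates (src1 = src0 + stride);
 * src2's (bw+4)-wide fetch likewise covers the horizontal pair. */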
break;
}
- if( !b_refine_qpel )
+ if( !b_refine_qpel && (h->pixf.mbcmp_unaligned[0] != h->pixf.fpelcmp[0] || b_chroma_me) )
{
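/* the fullpel search scored with fpelcmp; if mbcmp (SATD) or chroma would change the
 * metric, rescore the start point so later comparisons are apples-to-apples. */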
bcost = COST_MAX;
COST_MV_SATD( bmx, bmy, -1 );
}
/* quarterpel diamond search */
- if( h->mb.i_subpel_refine > 1 )
+ if( h->mb.i_subpel_refine != 1 )
{
bdir = -1;
for( int i = qpel_iters; i > 0; i-- )
m->cost_mv = p_cost_mvx[bmx] + p_cost_mvy[bmy];
}
-#define BIME_CACHE( dx, dy, list ) \
-{ \
+#define BIME_CACHE( dx, dy, list )\
+{\
x264_me_t *m = m##list;\
- int i = 4 + 3*dx + dy; \
+ int i = 4 + 3*dx + dy;\
int mvx = bm##list##x+dx;\
int mvy = bm##list##y+dy;\
- stride[list][i] = bw;\
- src[list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[list][i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \
+ stride[0][list][i] = bw;\
+ src[0][list][i] = h->mc.get_ref( pixy_buf[list][i], &stride[0][list][i], &m->p_fref[0],\
+ m->i_stride[0], mvx, mvy, bw, bh, x264_weight_none );\
if( rd )\
{\
- h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
- h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\
+ if( CHROMA444 )\
+ {\
+ stride[1][list][i] = bw;\
+ src[1][list][i] = h->mc.get_ref( pixu_buf[list][i], &stride[1][list][i], &m->p_fref[4],\
+ m->i_stride[1], mvx, mvy, bw, bh, x264_weight_none );\
+ stride[2][list][i] = bw;\
+ src[2][list][i] = h->mc.get_ref( pixv_buf[list][i], &stride[2][list][i], &m->p_fref[8],\
+ m->i_stride[2], mvx, mvy, bw, bh, x264_weight_none );\
+ }\
+ else\
+ h->mc.mc_chroma( pixu_buf[list][i], pixv_buf[list][i], 8, m->p_fref[4], m->i_stride[1],\
+ mvx, 2*(mvy+mv##list##y_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift );\
}\
}
-#define SATD_THRESH 17/16
+#define SATD_THRESH(cost) (cost+(cost>>4))
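+/* cost + (cost>>4) == cost*17/16 rounded down: the same ~6% slack as the old bare 17/16 macro, without the multiply or its precedence pitfalls. */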
/* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this
* other than making its iteration count not a compile-time constant. */
static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd )
{
- int16_t *cache0_mv = h->mb.cache.mv[0][x264_scan8[i8*4]];
- int16_t *cache1_mv = h->mb.cache.mv[1][x264_scan8[i8*4]];
+ int x = i8&1;
+ int y = i8>>1;
+ int s8 = X264_SCAN8_0 + 2*x + 16*y;
+ int16_t *cache0_mv = h->mb.cache.mv[0][s8];
+ int16_t *cache1_mv = h->mb.cache.mv[1][s8];
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
- ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
- ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] );
- ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] );
- uint8_t *src[2][9];
- uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
- uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
- uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
- const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]];
- const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]];
- const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- const int mv1y_offset = h->mb.b_interlaced & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
- int stride[2][9];
+ ALIGNED_ARRAY_16( pixel, pixy_buf,[2],[9][16*16] );
+ ALIGNED_ARRAY_16( pixel, pixu_buf,[2],[9][16*16] );
+ ALIGNED_ARRAY_16( pixel, pixv_buf,[2],[9][16*16] );
+ pixel *src[3][2][9];
+ int chromapix = h->luma2chroma_pixel[i_pixel];
+ int chroma_v_shift = CHROMA_V_SHIFT;
+ int chroma_x = (8 >> CHROMA_H_SHIFT) * x;
+ int chroma_y = (8 >> chroma_v_shift) * y;
+ pixel *pix = &h->mb.pic.p_fdec[0][8*x + 8*y*FDEC_STRIDE];
+ pixel *pixu = &h->mb.pic.p_fdec[1][chroma_x + chroma_y*FDEC_STRIDE];
+ pixel *pixv = &h->mb.pic.p_fdec[2][chroma_x + chroma_y*FDEC_STRIDE];
+ int ref0 = h->mb.cache.ref[0][s8];
+ int ref1 = h->mb.cache.ref[1][s8];
+ const int mv0y_offset = chroma_v_shift & MB_INTERLACED & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ const int mv1y_offset = chroma_v_shift & MB_INTERLACED & ref1 ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int stride[3][2][9];
int bm0x = m0->mv[0];
int bm0y = m0->mv[1];
int bm1x = m1->mv[0];
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
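/* at worst, the &7 wrap makes an untried point look already-visited and it is skipped;
 * it can never corrupt the search, so 3 bits per coordinate suffice. */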
/* all permutations of an offset in up to 2 of the dimensions */
- static const int8_t dia4d[33][4] =
+ ALIGNED_4( static const int8_t dia4d[33][4] ) =
{
{0,0,0,0},
{0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
int i0 = 4 + 3*dia4d[j][0] + dia4d[j][1];
int i1 = 4 + 3*dia4d[j][2] + dia4d[j][3];
visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));
- h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src[0][i0], stride[0][i0], src[1][i1], stride[1][i1], i_weight );
+ h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src[0][0][i0], stride[0][0][i0], src[0][1][i1], stride[0][1][i1], i_weight );
int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE )
+ p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y];
if( rd )
{
- if( cost < bcost * SATD_THRESH )
+ if( cost < SATD_THRESH(bcost) )
{
bcost = X264_MIN( cost, bcost );
M32( cache0_mv ) = pack16to32_mask(m0x,m0y);
M32( cache1_mv ) = pack16to32_mask(m1x,m1y);
- h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
- h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+ if( CHROMA444 )
+ {
+ h->mc.avg[i_pixel]( pixu, FDEC_STRIDE, src[1][0][i0], stride[1][0][i0], src[1][1][i1], stride[1][1][i1], i_weight );
+ h->mc.avg[i_pixel]( pixv, FDEC_STRIDE, src[2][0][i0], stride[2][0][i0], src[2][1][i1], stride[2][1][i1], i_weight );
+ }
+ else
+ {
+ h->mc.avg[chromapix]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight );
+ h->mc.avg[chromapix]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight );
+ }
uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel );
COPY2_IF_LT( bcostrd, costrd, bestj, j );
}
if( rd )
{
- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 0, pack16to32_mask(bm0x, bm0y) );
amvd = pack8to16( X264_MIN(abs(bm0x - m0->mvp[0]),33), X264_MIN(abs(bm0y - m0->mvp[1]),33) );
- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 0, amvd );
+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 0, amvd );
- x264_macroblock_cache_mv ( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
+ x264_macroblock_cache_mv ( h, 2*x, 2*y, bw>>2, bh>>2, 1, pack16to32_mask(bm1x, bm1y) );
amvd = pack8to16( X264_MIN(abs(bm1x - m1->mvp[0]),33), X264_MIN(abs(bm1y - m1->mvp[1]),33) );
- x264_macroblock_cache_mvd( h, (i8&1)*2, (i8>>1)*2, bw>>2, bh>>2, 1, amvd );
+ x264_macroblock_cache_mvd( h, 2*x, 2*y, bw>>2, bh>>2, 1, amvd );
}
m0->mv[0] = bm0x;
#define COST_MV_RD( mx, my, satd, do_dir, mdir ) \
{ \
- if( satd <= bsatd * SATD_THRESH ) \
+ if( satd <= SATD_THRESH(bsatd) ) \
{ \
uint64_t cost; \
M32( cache_mv ) = pack16to32_mask(mx,my); \
- if( m->i_pixel <= PIXEL_8x8 )\
- {\
- h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
- h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\
- }\
+ if( CHROMA444 ) \
+ { \
+ h->mc.mc_luma( pixu, FDEC_STRIDE, &m->p_fref[4], m->i_stride[1], mx, my, bw, bh, &m->weight[1] ); \
+ h->mc.mc_luma( pixv, FDEC_STRIDE, &m->p_fref[8], m->i_stride[2], mx, my, bw, bh, &m->weight[2] ); \
+ } \
+ else if( m->i_pixel <= PIXEL_8x8 ) \
+ { \
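+            /* mc_chroma takes eighth-pel chroma coordinates: 2*(my+mvy_offset)>>chroma_v_shift rescales the luma qpel mv for the plane's vertical subsampling. */ \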
+ h->mc.mc_chroma( pixu, pixv, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], \
+ mx, 2*(my+mvy_offset)>>chroma_v_shift, bw>>1, bh>>chroma_v_shift ); \
+ if( m->weight[1].weightfn ) \
+ m->weight[1].weightfn[bw>>3]( pixu, FDEC_STRIDE, pixu, FDEC_STRIDE, &m->weight[1], bh>>chroma_v_shift ); \
+ if( m->weight[2].weightfn ) \
+ m->weight[2].weightfn[bw>>3]( pixv, FDEC_STRIDE, pixv, FDEC_STRIDE, &m->weight[2], bh>>chroma_v_shift ); \
+ } \
cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \
COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \
} \
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
const int i_pixel = m->i_pixel;
- const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
+ int chroma_v_shift = CHROMA_V_SHIFT;
+ int mvy_offset = chroma_v_shift & MB_INTERLACED & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0;
uint64_t bcost = COST_MAX64;
int bmx = m->mv[0];
int bmy = m->mv[1];
int omx, omy, pmx, pmy;
- unsigned bsatd;
- int satd;
+ int satd, bsatd;
int dir = -2;
int i8 = i4>>2;
uint16_t amvd;
- uint8_t *pix = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
- uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
- uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4];
+ pixel *pix = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]];
+ pixel *pixu, *pixv;
+ if( CHROMA444 )
+ {
+ pixu = &h->mb.pic.p_fdec[1][block_idx_xy_fdec[i4]];
+ pixv = &h->mb.pic.p_fdec[2][block_idx_xy_fdec[i4]];
+ }
+ else
+ {
+ pixu = &h->mb.pic.p_fdec[1][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
+ pixv = &h->mb.pic.p_fdec[2][(i8>>1)*(8*FDEC_STRIDE>>chroma_v_shift)+(i8&1)*4];
+ }
h->mb.b_skip_mc = 1;
m->mv[0] = bmx;
m->mv[1] = bmy;
x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) );
- amvd = pack8to16( X264_MIN(abs(bmx - m->mvp[0]),33), X264_MIN(abs(bmy - m->mvp[1]),33) );
+ amvd = pack8to16( X264_MIN(abs(bmx - m->mvp[0]),66), X264_MIN(abs(bmy - m->mvp[1]),66) );
x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, amvd );
h->mb.b_skip_mc = 0;
}