X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=encoder%2Fme.c;h=4f8f86f346d7893e45568b4cdd66b46c5af8b068;hb=741ed788e905820d2a9fc892ea288350e939b78f;hp=18eb987fca1acc42961341ce9836cda787986aa3;hpb=b08410d07ea242250fcb827742c74046d59bd991;p=x264 diff --git a/encoder/me.c b/encoder/me.c index 18eb987f..4f8f86f3 100644 --- a/encoder/me.c +++ b/encoder/me.c @@ -42,6 +42,7 @@ static const int subpel_iterations[][4] = {0,0,2,2}, {0,0,2,2}, {0,0,4,10}, + {0,0,4,10}, {0,0,4,10}}; /* (x-1)%6 */ @@ -58,7 +59,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV( mx, my )\ {\ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE,\ - &p_fref[(my)*stride+(mx)], stride )\ + &p_fref_w[(my)*stride+(mx)], stride )\ + BITS_MVD(mx,my);\ COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my );\ } @@ -66,7 +67,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV_HPEL( mx, my ) \ { \ int stride2 = 16; \ - uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh ); \ + uint8_t *src = h->mc.get_ref( pix, &stride2, m->p_fref, stride, mx, my, bw, bh, &m->weight[0] ); \ int cost = h->pixf.fpelcmp[i_pixel]( p_fenc, FENC_STRIDE, src, stride2 ) \ + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \ COPY3_IF_LT( bpred_cost, cost, bpred_mx, mx, bpred_my, my ); \ @@ -74,7 +75,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV_X3_DIR( m0x, m0y, m1x, m1y, m2x, m2y, costs )\ {\ - uint8_t *pix_base = p_fref + bmx + bmy*stride;\ + uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\ h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\ pix_base + (m0x) + (m0y)*stride,\ pix_base + (m1x) + (m1y)*stride,\ @@ -87,7 +88,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV_X4_DIR( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs )\ {\ - uint8_t *pix_base = p_fref + bmx + bmy*stride;\ + uint8_t *pix_base = p_fref_w + bmx + bmy*stride;\ h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\ pix_base + (m0x) + (m0y)*stride,\ pix_base + (m1x) + (m1y)*stride,\ @@ -102,7 +103,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV_X4( m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y )\ {\ - uint8_t *pix_base = p_fref + omx + omy*stride;\ + uint8_t *pix_base = p_fref_w + omx + omy*stride;\ h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\ pix_base + (m0x) + (m0y)*stride,\ pix_base + (m1x) + (m1y)*stride,\ @@ -122,9 +123,9 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite #define COST_MV_X3_ABS( m0x, m0y, m1x, m1y, m2x, m2y )\ {\ h->pixf.fpelcmp_x3[i_pixel]( p_fenc,\ - p_fref + (m0x) + (m0y)*stride,\ - p_fref + (m1x) + (m1y)*stride,\ - p_fref + (m2x) + (m2y)*stride,\ + p_fref_w + (m0x) + (m0y)*stride,\ + p_fref_w + (m1x) + (m1y)*stride,\ + p_fref_w + (m2x) + (m2y)*stride,\ stride, costs );\ costs[0] += p_cost_mvx[(m0x)<<2]; /* no cost_mvy */\ costs[1] += p_cost_mvx[(m1x)<<2];\ @@ -180,8 +181,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX; int omx, omy, pmx, pmy; uint8_t *p_fenc = m->p_fenc[0]; - uint8_t *p_fref = m->p_fref[0]; - DECLARE_ALIGNED_16( uint8_t pix[16*16] ); + uint8_t *p_fref_w = m->p_fref_w; + ALIGNED_ARRAY_16( uint8_t, pix,[16*16] ); int i, j; int dir; @@ -194,8 +195,8 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, #define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && 
my >= mv_y_min && my <= mv_y_max ) - const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; - const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; + const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; + const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 ); bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 ); @@ -210,7 +211,7 @@ void x264_me_search_ref( x264_t *h, x264_me_t *m, int16_t (*mvc)[2], int i_mvc, COST_MV_HPEL( bmx, bmy ); for( i = 0; i < i_mvc; i++ ) { - if( *(uint32_t*)mvc[i] && (bmv - *(uint32_t*)mvc[i]) ) + if( M32( mvc[i] ) && (bmv - M32( mvc[i] )) ) { int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 ); int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 ); @@ -451,8 +452,8 @@ me_hex2: /* hexagon grid */ omx = bmx; omy = bmy; - const int16_t *p_cost_omvx = p_cost_mvx + omx*4; - const int16_t *p_cost_omvy = p_cost_mvy + omy*4; + const uint16_t *p_cost_omvx = p_cost_mvx + omx*4; + const uint16_t *p_cost_omvy = p_cost_mvy + omy*4; i = 1; do { @@ -477,7 +478,7 @@ me_hex2: else { int dir = 0; - uint8_t *pix_base = p_fref + omx + (omy-4*i)*stride; + uint8_t *pix_base = p_fref_w + omx + (omy-4*i)*stride; int dy = i*stride; #define SADS(k,x0,y0,x1,y1,x2,y2,x3,y3)\ h->pixf.fpelcmp_x4[i_pixel]( p_fenc,\ @@ -535,7 +536,7 @@ me_hex2: } } } while( ++i <= i_me_range/4 ); - if( bmy <= mv_y_max && bmy >= mv_y_min ) + if( bmy <= mv_y_max && bmy >= mv_y_min && bmx <= mv_x_max && bmx >= mv_x_min ) goto me_hex2; break; } @@ -562,13 +563,13 @@ me_hex2: uint16_t *sums_base = m->integral; /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned. * this is not a problem because it is not used for any SSE instructions. */ - DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] ); - DECLARE_ALIGNED_16( int enc_dc[4] ); + ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] ); + ALIGNED_ARRAY_16( int, enc_dc,[4] ); int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4; int delta = x264_pixel_size[sad_size].w; int16_t *xs = h->scratch_buffer; int xn; - uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2); + uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2); h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta, p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE, @@ -586,7 +587,7 @@ me_hex2: mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15)); int nmvsad = 0, limit; int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 
11 : 12;
- int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
+ int bsad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+bmy*stride+bmx, stride )
 + BITS_MVD( bmx, bmy );
 for( my = min_y; my <= max_y; my++ )
 {
@@ -598,7 +599,7 @@ me_hex2:
 cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
 for( i=0; i<xn-2; i+=3 )
 {
- uint8_t *ref = p_fref+min_x+my*stride;
+ uint8_t *ref = p_fref_w+min_x+my*stride;
 int sads[3];
 h->pixf.sad_x3[i_pixel]( p_fenc, ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
 for( j=0; j<3; j++ )
@@ -608,8 +609,8 @@ me_hex2:
 {
 COPY1_IF_LT( bsad, sad );
 mvsads[nmvsad].sad = sad + ycost;
- mvsads[nmvsad].mx = min_x+xs[i+j];
- mvsads[nmvsad].my = my;
+ mvsads[nmvsad].mv[0] = min_x+xs[i+j];
+ mvsads[nmvsad].mv[1] = my;
 nmvsad++;
 }
 }
@@ -617,14 +618,14 @@ me_hex2:
 for( ; i<xn; i++ )
 {
 int mx = min_x+xs[i];
- int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref+mx+my*stride, stride )
+ int sad = h->pixf.sad[i_pixel]( p_fenc, FENC_STRIDE, p_fref_w+mx+my*stride, stride )
 + cost_fpel_mvx[xs[i]];
 if( sad < bsad*sad_thresh>>3 )
 {
 COPY1_IF_LT( bsad, sad );
 mvsads[nmvsad].sad = sad + ycost;
- mvsads[nmvsad].mx = mx;
- mvsads[nmvsad].my = my;
+ mvsads[nmvsad].mv[0] = mx;
+ mvsads[nmvsad].mv[1] = my;
 nmvsad++;
 }
 }
@@ -632,42 +633,47 @@ me_hex2:
 }
 limit = i_me_range / 2;
- if( nmvsad > limit*2 )
+ sad_thresh = bsad*sad_thresh>>3;
+ while( nmvsad > limit*2 && sad_thresh > bsad )
 {
 // halve the range if the domain is too large... eh, close enough
- bsad = bsad*(sad_thresh+8)>>4;
- for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
+ sad_thresh = (sad_thresh + bsad) >> 1;
+ for( i=0; i<nmvsad && mvsads[i].sad <= sad_thresh; i++ );
 for( j=i; j<nmvsad; j++ )
- if( mvsads[j].sad <= bsad )
- {
- /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before filling */
- if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
- *(uint64_t*)&mvsads[i++] = *(uint64_t*)&mvsads[j];
- else
- mvsads[i++] = mvsads[j];
+ {
+ uint32_t sad;
+ if( WORD_SIZE == 8 && sizeof(mvsad_t) == 8 )
+ {
+ uint64_t mvsad = M64( &mvsads[i] ) = M64( &mvsads[j] );
+#ifdef WORDS_BIGENDIAN
+ mvsad >>= 32;
+#endif
+ sad = mvsad;
 }
- nmvsad = i;
- }
- if( nmvsad > limit )
- {
- for( i=0; i<limit; i++ )
- {
- int bj = i;
- int bsad = mvsads[bj].sad;
- for( j=i+1; j<nmvsad; j++ )
- COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
- if( bj > i )
+ else
 {
- if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
- XCHG( uint64_t, *(uint64_t*)&mvsads[i], *(uint64_t*)&mvsads[bj] );
- else
- XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+ sad = mvsads[j].sad;
+ CP32( mvsads[i].mv, mvsads[j].mv );
+ mvsads[i].sad = sad;
 }
+ i += (sad - (sad_thresh+1)) >> 31;
 }
- nmvsad = limit;
+ nmvsad = i;
+ }
+ while( nmvsad > limit )
+ {
+ int bi = 0;
+ for( i=1; i<nmvsad; i++ )
+ if( mvsads[i].sad > mvsads[bi].sad )
+ bi = i;
+ nmvsad--;
+ if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+ CP64( &mvsads[bi], &mvsads[nmvsad] );
+ else
+ mvsads[bi] = mvsads[nmvsad];
 }
 for( i=0; i<nmvsad; i++ )
- COST_MV( mvsads[i].mx, mvsads[i].my );
+ COST_MV( mvsads[i].mv[0], mvsads[i].mv[1] );
 }
 else
 {
 // just ADS and SAD
@@ ... @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
 
 if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
 m->cost -= m->i_ref_cost;
-
+ refine_subpel( h, m, hpel, qpel, NULL, 1 );
 }
+void x264_me_refine_qpel_refdupe( x264_t *h, x264_me_t *m, int *p_halfpel_thresh )
+{
+ refine_subpel( h, m, 0, X264_MIN( 2, subpel_iterations[h->mb.i_subpel_refine][3] ), p_halfpel_thresh, 0 );
+}
+
 #define COST_MV_SAD( mx, my ) \
 { \
 int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
 int cost = h->pixf.fpelcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
 + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
 COPY3_IF_LT( bcost, cost, bmx, mx, bmy, my ); \
@@ -745,17 +756,23 @@ void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
 if( b_refine_qpel || (dir^1) != odir ) \
 { \
 int stride = 16; \
- uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh ); \
+ uint8_t *src = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \
 int cost = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \
 + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
 if( b_chroma_me && cost < bcost ) \
 { \
- h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my, bw/2, bh/2 ); \
+ h->mc.mc_chroma( pix[0], 8, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \
+ if( 
m->weight[1].weightfn ) \ + m->weight[1].weightfn[x264_pixel_size[i_pixel].w>>3]( pix[0], 8, pix[0], 8, \ + &m->weight[1], x264_pixel_size[i_pixel].h>>1 ); \ cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], FENC_STRIDE, pix[0], 8 ); \ if( cost < bcost ) \ { \ - h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my, bw/2, bh/2 ); \ + h->mc.mc_chroma( pix[0], 8, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw/2, bh/2 ); \ cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \ + if( m->weight[2].weightfn ) \ + m->weight[2].weightfn[x264_pixel_size[i_pixel].w>>3]( pix[0], 8, pix[0], 8, \ + &m->weight[2], x264_pixel_size[i_pixel].h>>1 ); \ } \ } \ if( cost < bcost ) \ @@ -771,12 +788,13 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite { const int bw = x264_pixel_size[m->i_pixel].w; const int bh = x264_pixel_size[m->i_pixel].h; - const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; - const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; + const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0]; + const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1]; const int i_pixel = m->i_pixel; const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8; + const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; - DECLARE_ALIGNED_16( uint8_t pix[2][32*18] ); // really 17x17, but round up for alignment + ALIGNED_ARRAY_16( uint8_t, pix,[2],[32*18] ); // really 17x17, but round up for alignment int omx, omy; int i; @@ -801,8 +819,8 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite int costs[4]; int stride = 32; // candidates are either all hpel or all qpel, so one stride is enough uint8_t *src0, *src1, *src2, *src3; - src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1 ); - src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh ); + src0 = h->mc.get_ref( pix[0], &stride, m->p_fref, m->i_stride[0], omx, omy-2, bw, bh+1, &m->weight[0] ); + src2 = h->mc.get_ref( pix[1], &stride, m->p_fref, m->i_stride[0], omx-2, omy, bw+4, bh, &m->weight[0] ); src1 = src0 + stride; src3 = src2 + 1; h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], src0, src1, src2, src3, stride, costs ); @@ -839,7 +857,7 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite bdir = -1; for( i = qpel_iters; i > 0; i-- ) { - if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] ) + if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] || bmx <= h->mb.mv_min_spel[0] || bmx >= h->mb.mv_max_spel[0] ) break; odir = bdir; omx = bmx; @@ -858,89 +876,26 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ]; } -#define BIME_CACHE( dx, dy ) \ +#define BIME_CACHE( dx, dy, list ) \ { \ + x264_me_t *m = m##list;\ int i = 4 + 3*dx + dy; \ - int mvx0 = om0x+dx, mvy0 = om0y+dy;\ - int mvx1 = om1x+dx, mvy1 = om1y+dy;\ - stride0[i] = bw;\ - stride1[i] = bw;\ - src0[i] = h->mc.get_ref( pix0[i], &stride0[i], m0->p_fref, m0->i_stride[0], mvx0, mvy0, bw, bh ); \ - src1[i] = h->mc.get_ref( pix1[i], &stride1[i], m1->p_fref, m1->i_stride[0], mvx1, mvy1, bw, bh ); \ + int mvx = om##list##x+dx;\ + int mvy = om##list##y+dy;\ + stride##list[i] = bw;\ + src##list[i] = h->mc.get_ref( pixy_buf[list][i], &stride##list[i], m->p_fref, m->i_stride[0], mvx, mvy, bw, bh, weight_none ); \ if( rd )\ {\ - if( h->mb.b_interlaced & ref0 )\ - mvy0 += (h->mb.i_mb_y & 1)*4 
- 2;\ - if( h->mb.b_interlaced & ref1 )\ - mvy1 += (h->mb.i_mb_y & 1)*4 - 2;\ - h->mc.mc_chroma( pixu0[i], 8, m0->p_fref[4], m0->i_stride[1], mvx0, mvy0, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixu1[i], 8, m1->p_fref[4], m1->i_stride[1], mvx1, mvy1, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv0[i], 8, m0->p_fref[5], m0->i_stride[1], mvx0, mvy0, bw>>1, bh>>1 );\ - h->mc.mc_chroma( pixv1[i], 8, m1->p_fref[5], m1->i_stride[1], mvx1, mvy1, bw>>1, bh>>1 );\ + h->mc.mc_chroma( pixu_buf[list][i], 8, m->p_fref[4], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ + h->mc.mc_chroma( pixv_buf[list][i], 8, m->p_fref[5], m->i_stride[1], mvx, mvy + mv##list##y_offset, bw>>1, bh>>1 );\ }\ } -#define BIME_CACHE2(a,b) \ - BIME_CACHE(a,b) \ - BIME_CACHE(-(a),-(b)) - #define SATD_THRESH 17/16 -#define COST_BIMV_SATD( m0x, m0y, m1x, m1y ) \ -if( pass == 0 || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) \ -{ \ - int cost; \ - int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); \ - int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); \ - visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7));\ - h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); \ - cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \ - + p_cost_m0x[ m0x ] + p_cost_m0y[ m0y ] \ - + p_cost_m1x[ m1x ] + p_cost_m1y[ m1y ]; \ - if( rd ) \ - { \ - if( cost < bcost * SATD_THRESH ) \ - { \ - uint64_t costrd; \ - if( cost < bcost ) \ - bcost = cost; \ - *(uint32_t*)cache0_mv = *(uint32_t*)cache0_mv2 = pack16to32_mask(m0x,m0y); \ - *(uint32_t*)cache1_mv = *(uint32_t*)cache1_mv2 = pack16to32_mask(m1x,m1y); \ - h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu0[i0], 8, pixu1[i1], 8, i_weight );\ - h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv0[i0], 8, pixv1[i1], 8, i_weight );\ - costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel ); \ - if( costrd < bcostrd ) \ - {\ - bcostrd = costrd;\ - bm0x = m0x; \ - bm0y = m0y; \ - bm1x = m1x; \ - bm1y = m1y; \ - }\ - } \ - } \ - else if( cost < bcost ) \ - { \ - bcost = cost; \ - bm0x = m0x; \ - bm0y = m0y; \ - bm1x = m1x; \ - bm1y = m1y; \ - } \ -} - -#define CHECK_BIDIR(a,b,c,d) \ - COST_BIMV_SATD(om0x+a, om0y+b, om1x+c, om1y+d) - -#define CHECK_BIDIR2(a,b,c,d) \ - CHECK_BIDIR(a,b,c,d) \ - CHECK_BIDIR(-(a),-(b),-(c),-(d)) - -#define CHECK_BIDIR8(a,b,c,d) \ - CHECK_BIDIR2(a,b,c,d) \ - CHECK_BIDIR2(b,c,d,a) \ - CHECK_BIDIR2(c,d,a,b) \ - CHECK_BIDIR2(d,a,b,c) +/* Don't unroll the BIME_CACHE loop. I couldn't find any way to force this + * other than making its iteration count not a compile-time constant. 
*/ +int x264_iter_kludge = 0; static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_weight, int i8, int i_lambda2, int rd ) { @@ -952,23 +907,22 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m const int i_pixel = m0->i_pixel; const int bw = x264_pixel_size[i_pixel].w; const int bh = x264_pixel_size[i_pixel].h; - const int16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0]; - const int16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1]; - const int16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0]; - const int16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1]; - DECLARE_ALIGNED_16( uint8_t pix0[9][16*16] ); - DECLARE_ALIGNED_16( uint8_t pix1[9][16*16] ); - DECLARE_ALIGNED_8( uint8_t pixu0[9][8*8] ); - DECLARE_ALIGNED_8( uint8_t pixu1[9][8*8] ); - DECLARE_ALIGNED_8( uint8_t pixv0[9][8*8] ); - DECLARE_ALIGNED_8( uint8_t pixv1[9][8*8] ); + const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0]; + const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1]; + const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0]; + const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1]; + ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] ); + ALIGNED_ARRAY_8( uint8_t, pixu_buf,[2],[9][8*8] ); + ALIGNED_ARRAY_8( uint8_t, pixv_buf,[2],[9][8*8] ); uint8_t *src0[9]; uint8_t *src1[9]; uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8]; uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; - int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]]; - int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]]; + const int ref0 = h->mb.cache.ref[0][x264_scan8[i8*4]]; + const int ref1 = h->mb.cache.ref[1][x264_scan8[i8*4]]; + const int mv0y_offset = h->mb.b_interlaced & ref0 ? (h->mb.i_mb_y & 1)*4 - 2 : 0; + const int mv1y_offset = h->mb.b_interlaced & ref1 ? 
(h->mb.i_mb_y & 1)*4 - 2 : 0; int stride0[9]; int stride1[9]; int bm0x = m0->mv[0], om0x = bm0x; @@ -977,19 +931,31 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m int bm1y = m1->mv[1], om1y = bm1y; int bcost = COST_MAX; int pass = 0; + int j; + int mc_list0 = 1, mc_list1 = 1; uint64_t bcostrd = COST_MAX64; - /* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */ - DECLARE_ALIGNED_16( uint8_t visited[8][8][8] ); + ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] ); + /* all permutations of an offset in up to 2 of the dimensions */ + static const int8_t dia4d[33][4] = { + {0,0,0,0}, + {0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0}, + {0,1,0,0}, {0,-1,0,0}, {1,0,0,0}, {-1,0,0,0}, + {0,0,1,1}, {0,0,-1,-1},{0,1,1,0}, {0,-1,-1,0}, + {1,1,0,0}, {-1,-1,0,0},{1,0,0,1}, {-1,0,0,-1}, + {0,1,0,1}, {0,-1,0,-1},{1,0,1,0}, {-1,0,-1,0}, + {0,0,-1,1},{0,0,1,-1}, {0,-1,1,0},{0,1,-1,0}, + {-1,1,0,0},{1,-1,0,0}, {1,0,0,-1},{-1,0,0,1}, + {0,-1,0,1},{0,1,0,-1}, {-1,0,1,0},{1,0,-1,0}, + }; if( bm0y < h->mb.mv_min_spel[1] + 8 || bm1y < h->mb.mv_min_spel[1] + 8 || - bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 ) + bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 || + bm0x < h->mb.mv_min_spel[0] + 8 || bm1x < h->mb.mv_min_spel[0] + 8 || + bm0x > h->mb.mv_max_spel[0] - 8 || bm1x > h->mb.mv_max_spel[0] - 8 ) return; - h->mc.memzero_aligned( visited, sizeof(visited) ); - - BIME_CACHE( 0, 0 ); - CHECK_BIDIR( 0, 0, 0, 0 ); + h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) ); for( pass = 0; pass < 8; pass++ ) { @@ -997,27 +963,57 @@ static void ALWAYS_INLINE x264_me_refine_bidir( x264_t *h, x264_me_t *m0, x264_m /* doesn't do chroma ME. this probably doesn't matter, as the gains * from bidir ME are the same with and without chroma ME. 
*/ - BIME_CACHE2( 1, 0 ); - BIME_CACHE2( 0, 1 ); - BIME_CACHE2( 1, 1 ); - BIME_CACHE2( 1,-1 ); + if( mc_list0 ) + for( j = x264_iter_kludge; j < 9; j++ ) + BIME_CACHE( square1[j][0], square1[j][1], 0 ); + + if( mc_list1 ) + for( j = x264_iter_kludge; j < 9; j++ ) + BIME_CACHE( square1[j][0], square1[j][1], 1 ); - CHECK_BIDIR8( 0, 0, 0, 1 ); - CHECK_BIDIR8( 0, 0, 1, 1 ); - CHECK_BIDIR2( 0, 1, 0, 1 ); - CHECK_BIDIR2( 1, 0, 1, 0 ); - CHECK_BIDIR8( 0, 0,-1, 1 ); - CHECK_BIDIR2( 0,-1, 0, 1 ); - CHECK_BIDIR2(-1, 0, 1, 0 ); + for( j = !!pass; j < 33; j++ ) + { + int m0x = dia4d[j][0] + om0x; + int m0y = dia4d[j][1] + om0y; + int m1x = dia4d[j][2] + om1x; + int m1y = dia4d[j][3] + om1y; + if( !pass || !((visited[(m0x)&7][(m0y)&7][(m1x)&7] & (1<<((m1y)&7)))) ) + { + int i0 = 4 + 3*(m0x-om0x) + (m0y-om0y); + int i1 = 4 + 3*(m1x-om1x) + (m1y-om1y); + visited[(m0x)&7][(m0y)&7][(m1x)&7] |= (1<<((m1y)&7)); + h->mc.avg[i_pixel]( pix, FDEC_STRIDE, src0[i0], stride0[i0], src1[i1], stride1[i1], i_weight ); + int cost = h->pixf.mbcmp[i_pixel]( m0->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) + + p_cost_m0x[m0x] + p_cost_m0y[m0y] + p_cost_m1x[m1x] + p_cost_m1y[m1y]; + if( rd ) + { + if( cost < bcost * SATD_THRESH ) + { + bcost = X264_MIN( cost, bcost ); + M32( cache0_mv ) = pack16to32_mask(m0x,m0y); + M32( cache0_mv2 ) = pack16to32_mask(m0x,m0y); + M32( cache1_mv ) = pack16to32_mask(m1x,m1y); + M32( cache1_mv2 ) = pack16to32_mask(m1x,m1y); + h->mc.avg[i_pixel+3]( pixu, FDEC_STRIDE, pixu_buf[0][i0], 8, pixu_buf[1][i1], 8, i_weight ); + h->mc.avg[i_pixel+3]( pixv, FDEC_STRIDE, pixv_buf[0][i0], 8, pixv_buf[1][i1], 8, i_weight ); + uint64_t costrd = x264_rd_cost_part( h, i_lambda2, i8*4, m0->i_pixel ); + COPY5_IF_LT( bcostrd, costrd, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y ); + } + } + else + COPY5_IF_LT( bcost, cost, bm0x, m0x, bm0y, m0y, bm1x, m1x, bm1y, m1y ); + } + } - if( om0x == bm0x && om0y == bm0y && om1x == bm1x && om1y == bm1y ) + mc_list0 = (om0x-bm0x)|(om0y-bm0y); + mc_list1 = (om1x-bm1x)|(om1y-bm1y); + if( !mc_list0 && !mc_list1 ) break; om0x = bm0x; om0y = bm0y; om1x = bm1x; om1y = bm1y; - BIME_CACHE( 0, 0 ); } m0->mv[0] = bm0x; @@ -1045,9 +1041,8 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei { \ if( !avoid_mvp || !(mx == pmx && my == pmy) ) \ { \ - int stride = 16; \ - uint8_t *src = h->mc.get_ref( pix, &stride, m->p_fref, m->i_stride[0], mx, my, bw*4, bh*4 ); \ - dst = h->pixf.mbcmp_unaligned[i_pixel]( m->p_fenc[0], FENC_STRIDE, src, stride ) \ + h->mc.mc_luma( pix, FDEC_STRIDE, m->p_fref, m->i_stride[0], mx, my, bw, bh, &m->weight[0] ); \ + dst = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], FENC_STRIDE, pix, FDEC_STRIDE ) \ + p_cost_mvx[mx] + p_cost_mvy[my]; \ COPY1_IF_LT( bsatd, dst ); \ } \ @@ -1060,7 +1055,13 @@ void x264_me_refine_bidir_rd( x264_t *h, x264_me_t *m0, x264_me_t *m1, int i_wei if( satd <= bsatd * SATD_THRESH ) \ { \ uint64_t cost; \ - *(uint32_t*)cache_mv = *(uint32_t*)cache_mv2 = pack16to32_mask(mx,my); \ + M32( cache_mv ) = pack16to32_mask(mx,my); \ + M32( cache_mv2 ) = pack16to32_mask(mx,my); \ + if( m->i_pixel <= PIXEL_8x8 )\ + {\ + h->mc.mc_chroma( pixu, FDEC_STRIDE, m->p_fref[4], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ + h->mc.mc_chroma( pixv, FDEC_STRIDE, m->p_fref[5], m->i_stride[1], mx, my + mvy_offset, bw>>1, bh>>1 );\ + }\ cost = x264_rd_cost_part( h, i_lambda2, i4, m->i_pixel ); \ COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, dir, do_dir?mdir:dir ); \ } \ @@ -1072,29 +1073,38 @@ void x264_me_refine_qpel_rd( x264_t *h, 
x264_me_t *m, int i_lambda2, int i4, int static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 }; int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]]; int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel]; - const int16_t *p_cost_mvx, *p_cost_mvy; - const int bw = x264_pixel_size[m->i_pixel].w>>2; - const int bh = x264_pixel_size[m->i_pixel].h>>2; + const uint16_t *p_cost_mvx, *p_cost_mvy; + const int bw = x264_pixel_size[m->i_pixel].w; + const int bh = x264_pixel_size[m->i_pixel].h; const int i_pixel = m->i_pixel; + const int mvy_offset = h->mb.b_interlaced & m->i_ref ? (h->mb.i_mb_y & 1)*4 - 2 : 0; - DECLARE_ALIGNED_16( uint8_t pix[16*16] ); - uint64_t bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX64; + uint64_t bcost = COST_MAX64; int bmx = m->mv[0]; int bmy = m->mv[1]; int omx, omy, pmx, pmy, i, j; unsigned bsatd; - int satd = 0; + int satd; int dir = -2; - int satds[8]; + int i8 = i4>>2; + + uint8_t *pix = &h->mb.pic.p_fdec[0][block_idx_xy_fdec[i4]]; + uint8_t *pixu = &h->mb.pic.p_fdec[1][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; + uint8_t *pixv = &h->mb.pic.p_fdec[2][(i8>>1)*4*FDEC_STRIDE+(i8&1)*4]; + + h->mb.b_skip_mc = 1; if( m->i_pixel != PIXEL_16x16 && i4 != 0 ) - x264_mb_predict_mv( h, i_list, i4, bw, m->mvp ); + x264_mb_predict_mv( h, i_list, i4, bw>>2, m->mvp ); pmx = m->mvp[0]; pmy = m->mvp[1]; p_cost_mvx = m->p_cost_mv - pmx; p_cost_mvy = m->p_cost_mv - pmy; COST_MV_SATD( bmx, bmy, bsatd, 0 ); - COST_MV_RD( bmx, bmy, 0, 0, 0 ); + if( m->i_pixel != PIXEL_16x16 ) + COST_MV_RD( bmx, bmy, 0, 0, 0 ) + else + bcost = m->cost; /* check the predicted mv */ if( (bmx != pmx || bmy != pmy) @@ -1102,7 +1112,7 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int && pmy >= h->mb.mv_min_spel[1] && pmy <= h->mb.mv_max_spel[1] ) { COST_MV_SATD( pmx, pmy, satd, 0 ); - COST_MV_RD( pmx, pmy, satd, 0,0 ); + COST_MV_RD ( pmx, pmy, satd, 0, 0 ); /* The hex motion search is guaranteed to not repeat the center candidate, * so if pmv is chosen, set the "MV to avoid checking" to bmv instead. */ if( bmx == pmx && bmy == pmy ) @@ -1112,16 +1122,22 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int } } - if( bmy < h->mb.mv_min_spel[1] + 3 || - bmy > h->mb.mv_max_spel[1] - 3 ) + if( bmy < h->mb.mv_min_spel[1] + 3 || bmy > h->mb.mv_max_spel[1] - 3 || + bmx < h->mb.mv_min_spel[0] + 3 || bmx > h->mb.mv_max_spel[0] - 3 ) + { + h->mb.b_skip_mc = 0; return; + } /* subpel hex search, same pattern as ME HEX. */ dir = -2; omx = bmx; omy = bmy; - for( j=0; j<6; j++ ) COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1 ); - for( j=0; j<6; j++ ) COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satds[j], 1,j ); + for( j=0; j<6; j++ ) + { + COST_MV_SATD( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1 ); + COST_MV_RD ( omx + hex2[j+1][0], omy + hex2[j+1][1], satd, 1, j ); + } if( dir != -2 ) { @@ -1135,22 +1151,29 @@ void x264_me_refine_qpel_rd( x264_t *h, x264_me_t *m, int i_lambda2, int i4, int dir = -2; omx = bmx; omy = bmy; - for( j=0; j<3; j++ ) COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1 ); - for( j=0; j<3; j++ ) COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satds[j], 1, odir-1+j ); + for( j=0; j<3; j++ ) + { + COST_MV_SATD( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1 ); + COST_MV_RD ( omx + hex2[odir+j][0], omy + hex2[odir+j][1], satd, 1, odir-1+j ); + } if( dir == -2 ) break; } } - /* square refine, same as pattern as ME HEX. 
*/ + /* square refine, same pattern as ME HEX. */ omx = bmx; omy = bmy; - for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 1 ); - for( i=0; i<8; i++ ) COST_MV_RD ( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 0,0 ); + for( i=0; i<8; i++ ) + { + COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satd, 1 ); + COST_MV_RD ( omx + square1[i+1][0], omy + square1[i+1][1], satd, 0, 0 ); + } m->cost = bcost; m->mv[0] = bmx; m->mv[1] = bmy; - x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx, bmy) ); - x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw, bh, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) ); + x264_macroblock_cache_mv ( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx, bmy) ); + x264_macroblock_cache_mvd( h, block_idx_x[i4], block_idx_y[i4], bw>>2, bh>>2, i_list, pack16to32_mask(bmx - m->mvp[0], bmy - m->mvp[1]) ); + h->mb.b_skip_mc = 0; }