int omx, omy, pmx, pmy;
uint8_t *p_fenc = m->p_fenc[0];
uint8_t *p_fref = m->p_fref[0];
- DECLARE_ALIGNED_16( uint8_t pix[16*16] );
+ ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
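+ /* Note: ALIGNED_ARRAY_16 takes the type, name and subscripts as separate macro
+  * arguments (rather than one full declarator), presumably so the macro can fall
+  * back to over-allocating a buffer and offsetting a pointer on targets where the
+  * compiler cannot guarantee 16-byte stack alignment. */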
int i, j;
int dir;
#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )
- const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
- const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+ const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+ const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
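+ /* p_cost_mv points at the centre of a cost table indexed by the quarter-pel mv
+  * residual, so pre-subtracting the predictor lets the table be indexed by the
+  * absolute component: p_cost_mvx[mx<<2] is the lambda-weighted bit cost of a
+  * full-pel mx given mvp[0] (hence the "+ omx*4" for the hexagon centre below). */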
bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
/* hexagon grid */
omx = bmx; omy = bmy;
- const int16_t *p_cost_omvx = p_cost_mvx + omx*4;
- const int16_t *p_cost_omvy = p_cost_mvy + omy*4;
+ const uint16_t *p_cost_omvx = p_cost_mvx + omx*4;
+ const uint16_t *p_cost_omvy = p_cost_mvy + omy*4;
i = 1;
do
{
uint16_t *sums_base = m->integral;
/* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
* this is not a problem because it is not used for any SSE instructions. */
- DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
- DECLARE_ALIGNED_16( int enc_dc[4] );
+ ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
+ ALIGNED_ARRAY_16( int, enc_dc,[4] );
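+ /* enc_dc receives the pixel sums of the source block's sub-blocks (the sad_x4
+  * call below measures them against an all-zero block); combined with the
+  * integral image in sums_base they give a cheap lower bound on SAD, which is
+  * what the successive-elimination (SEA/TESA) pruning relies on. */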
int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
int delta = x264_pixel_size[sad_size].w;
int16_t *xs = h->scratch_buffer;
int xn;
- uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+ uint16_t *cost_fpel_mvx = h->cost_mv_fpel[x264_lambda_tab[h->mb.i_qp]][-m->mvp[0]&3] + (-m->mvp[0]>>2);
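+ /* cost_mv_fpel is now looked up through the encoder handle rather than a
+  * file-scope table; presumably these per-lambda full-pel cost tables are
+  * allocated per encoder instance so they can be sized and freed with it. */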
h->pixf.sad_x4[sad_size]( zero, p_fenc, p_fenc+delta,
p_fenc+delta*FENC_STRIDE, p_fenc+delta+delta*FENC_STRIDE,
}
limit = i_me_range / 2;
- if( nmvsad > limit*2 )
+ sad_thresh = bsad*sad_thresh>>3;
+ while( nmvsad > limit*2 && sad_thresh > bsad )
{
// halve the range if the domain is too large... eh, close enough
- bsad = bsad*(sad_thresh+8)>>4;
- for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
+ sad_thresh = (sad_thresh + bsad) >> 1;
+ for( i=0; i<nmvsad && mvsads[i].sad <= sad_thresh; i++ );
for( j=i; j<nmvsad; j++ )
- if( mvsads[j].sad <= bsad )
- {
- /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
- if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
- *(uint64_t*)&mvsads[i++] = *(uint64_t*)&mvsads[j];
- else
- mvsads[i++] = mvsads[j];
- }
+ {
+ /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
+ if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+ *(uint64_t*)&mvsads[i] = *(uint64_t*)&mvsads[j];
+ else
+ mvsads[i] = mvsads[j];
+ i += mvsads[j].sad <= sad_thresh;
+ }
nmvsad = i;
}
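/* Aside: a minimal, self-contained sketch of the branchless compaction idiom the
 * loop above uses (the names and element type are illustrative, not taken from
 * the code above). Each candidate is copied into the next free slot
 * unconditionally and only the destination index advances conditionally, so the
 * filter avoids a data-dependent branch around the copy itself. */
#include <stdint.h>

typedef struct { int sad; int16_t mx, my; } cand_t;   /* hypothetical stand-in for mvsad_t */

/* Keep every element with sad <= thresh, preserving order; return the new count. */
static int compact_below_thresh( cand_t *v, int n, int thresh )
{
    int i = 0, j;
    while( i < n && v[i].sad <= thresh )   /* skip the already-compact prefix */
        i++;
    for( j = i; j < n; j++ )
    {
        v[i] = v[j];                       /* unconditional copy into next free slot */
        i += v[j].sad <= thresh;           /* advance only if the element survives */
    }
    return i;
}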
- if( nmvsad > limit )
+ while( nmvsad > limit )
{
- for( i=0; i<limit; i++ )
- {
- int bj = i;
- int bsad = mvsads[bj].sad;
- for( j=i+1; j<nmvsad; j++ )
- COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
- if( bj > i )
- {
- if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
- XCHG( uint64_t, *(uint64_t*)&mvsads[i], *(uint64_t*)&mvsads[bj] );
- else
- XCHG( mvsad_t, mvsads[i], mvsads[bj] );
- }
- }
- nmvsad = limit;
+ int bsad = mvsads[0].sad;
+ int bi = 0;
+ for( i=1; i<nmvsad; i++ )
+ COPY2_IF_GT( bsad, mvsads[i].sad, bi, i );
+ nmvsad--;
+ if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+ *(uint64_t*)&mvsads[bi] = *(uint64_t*)&mvsads[nmvsad];
+ else
+ mvsads[bi] = mvsads[nmvsad];
}
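/* Compared with the old partial selection sort, the loop above simply evicts the
 * single worst-SAD candidate until only `limit` remain, which is cheaper when
 * nmvsad only slightly exceeds the limit; the surviving set is the same either
 * way, and its order no longer matters since every entry is re-costed below. */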
for( i=0; i<nmvsad; i++ )
COST_MV( mvsads[i].mx, mvsads[i].my );
{
const int bw = x264_pixel_size[m->i_pixel].w;
const int bh = x264_pixel_size[m->i_pixel].h;
- const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
- const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
+ const uint16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
+ const uint16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
const int i_pixel = m->i_pixel;
const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
- DECLARE_ALIGNED_16( uint8_t pix[2][32*18] ); // really 17x17, but round up for alignment
+ ALIGNED_ARRAY_16( uint8_t, pix,[2],[32*18] ); // really 17x17, but round up for alignment
int omx, omy;
int i;
const int i_pixel = m0->i_pixel;
const int bw = x264_pixel_size[i_pixel].w;
const int bh = x264_pixel_size[i_pixel].h;
- const int16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
- const int16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
- const int16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
- const int16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
- DECLARE_ALIGNED_16( uint8_t pixy_buf[2][9][16*16] );
- DECLARE_ALIGNED_8( uint8_t pixu_buf[2][9][8*8] );
- DECLARE_ALIGNED_8( uint8_t pixv_buf[2][9][8*8] );
+ const uint16_t *p_cost_m0x = m0->p_cost_mv - m0->mvp[0];
+ const uint16_t *p_cost_m0y = m0->p_cost_mv - m0->mvp[1];
+ const uint16_t *p_cost_m1x = m1->p_cost_mv - m1->mvp[0];
+ const uint16_t *p_cost_m1y = m1->p_cost_mv - m1->mvp[1];
+ ALIGNED_ARRAY_16( uint8_t, pixy_buf,[2],[9][16*16] );
+ ALIGNED_8( uint8_t pixu_buf[2][9][8*8] );
+ ALIGNED_8( uint8_t pixv_buf[2][9][8*8] );
uint8_t *src0[9];
uint8_t *src1[9];
uint8_t *pix = &h->mb.pic.p_fdec[0][(i8>>1)*8*FDEC_STRIDE+(i8&1)*8];
int mc_list0 = 1, mc_list1 = 1;
uint64_t bcostrd = COST_MAX64;
/* each byte of visited represents 8 possible m1y positions, so a 4D array isn't needed */
- DECLARE_ALIGNED_16( uint8_t visited[8][8][8] );
+ ALIGNED_ARRAY_16( uint8_t, visited,[8],[8][8] );
/* all permutations of an offset in up to 2 of the dimensions */
static const int8_t dia4d[32][4] = {
{0,0,0,1}, {0,0,0,-1}, {0,0,1,0}, {0,0,-1,0},
bm0y > h->mb.mv_max_spel[1] - 8 || bm1y > h->mb.mv_max_spel[1] - 8 )
return;
- h->mc.memzero_aligned( visited, sizeof(visited) );
+ h->mc.memzero_aligned( visited, sizeof(uint8_t[8][8][8]) );
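+ /* sizeof(visited) is no longer usable here: with the ALIGNED_ARRAY_* emulation,
+  * `visited` may be a pointer into an over-allocated buffer rather than a true
+  * array, so the size is spelled out as an explicit array type instead. */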
BIME_CACHE( 0, 0, 0 );
BIME_CACHE( 0, 0, 1 );
static const int pixel_mv_offs[] = { 0, 4, 4*8, 0, 2, 2*8, 0 };
int16_t *cache_mv = h->mb.cache.mv[i_list][x264_scan8[i4]];
int16_t *cache_mv2 = cache_mv + pixel_mv_offs[m->i_pixel];
- const int16_t *p_cost_mvx, *p_cost_mvy;
+ const uint16_t *p_cost_mvx, *p_cost_mvy;
const int bw = x264_pixel_size[m->i_pixel].w>>2;
const int bh = x264_pixel_size[m->i_pixel].h>>2;
const int i_pixel = m->i_pixel;
- DECLARE_ALIGNED_16( uint8_t pix[16*16] );
- uint64_t bcost = m->i_pixel == PIXEL_16x16 ? m->cost : COST_MAX64;
+ ALIGNED_ARRAY_16( uint8_t, pix,[16*16] );
+ uint64_t bcost = COST_MAX64;
int bmx = m->mv[0];
int bmy = m->mv[1];
int omx, omy, pmx, pmy, i, j;
p_cost_mvx = m->p_cost_mv - pmx;
p_cost_mvy = m->p_cost_mv - pmy;
COST_MV_SATD( bmx, bmy, bsatd, 0 );
- COST_MV_RD( bmx, bmy, 0, 0, 0 );
+ if( m->i_pixel != PIXEL_16x16 )
+ COST_MV_RD( bmx, bmy, 0, 0, 0 )
+ else
+ bcost = m->cost;
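+ /* For 16x16 the caller appears to have already left a valid RD cost for the
+  * starting vector in m->cost, so it is reused instead of re-running COST_MV_RD
+  * on (bmx,bmy); other partition sizes still evaluate the start point. (The
+  * missing semicolon after COST_MV_RD is deliberate: the macro presumably
+  * expands to a braced block, so a ';' would break the if/else.) */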
/* check the predicted mv */
if( (bmx != pmx || bmy != pmy)
}
}
- /* square refine, same as pattern as ME HEX. */
+ /* square refine, same pattern as ME HEX. */
omx = bmx;
omy = bmy;
for( i=0; i<8; i++ ) COST_MV_SATD( omx + square1[i+1][0], omy + square1[i+1][1], satds[i], 1 );