+            /* Uneven-cross Multi-Hexagon-grid Search
+             * as in JM, except with different early termination */
+
+            /* per-partition right-shift applied to SAD thresholds, so one fixed
+             * threshold constant scales down with smaller block sizes (used by
+             * SAD_THRESH below; indexed by i_pixel) */
+            static const int x264_pixel_size_shift[7] = { 0, 1, 1, 2, 3, 3, 4 };
+
+            int ucost1, ucost2;
+            int cross_start = 1;
+
+            /* refine predictors */
+            ucost1 = bcost;
+            DIA1_ITER( pmx, pmy );
+            /* pmx|pmy is nonzero iff the predictor isn't (0,0), so also try (0,0) */
+            if( pmx | pmy )
+                DIA1_ITER( 0, 0 );
+
+            /* 4x4 partitions skip the UMH machinery entirely and go straight to
+             * hexagon refinement (me_hex2 label is outside this hunk) */
+            if(i_pixel == PIXEL_4x4)
+                goto me_hex2;
+
+            /* also try a small diamond around the current best mv, but only if it
+             * differs from both (0,0) and the predictor, which were covered above */
+            ucost2 = bcost;
+            if( (bmx | bmy) && ((bmx-pmx) | (bmy-pmy)) )
+                DIA1_ITER( bmx, bmy );
+            /* no improvement from that diamond: its points already cover the first
+             * two steps of the cross pattern, so start the cross farther out */
+            if( bcost == ucost2 )
+                cross_start = 3;
+            omx = bmx; omy = bmy;
+
+            /* early termination */
+/* NOTE(review): the macro argument v is expanded unparenthesized; every call
+ * site in this hunk passes a literal constant, so this is safe as used. */
+#define SAD_THRESH(v) ( bcost < ( v >> x264_pixel_size_shift[i_pixel] ) )
+            if( bcost == ucost2 && SAD_THRESH(2000) )
+            {
+                /* big diamond at distance 2 around the current best */
+                COST_MV_X4( 0,-2, -1,-1,  1,-1, -2,0 );
+                COST_MV_X4( 2, 0, -1, 1,  1, 1,  0,2 );
+                /* still no improvement since the very first refinement and the cost
+                 * is tiny: give up (break exits the enclosing loop/switch, outside
+                 * this hunk) */
+                if( bcost == ucost1 && SAD_THRESH(500) )
+                    break;
+                if( bcost == ucost2 )
+                {
+                    /* odd-valued half range for the early cross search */
+                    int range = (i_me_range>>1) | 1;
+                    CROSS( 3, range, range );
+                    /* 8-point octagon at distance ~2 */
+                    COST_MV_X4( -1,-2,  1,-2, -2,-1,  2,-1 );
+                    COST_MV_X4( -2, 1,  2, 1, -1, 2,  1, 2 );
+                    if( bcost == ucost2 )
+                        break;
+                    cross_start = range + 2;
+                }
+            }
+
+            /* adaptive search range */
+            if( i_mvc )
+            {
+                /* range multipliers based on casual inspection of some statistics of
+                 * average distance between current predictor and final mv found by ESA.
+                 * these have not been tuned much by actual encoding. */
+                static const int range_mul[4][4] =
+                {
+                    { 3, 3, 4, 4 },
+                    { 3, 4, 4, 4 },
+                    { 4, 4, 4, 5 },
+                    { 4, 4, 5, 6 },
+                };
+                int mvd;
+                int sad_ctx, mvd_ctx;
+                int denom = 1;
+
+                if( i_mvc == 1 )
+                {
+                    if( i_pixel == PIXEL_16x16 )
+                        /* mvc is probably the same as mvp, so the difference isn't meaningful.
+                         * but prediction usually isn't too bad, so just use medium range */
+                        mvd = 25;
+                    else
+                        mvd = abs( m->mvp[0] - mvc[0][0] )
+                            + abs( m->mvp[1] - mvc[0][1] );
+                }
+                else
+                {
+                    /* calculate the degree of agreement between predictors. */
+                    /* in 16x16, mvc includes all the neighbors used to make mvp,
+                     * so don't count mvp separately. */
+                    denom = i_mvc - 1;
+                    mvd = 0;
+                    if( i_pixel != PIXEL_16x16 )
+                    {
+                        mvd = abs( m->mvp[0] - mvc[0][0] )
+                            + abs( m->mvp[1] - mvc[0][1] );
+                        denom++;
+                    }
+                    mvd += x264_predictor_difference( mvc, i_mvc );
+                }
+
+                /* classify by current best cost (scaled per partition size) and by
+                 * average predictor disagreement; each picks a row/column in range_mul */
+                sad_ctx = SAD_THRESH(1000) ? 0
+                        : SAD_THRESH(2000) ? 1
+                        : SAD_THRESH(4000) ? 2 : 3;
+                mvd_ctx = mvd < 10*denom ? 0
+                        : mvd < 20*denom ? 1
+                        : mvd < 40*denom ? 2 : 3;
+
+                i_me_range = i_me_range * range_mul[mvd_ctx][sad_ctx] / 4;
+            }
+
+            /* FIXME if the above DIA2/OCT2/CROSS found a new mv, it has not updated omx/omy.
+             * we are still centered on the same place as the DIA2. is this desirable? */
+            /* full cross search: horizontal extent i_me_range, vertical half that */
+            CROSS( cross_start, i_me_range, i_me_range/2 );
+
+            /* the four corners at distance 2 not covered by the cross */
+            COST_MV_X4( -2,-2, -2,2, 2,-2, 2,2 );
+
+            /* hexagon grid */
+            omx = bmx; omy = bmy;
+
+            i = 1;
+            do
+            {
+                /* 16-point hexagon of nominal radius 4, scaled by i each pass */
+                static const int hex4[16][2] = {
+                    {-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
+                    { 4,-2}, { 4,-1}, { 4, 0}, { 4, 1}, { 4, 2},
+                    { 2, 3}, { 0, 4}, {-2, 3},
+                    {-2,-3}, { 0,-4}, { 2,-3},
+                };
+
+                /* if the scaled hexagon can poke outside the legal mv range,
+                 * clip-check every point individually... */
+                if( 4*i > X264_MIN4( mv_x_max-omx, omx-mv_x_min,
+                                     mv_y_max-omy, omy-mv_y_min ) )
+                {
+                    for( j = 0; j < 16; j++ )
+                    {
+                        int mx = omx + hex4[j][0]*i;
+                        int my = omy + hex4[j][1]*i;
+                        if( CHECK_MVRANGE(mx, my) )
+                            COST_MV( mx, my );
+                    }
+                }
+                else
+                {
+                    /* ...otherwise cost all 16 points in batches of 4, no clipping */
+                    COST_MV_X4( -4*i, 2*i, -4*i, 1*i, -4*i, 0*i, -4*i,-1*i );
+                    COST_MV_X4( -4*i,-2*i,  4*i,-2*i,  4*i,-1*i,  4*i, 0*i );
+                    COST_MV_X4(  4*i, 1*i,  4*i, 2*i,  2*i, 3*i,  0*i, 4*i );
+                    COST_MV_X4( -2*i, 3*i, -2*i,-3*i,  0*i,-4*i,  2*i,-3*i );
+                }
+            /* grow the hexagon until its radius reaches the search range */
+            } while( ++i <= i_me_range/4 );
+            /* finish with hexagon refinement around the winner; NOTE(review): the
+             * bmy <= mv_y_max guard looks like it should always hold for an
+             * in-range mv — confirm intent against the me_hex2 label's context */
+            if( bmy <= mv_y_max )
+                goto me_hex2;
+            break;
+        }
+
+    case X264_ME_ESA:
+    case X264_ME_TESA:
+        {
+            /* exhaustive search window, clamped to the legal mv range */
+            const int min_x = X264_MAX( bmx - i_me_range, mv_x_min );
+            const int min_y = X264_MAX( bmy - i_me_range, mv_y_min );
+            const int max_x = X264_MIN( bmx + i_me_range, mv_x_max );
+            const int max_y = X264_MIN( bmy + i_me_range, mv_y_max );
+            /* SEA is fastest in multiples of 4 */
+            const int width = (max_x - min_x + 3) & ~3;
+            int my;
+#if 0
+            /* plain old exhaustive search */
+            int mx;
+            for( my = min_y; my <= max_y; my++ )
+                for( mx = min_x; mx <= max_x; mx++ )
+                    COST_MV( mx, my );
+#else
+            /* successive elimination by comparing DC before a full SAD,
+             * because sum(abs(diff)) >= abs(diff(sum)). */
+            const int stride = m->i_stride[0];
+            uint16_t *sums_base = m->integral;
+            /* due to a GCC bug on some platforms (win32?), zero[] may not actually be aligned.
+             * unlike the similar case in ratecontrol.c, this is not a problem because it is not used for any
+             * SSE instructions and the only loss is a tiny bit of performance. */
+            DECLARE_ALIGNED_16( static uint8_t zero[8*FENC_STRIDE] );
+            DECLARE_ALIGNED_16( int enc_dc[4] );
+            int sad_size = i_pixel <= PIXEL_8x8 ? PIXEL_8x8 : PIXEL_4x4;
+            int delta = x264_pixel_size[sad_size].w;
+            int16_t *xs = h->scratch_buffer;
+            int xn;
+            /* fullpel mv-cost row, recentred so indexing by mx gives the cost of
+             * the x component relative to the predictor */
+            uint16_t *cost_fpel_mvx = x264_cost_mv_fpel[h->mb.i_qp][-m->mvp[0]&3] + (-m->mvp[0]>>2);
+
+            /* SAD of four source sub-blocks against an all-zero block = the DC sums
+             * of the source, for the SEA lower bound */
+            h->pixf.sad_x4[sad_size]( zero, m->p_fenc[0], m->p_fenc[0]+delta,
+                m->p_fenc[0]+delta*FENC_STRIDE, m->p_fenc[0]+delta+delta*FENC_STRIDE,
+                FENC_STRIDE, enc_dc );
+            /* delta==4: use the second sums plane stored after the first one
+             * (presumably the 4x4 sums after the 8x8 sums — per m->integral's layout) */
+            if( delta == 4 )
+                sums_base += stride * (h->fenc->i_lines[0] + PADV*2);
+            /* vertically-stacked sub-blocks: turn the pixel offset into a row offset */
+            if( i_pixel == PIXEL_16x16 || i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
+                delta *= stride;
+            if( i_pixel == PIXEL_8x16 || i_pixel == PIXEL_4x8 )
+                enc_dc[1] = enc_dc[2];
+
+            if( h->mb.i_me_method == X264_ME_TESA )
+            {
+                // ADS threshold, then SAD threshold, then keep the best few SADs, then SATD
+                /* candidate list shares the scratch buffer, after the xs array */
+                mvsad_t *mvsads = (mvsad_t *)(xs + ((width+15)&~15));
+                int nmvsad = 0, limit;
+                int sad_thresh = i_me_range <= 16 ? 10 : i_me_range <= 24 ? 11 : 12;
+                /* running best SAD (mv cost included), used as the ADS/SAD cutoff */
+                int bsad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+bmy*stride+bmx, stride )
+                         + BITS_MVD( bmx, bmy );
+                for( my = min_y; my <= max_y; my++ )
+                {
+                    /* my<<2 converts fullpel to the table's quarter-pel index */
+                    int ycost = p_cost_mvy[my<<2];
+                    if( bsad <= ycost )
+                        continue;
+                    /* work with the y-cost subtracted so per-x comparisons are cheap */
+                    bsad -= ycost;
+                    /* ADS pass: xs receives the x offsets whose DC-sum lower bound
+                     * passes the (slightly slackened, *17/16) threshold */
+                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+                                               cost_fpel_mvx+min_x, xs, width, bsad*17/16 );
+                    for( i=0; i<xn-2; i+=3 )
+                    {
+                        uint8_t *ref = p_fref+min_x+my*stride;
+                        int sads[3];
+                        h->pixf.sad_x3[i_pixel]( m->p_fenc[0], ref+xs[i], ref+xs[i+1], ref+xs[i+2], stride, sads );
+                        for( j=0; j<3; j++ )
+                        {
+                            int sad = sads[j] + cost_fpel_mvx[xs[i+j]];
+                            /* keep any candidate within sad_thresh/8 of the best so far */
+                            if( sad < bsad*sad_thresh>>3 )
+                            {
+                                COPY1_IF_LT( bsad, sad );
+                                mvsads[nmvsad].sad = sad + ycost;
+                                mvsads[nmvsad].mx = min_x+xs[i+j];
+                                mvsads[nmvsad].my = my;
+                                nmvsad++;
+                            }
+                        }
+                    }
+                    /* tail: fewer than 3 survivors left, cost them one at a time */
+                    for( ; i<xn; i++ )
+                    {
+                        int mx = min_x+xs[i];
+                        int sad = h->pixf.sad[i_pixel]( m->p_fenc[0], FENC_STRIDE, p_fref+mx+my*stride, stride )
+                                + cost_fpel_mvx[xs[i]];
+                        if( sad < bsad*sad_thresh>>3 )
+                        {
+                            COPY1_IF_LT( bsad, sad );
+                            mvsads[nmvsad].sad = sad + ycost;
+                            mvsads[nmvsad].mx = mx;
+                            mvsads[nmvsad].my = my;
+                            nmvsad++;
+                        }
+                    }
+                    /* restore the y cost for the next row's comparison */
+                    bsad += ycost;
+                }
+
+                limit = i_me_range / 2;
+                if( nmvsad > limit*2 )
+                {
+                    // halve the range if the domain is too large... eh, close enough
+                    /* tighten the SAD cutoff and compact the list in place:
+                     * the first loop finds the first entry over the cutoff,
+                     * the second shifts surviving later entries down */
+                    bsad = bsad*(sad_thresh+8)>>4;
+                    for( i=0; i<nmvsad && mvsads[i].sad <= bsad; i++ );
+                    for( j=i; j<nmvsad; j++ )
+                        if( mvsads[j].sad <= bsad )
+                        {
+                            /* mvsad_t is not guaranteed to be 8 bytes on all archs, so check before using explicit write-combining */
+                            if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+                                *(uint64_t*)&mvsads[i++] = *(uint64_t*)&mvsads[j];
+                            else
+                                mvsads[i++] = mvsads[j];
+                        }
+                    nmvsad = i;
+                }
+                if( nmvsad > limit )
+                {
+                    /* partial selection sort: move the `limit` lowest-SAD entries
+                     * to the front, then truncate */
+                    for( i=0; i<limit; i++ )
+                    {
+                        int bj = i;
+                        int bsad = mvsads[bj].sad;
+                        for( j=i+1; j<nmvsad; j++ )
+                            COPY2_IF_LT( bsad, mvsads[j].sad, bj, j );
+                        if( bj > i )
+                        {
+                            if( sizeof( mvsad_t ) == sizeof( uint64_t ) )
+                                XCHG( uint64_t, *(uint64_t*)&mvsads[i], *(uint64_t*)&mvsads[bj] );
+                            else
+                                XCHG( mvsad_t, mvsads[i], mvsads[bj] );
+                        }
+                    }
+                    nmvsad = limit;
+                }
+                /* rank the survivors with COST_MV's full metric (defined elsewhere
+                 * in this file; per the comment above, SATD at this stage) */
+                for( i=0; i<nmvsad; i++ )
+                    COST_MV( mvsads[i].mx, mvsads[i].my );
+            }
+            else
+            {
+                // just ADS and SAD
+                for( my = min_y; my <= max_y; my++ )
+                {
+                    int ycost = p_cost_mvy[my<<2];
+                    if( bcost <= ycost )
+                        continue;
+                    bcost -= ycost;
+                    xn = h->pixf.ads[i_pixel]( enc_dc, sums_base + min_x + my * stride, delta,
+                                               cost_fpel_mvx+min_x, xs, width, bcost );
+                    for( i=0; i<xn-2; i+=3 )
+                        COST_MV_X3_ABS( min_x+xs[i],my, min_x+xs[i+1],my, min_x+xs[i+2],my );
+                    /* NOTE(review): ycost is restored before the tail loop because
+                     * COST_MV_X3_ABS appears to use absolute costs while COST_MV
+                     * compares against the full bcost — confirm against the macro
+                     * definitions (outside this hunk) */
+                    bcost += ycost;
+                    for( ; i<xn; i++ )
+                        COST_MV( min_x+xs[i], my );
+                }
+            }
+#endif