git.sesse.net Git - x264/blob - encoder/me.c

   1 /*****************************************************************************
   2  * me.c: h264 encoder library (Motion Estimation)
   3  *****************************************************************************
   4  * Copyright (C) 2003 Laurent Aimar
   5  * $Id: me.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
   6  *
   7  * Authors: Laurent Aimar <fenrir@via.ecp.fr>
   8  *          Loren Merritt <lorenm@u.washington.edu>
   9  *
  10  * This program is free software; you can redistribute it and/or modify
  11  * it under the terms of the GNU General Public License as published by
  12  * the Free Software Foundation; either version 2 of the License, or
  13  * (at your option) any later version.
  14  *
  15  * This program is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  18  * GNU General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU General Public License
  21  * along with this program; if not, write to the Free Software
  22  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  23  *****************************************************************************/
  24
  25 #include <stdlib.h>
  26 #include <stdio.h>
  27 #include <string.h>
  28
  29 #include "common/common.h"
  30 #include "me.h"
  31
  32 /* presets selected from good points on the speed-vs-quality curve of several test videos
  33  * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel }
  34  * where me_* are the number of EPZS iterations run on all candidate block types,
  35  * and refine_* are run only on the winner. */
  36 static const int subpel_iterations[][4] =
  37    {{1,0,0,0},
  38     {1,1,0,0},
  39     {0,1,1,0},
  40     {0,2,1,0},
  41     {0,2,1,1},
  42     {0,2,1,2},
  43     {0,0,2,3}};
  44
  45 static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
  46
  47 #define COST_MV_INT( mx, my, bd, d ) \
  48 { \
  49     int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], m->i_stride[0],     \
  50                    &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] ) \
  51              + p_cost_mvx[ (mx)<<2 ]  \
  52              + p_cost_mvy[ (my)<<2 ]; \
  53     if( cost < bcost ) \
  54     {                  \
  55         bcost = cost;  \
  56         bmx = mx;      \
  57         bmy = my;      \
  58         if( bd ) \
  59             dir = d; \
  60     } \
  61 }
  62 #define COST_MV( mx, my )         COST_MV_INT( mx, my, 0, 0 )
  63 #define COST_MV_DIR( mx, my, d )  COST_MV_INT( mx, my, 1, d )
  64
  65 #define DIA1_ITER( mx, my )\
  66     {\
  67         omx = mx; omy = my;\
  68         COST_MV( omx  , omy-1 );/*  1  */\
  69         COST_MV( omx  , omy+1 );/* 101 */\
  70         COST_MV( omx-1, omy   );/*  1  */\
  71         COST_MV( omx+1, omy   );\
  72     }
  73
  74
  75 void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
  76 {
  77     const int i_pixel = m->i_pixel;
  78     const int i_me_range = h->param.analyse.i_me_range;
  79     int bmx, bmy, bcost;
  80     int omx, omy, pmx, pmy;
  81     uint8_t *p_fref = m->p_fref[0];
  82     int i, j;
  83     int dir;
  84
  85     int mv_x_min = h->mb.mv_min_fpel[0];
  86     int mv_y_min = h->mb.mv_min_fpel[1];
  87     int mv_x_max = h->mb.mv_max_fpel[0];
  88     int mv_y_max = h->mb.mv_max_fpel[1];
  89
  90     const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
  91     const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
  92
  93     if( h->mb.i_me_method == X264_ME_UMH )
  94     {
  95         /* clamp mvp to inside frame+padding, so that we don't have to check it each iteration */
  96         p_cost_mvx = m->p_cost_mv - x264_clip3( m->mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] );
  97         p_cost_mvy = m->p_cost_mv - x264_clip3( m->mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] );
  98     }
  99
 100     bmx = pmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, mv_x_min, mv_x_max );
 101     bmy = pmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, mv_y_min, mv_y_max );
 102     bcost = COST_MAX;
 103     COST_MV( pmx, pmy );
 104     /* I don't know why this helps */
 105     bcost -= p_cost_mvx[ bmx<<2 ] + p_cost_mvy[ bmy<<2 ];
 106
 107     /* try extra predictors if provided */
 108     for( i = 0; i < i_mvc; i++ )
 109     {
 110         const int mx = x264_clip3( ( mvc[i][0] + 2 ) >> 2, mv_x_min, mv_x_max );
 111         const int my = x264_clip3( ( mvc[i][1] + 2 ) >> 2, mv_y_min, mv_y_max );
 112         if( mx != bmx || my != bmy )
 113             COST_MV( mx, my );
 114     }
 115
 116     COST_MV( 0, 0 );
 117
 118     mv_x_max += 8;
 119     mv_y_max += 8;
 120     mv_x_min -= 8;
 121     mv_y_min -= 8;
 122
 123     switch( h->mb.i_me_method )
 124     {
 125     case X264_ME_DIA:
 126         /* diamond search, radius 1 */
 127         for( i = 0; i < i_me_range; i++ )
 128         {
 129             DIA1_ITER( bmx, bmy );
 130             if( bmx == omx && bmy == omy )
 131                 break;
 132         }
 133         break;
 134
 135     case X264_ME_HEX:
 136 me_hex2:
 137         /* hexagon search, radius 2 */
 138 #if 0
 139         for( i = 0; i < i_me_range/2; i++ )
 140         {
 141             omx = bmx; omy = bmy;
 142             COST_MV( omx-2, omy   );
 143             COST_MV( omx-1, omy+2 );
 144             COST_MV( omx+1, omy+2 );
 145             COST_MV( omx+2, omy   );
 146             COST_MV( omx+1, omy-2 );
 147             COST_MV( omx-1, omy-2 );
 148             if( bmx == omx && bmy == omy )
 149                 break;
 150         }
 151 #else
 152         /* equivalent to the above, but eliminates duplicate candidates */
 153         dir = -1;
 154         omx = bmx; omy = bmy;
 155         COST_MV_DIR( omx-2, omy,   0 );
 156         COST_MV_DIR( omx-1, omy+2, 1 );
 157         COST_MV_DIR( omx+1, omy+2, 2 );
 158         COST_MV_DIR( omx+2, omy,   3 );
 159         COST_MV_DIR( omx+1, omy-2, 4 );
 160         COST_MV_DIR( omx-1, omy-2, 5 );
 161         if( dir != -1 )
 162         {
 163             for( i = 1; i < i_me_range/2; i++ )
 164             {
 165                 static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
 166                 static const int mod6[8] = {5,0,1,2,3,4,5,0};
 167                 const int odir = mod6[dir+1];
 168                 omx = bmx; omy = bmy;
 169                 COST_MV_DIR( omx + hex2[odir+0][0], omy + hex2[odir+0][1], odir-1 );
 170                 COST_MV_DIR( omx + hex2[odir+1][0], omy + hex2[odir+1][1], odir   );
 171                 COST_MV_DIR( omx + hex2[odir+2][0], omy + hex2[odir+2][1], odir+1 );
 172                 if( bmx == omx && bmy == omy )
 173                     break;
 174             }
 175         }
 176 #endif
 177         /* square refine */
 178         DIA1_ITER( bmx, bmy );
 179         COST_MV( omx-1, omy-1 );
 180         COST_MV( omx-1, omy+1 );
 181         COST_MV( omx+1, omy-1 );
 182         COST_MV( omx+1, omy+1 );
 183         break;
 184
 185     case X264_ME_UMH:
 186         {
 187             /* Uneven-cross Multi-Hexagon-grid Search
 188              * as in JM, except without early termination */
 189
 190             int ucost;
 191             int cross_start;
 192
 193             /* refine predictors */
 194             DIA1_ITER( pmx, pmy );
 195             if( pmx || pmy )
 196                 DIA1_ITER( 0, 0 );
 197
 198             if(i_pixel == PIXEL_4x4)
 199                 goto me_hex2;
 200
 201             ucost = bcost;
 202             if( (bmx || bmy) && (bmx!=pmx || bmy!=pmy) )
 203                 DIA1_ITER( bmx, bmy );
 204
 205             /* cross */
 206             omx = bmx; omy = bmy;
 207             cross_start = ( bcost == ucost ) ? 3 : 1;
 208             for( i = cross_start; i < i_me_range; i+=2 )
 209             {
 210                 if( omx + i <= mv_x_max )
 211                     COST_MV( omx + i, omy );
 212                 if( omx - i >= mv_x_min )
 213                     COST_MV( omx - i, omy );
 214             }
 215             for( i = cross_start; i < i_me_range/2; i+=2 )
 216             {
 217                 if( omy + i <= mv_y_max )
 218                     COST_MV( omx, omy + i );
 219                 if( omy - i >= mv_y_min )
 220                     COST_MV( omx, omy - i );
 221             }
 222
 223             /* 5x5 ESA */
 224             omx = bmx; omy = bmy;
 225             for( i = (bcost == ucost) ? 4 : 0; i < 24; i++ )
 226             {
 227                 static const int square2[24][2] = {
 228                     { 1, 0}, { 0, 1}, {-1, 0}, { 0,-1},
 229                     { 1, 1}, {-1, 1}, {-1,-1}, { 1,-1},
 230                     { 2,-1}, { 2, 0}, { 2, 1}, { 2, 2},
 231                     { 1, 2}, { 0, 2}, {-1, 2}, {-2, 2},
 232                     {-2, 1}, {-2, 0}, {-2,-1}, {-2,-2},
 233                     {-1,-2}, { 0,-2}, { 1,-2}, { 2,-2}
 234                 };
 235                 COST_MV( omx + square2[i][0], omy + square2[i][1] );
 236             }
 237             /* hexagon grid */
 238             omx = bmx; omy = bmy;
 239             for( i = 1; i <= i_me_range/4; i++ )
 240             {
 241                 int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min );
 242                 for( j = 0; j < 16; j++ )
 243                 {
 244                     static const int hex4[16][2] = {
 245                         {-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
 246                         { 4,-2}, { 4,-1}, { 4, 0}, { 4, 1}, { 4, 2},
 247                         { 2, 3}, { 0, 4}, {-2, 3},
 248                         {-2,-3}, { 0,-4}, { 2,-3},
 249                     };
 250                     int mx = omx + hex4[j][0]*i;
 251                     int my = omy + hex4[j][1]*i;
 252                     if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
 253                                         && my >= mv_y_min && my <= mv_y_max ) )
 254                         COST_MV( mx, my );
 255                 }
 256             }
 257             goto me_hex2;
 258         }
 259
 260     case X264_ME_ESA:
 261         {
 262             const int min_x = X264_MAX( bmx - i_me_range, mv_x_min);
 263             const int min_y = X264_MAX( bmy - i_me_range, mv_y_min);
 264             const int max_x = X264_MIN( bmx + i_me_range, mv_x_max);
 265             const int max_y = X264_MIN( bmy + i_me_range, mv_y_max);
 266             for( omy = min_y; omy <= max_y; omy++ )
 267                 for( omx = min_x; omx <= max_x; omx++ )
 268                 {
 269                     COST_MV( omx, omy );
 270                 }
 271         }
 272         break;
 273     }
 274
 275     /* -> qpel mv */
 276     m->mv[0] = bmx << 2;
 277     m->mv[1] = bmy << 2;
 278
 279     /* compute the real cost */
 280     m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ];
 281     m->cost = bcost;
 282     if( bmx == pmx && bmy == pmy )
 283         m->cost += m->cost_mv;
 284
 285     /* subpel refine */
 286     if( h->mb.i_subpel_refine >= 2 )
 287     {
 288         int hpel = subpel_iterations[h->mb.i_subpel_refine][2];
 289         int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
 290         refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
 291     }
 292 }
 293 #undef COST_MV
 294
 295 void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
 296 {
 297     int hpel = subpel_iterations[h->mb.i_subpel_refine][0];
 298     int qpel = subpel_iterations[h->mb.i_subpel_refine][1];
 299
 300     if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
 301         m->cost -= m->i_ref_cost;
 302
 303     refine_subpel( h, m, hpel, qpel, NULL, 1 );
 304 }
 305
 306 #define COST_MV_SAD( mx, my ) \
 307 { \
 308     int stride = 16; \
 309     uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
 310     int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], m->i_stride[0], src, stride ) \
 311              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
 312     if( cost < bcost ) \
 313     {                  \
 314         bcost = cost;  \
 315         bmx = mx;      \
 316         bmy = my;      \
 317     } \
 318 }
 319
 320 #define COST_MV_SATD( mx, my, dir ) \
 321 if( b_refine_qpel || (dir^1) != odir ) \
 322 { \
 323     int stride = 16; \
 324     uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
 325     int cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], m->i_stride[0], src, stride ) \
 326              + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
 327     if( b_chroma_me && cost < bcost ) \
 328     { \
 329         h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, mx, my, bw/2, bh/2 ); \
 330         cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 ); \
 331         if( cost < bcost ) \
 332         { \
 333             h->mc.mc_chroma( m->p_fref[5], m->i_stride[1], pix, 8, mx, my, bw/2, bh/2 ); \
 334             cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix, 8 ); \
 335         } \
 336     } \
 337     if( cost < bcost ) \
 338     {                  \
 339         bcost = cost;  \
 340         bmx = mx;      \
 341         bmy = my;      \
 342         bdir = dir;    \
 343     } \
 344 }
 345
 346 static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel )
 347 {
 348     const int bw = x264_pixel_size[m->i_pixel].w;
 349     const int bh = x264_pixel_size[m->i_pixel].h;
 350     const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
 351     const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
 352     const int i_pixel = m->i_pixel;
 353     const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
 354
 355     DECLARE_ALIGNED( uint8_t, pix[16*16], 16 );
 356     int omx, omy;
 357     int i;
 358
 359     int bmx = m->mv[0];
 360     int bmy = m->mv[1];
 361     int bcost = m->cost;
 362     int odir = -1, bdir;
 363
 364
 365     /* try the subpel component of the predicted mv if it's close to
 366      * the result of the fullpel search */
 367     if( hpel_iters )
 368     {
 369         int mx = x264_clip3( m->mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] );
 370         int my = x264_clip3( m->mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] );
 371         if( mx != bmx || my != bmy )
 372             COST_MV_SAD( mx, my );
 373     }
 374
 375     /* hpel search */
 376     for( i = hpel_iters; i > 0; i-- )
 377     {
 378         omx = bmx;
 379         omy = bmy;
 380         COST_MV_SAD( omx, omy - 2 );
 381         COST_MV_SAD( omx, omy + 2 );
 382         COST_MV_SAD( omx - 2, omy );
 383         COST_MV_SAD( omx + 2, omy );
 384         if( bmx == omx && bmy == omy )
 385             break;
 386     }
 387
 388     if( !b_refine_qpel )
 389     {
 390         bcost = COST_MAX;
 391         COST_MV_SATD( bmx, bmy, -1 );
 392     }
 393
 394     /* early termination when examining multiple reference frames */
 395     if( p_halfpel_thresh )
 396     {
 397         if( (bcost*7)>>3 > *p_halfpel_thresh )
 398         {
 399             m->cost = bcost;
 400             m->mv[0] = bmx;
 401             m->mv[1] = bmy;
 402             // don't need cost_mv
 403             return;
 404         }
 405         else if( bcost < *p_halfpel_thresh )
 406             *p_halfpel_thresh = bcost;
 407     }
 408
 409     /* qpel search */
 410     bdir = -1;
 411     for( i = qpel_iters; i > 0; i-- )
 412     {
 413         odir = bdir;
 414         omx = bmx;
 415         omy = bmy;
 416         COST_MV_SATD( omx, omy - 1, 0 );
 417         COST_MV_SATD( omx, omy + 1, 1 );
 418         COST_MV_SATD( omx - 1, omy, 2 );
 419         COST_MV_SATD( omx + 1, omy, 3 );
 420         if( bmx == omx && bmy == omy )
 421             break;
 422     }
 423
 424     m->cost = bcost;
 425     m->mv[0] = bmx;
 426     m->mv[1] = bmy;
 427     m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ];
 428 }
 429