1 /*****************************************************************************
2 * me.c: h264 encoder library (Motion Estimation)
3 *****************************************************************************
4 * Copyright (C) 2003 Laurent Aimar
5 * $Id: me.c,v 1.1 2004/06/03 19:27:08 fenrir Exp $
7 * Authors: Laurent Aimar <fenrir@via.ecp.fr>
8 * Loren Merritt <lorenm@u.washington.edu>
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
23 *****************************************************************************/
29 #include "common/common.h"
32 /* presets selected from good points on the speed-vs-quality curve of several test videos
33 * subpel_iters[i_subpel_refine] = { refine_hpel, refine_qpel, me_hpel, me_qpel }
34 * where me_* are the number of EPZS iterations run on all candidate block types,
35 * and refine_* are run only on the winner. */
36 static const int subpel_iterations[][4] =
45 static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel );
47 #define COST_MV_INT( mx, my, bd, d ) \
49 int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], m->i_stride[0], \
50 &p_fref[(my)*m->i_stride[0]+(mx)], m->i_stride[0] ) \
51 + p_cost_mvx[ (mx)<<2 ] \
52 + p_cost_mvy[ (my)<<2 ]; \
62 #define COST_MV( mx, my ) COST_MV_INT( mx, my, 0, 0 )
63 #define COST_MV_DIR( mx, my, d ) COST_MV_INT( mx, my, 1, d )
65 #define DIA1_ITER( mx, my )\
68 COST_MV( omx , omy-1 );/* 1 */\
69 COST_MV( omx , omy+1 );/* 101 */\
70 COST_MV( omx-1, omy );/* 1 */\
71 COST_MV( omx+1, omy );\
75 void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
77 const int i_pixel = m->i_pixel;
78 const int i_me_range = h->param.analyse.i_me_range;
80 int omx, omy, pmx, pmy;
81 uint8_t *p_fref = m->p_fref[0];
85 int mv_x_min = h->mb.mv_min_fpel[0];
86 int mv_y_min = h->mb.mv_min_fpel[1];
87 int mv_x_max = h->mb.mv_max_fpel[0];
88 int mv_y_max = h->mb.mv_max_fpel[1];
90 const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
91 const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
93 if( h->mb.i_me_method == X264_ME_UMH )
95 /* clamp mvp to inside frame+padding, so that we don't have to check it each iteration */
96 p_cost_mvx = m->p_cost_mv - x264_clip3( m->mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] );
97 p_cost_mvy = m->p_cost_mv - x264_clip3( m->mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] );
100 bmx = pmx = x264_clip3( ( m->mvp[0] + 2 ) >> 2, mv_x_min, mv_x_max );
101 bmy = pmy = x264_clip3( ( m->mvp[1] + 2 ) >> 2, mv_y_min, mv_y_max );
104 /* I don't know why this helps */
105 bcost -= p_cost_mvx[ bmx<<2 ] + p_cost_mvy[ bmy<<2 ];
107 /* try extra predictors if provided */
108 for( i = 0; i < i_mvc; i++ )
110 const int mx = x264_clip3( ( mvc[i][0] + 2 ) >> 2, mv_x_min, mv_x_max );
111 const int my = x264_clip3( ( mvc[i][1] + 2 ) >> 2, mv_y_min, mv_y_max );
112 if( mx != bmx || my != bmy )
123 switch( h->mb.i_me_method )
126 /* diamond search, radius 1 */
127 for( i = 0; i < i_me_range; i++ )
129 DIA1_ITER( bmx, bmy );
130 if( bmx == omx && bmy == omy )
137 /* hexagon search, radius 2 */
139 for( i = 0; i < i_me_range/2; i++ )
141 omx = bmx; omy = bmy;
142 COST_MV( omx-2, omy );
143 COST_MV( omx-1, omy+2 );
144 COST_MV( omx+1, omy+2 );
145 COST_MV( omx+2, omy );
146 COST_MV( omx+1, omy-2 );
147 COST_MV( omx-1, omy-2 );
148 if( bmx == omx && bmy == omy )
152 /* equivalent to the above, but eliminates duplicate candidates */
154 omx = bmx; omy = bmy;
155 COST_MV_DIR( omx-2, omy, 0 );
156 COST_MV_DIR( omx-1, omy+2, 1 );
157 COST_MV_DIR( omx+1, omy+2, 2 );
158 COST_MV_DIR( omx+2, omy, 3 );
159 COST_MV_DIR( omx+1, omy-2, 4 );
160 COST_MV_DIR( omx-1, omy-2, 5 );
163 for( i = 1; i < i_me_range/2; i++ )
165 static const int hex2[8][2] = {{-1,-2}, {-2,0}, {-1,2}, {1,2}, {2,0}, {1,-2}, {-1,-2}, {-2,0}};
166 static const int mod6[8] = {5,0,1,2,3,4,5,0};
167 const int odir = mod6[dir+1];
168 omx = bmx; omy = bmy;
169 COST_MV_DIR( omx + hex2[odir+0][0], omy + hex2[odir+0][1], odir-1 );
170 COST_MV_DIR( omx + hex2[odir+1][0], omy + hex2[odir+1][1], odir );
171 COST_MV_DIR( omx + hex2[odir+2][0], omy + hex2[odir+2][1], odir+1 );
172 if( bmx == omx && bmy == omy )
178 DIA1_ITER( bmx, bmy );
179 COST_MV( omx-1, omy-1 );
180 COST_MV( omx-1, omy+1 );
181 COST_MV( omx+1, omy-1 );
182 COST_MV( omx+1, omy+1 );
187 /* Uneven-cross Multi-Hexagon-grid Search
188 * as in JM, except without early termination */
193 /* refine predictors */
194 DIA1_ITER( pmx, pmy );
198 if(i_pixel == PIXEL_4x4)
202 if( (bmx || bmy) && (bmx!=pmx || bmy!=pmy) )
203 DIA1_ITER( bmx, bmy );
206 omx = bmx; omy = bmy;
207 cross_start = ( bcost == ucost ) ? 3 : 1;
208 for( i = cross_start; i < i_me_range; i+=2 )
210 if( omx + i <= mv_x_max )
211 COST_MV( omx + i, omy );
212 if( omx - i >= mv_x_min )
213 COST_MV( omx - i, omy );
215 for( i = cross_start; i < i_me_range/2; i+=2 )
217 if( omy + i <= mv_y_max )
218 COST_MV( omx, omy + i );
219 if( omy - i >= mv_y_min )
220 COST_MV( omx, omy - i );
224 omx = bmx; omy = bmy;
225 for( i = (bcost == ucost) ? 4 : 0; i < 24; i++ )
227 static const int square2[24][2] = {
228 { 1, 0}, { 0, 1}, {-1, 0}, { 0,-1},
229 { 1, 1}, {-1, 1}, {-1,-1}, { 1,-1},
230 { 2,-1}, { 2, 0}, { 2, 1}, { 2, 2},
231 { 1, 2}, { 0, 2}, {-1, 2}, {-2, 2},
232 {-2, 1}, {-2, 0}, {-2,-1}, {-2,-2},
233 {-1,-2}, { 0,-2}, { 1,-2}, { 2,-2}
235 COST_MV( omx + square2[i][0], omy + square2[i][1] );
238 omx = bmx; omy = bmy;
239 for( i = 1; i <= i_me_range/4; i++ )
241 int bounds_check = 4*i > X264_MIN4( mv_x_max-omx, mv_y_max-omy, omx-mv_x_min, omy-mv_y_min );
242 for( j = 0; j < 16; j++ )
244 static const int hex4[16][2] = {
245 {-4, 2}, {-4, 1}, {-4, 0}, {-4,-1}, {-4,-2},
246 { 4,-2}, { 4,-1}, { 4, 0}, { 4, 1}, { 4, 2},
247 { 2, 3}, { 0, 4}, {-2, 3},
248 {-2,-3}, { 0,-4}, { 2,-3},
250 int mx = omx + hex4[j][0]*i;
251 int my = omy + hex4[j][1]*i;
252 if( !bounds_check || ( mx >= mv_x_min && mx <= mv_x_max
253 && my >= mv_y_min && my <= mv_y_max ) )
262 const int min_x = X264_MAX( bmx - i_me_range, mv_x_min);
263 const int min_y = X264_MAX( bmy - i_me_range, mv_y_min);
264 const int max_x = X264_MIN( bmx + i_me_range, mv_x_max);
265 const int max_y = X264_MIN( bmy + i_me_range, mv_y_max);
266 for( omy = min_y; omy <= max_y; omy++ )
267 for( omx = min_x; omx <= max_x; omx++ )
279 /* compute the real cost */
280 m->cost_mv = p_cost_mvx[ m->mv[0] ] + p_cost_mvy[ m->mv[1] ];
282 if( bmx == pmx && bmy == pmy )
283 m->cost += m->cost_mv;
286 if( h->mb.i_subpel_refine >= 2 )
288 int hpel = subpel_iterations[h->mb.i_subpel_refine][2];
289 int qpel = subpel_iterations[h->mb.i_subpel_refine][3];
290 refine_subpel( h, m, hpel, qpel, p_halfpel_thresh, 0 );
295 void x264_me_refine_qpel( x264_t *h, x264_me_t *m )
297 int hpel = subpel_iterations[h->mb.i_subpel_refine][0];
298 int qpel = subpel_iterations[h->mb.i_subpel_refine][1];
300 if( m->i_pixel <= PIXEL_8x8 && h->sh.i_type == SLICE_TYPE_P )
301 m->cost -= m->i_ref_cost;
303 refine_subpel( h, m, hpel, qpel, NULL, 1 );
306 #define COST_MV_SAD( mx, my ) \
309 uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
310 int cost = h->pixf.sad[i_pixel]( m->p_fenc[0], m->i_stride[0], src, stride ) \
311 + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
320 #define COST_MV_SATD( mx, my, dir ) \
321 if( b_refine_qpel || (dir^1) != odir ) \
324 uint8_t *src = h->mc.get_ref( m->p_fref, m->i_stride[0], pix, &stride, mx, my, bw, bh ); \
325 int cost = h->pixf.mbcmp[i_pixel]( m->p_fenc[0], m->i_stride[0], src, stride ) \
326 + p_cost_mvx[ mx ] + p_cost_mvy[ my ]; \
327 if( b_chroma_me && cost < bcost ) \
329 h->mc.mc_chroma( m->p_fref[4], m->i_stride[1], pix, 8, mx, my, bw/2, bh/2 ); \
330 cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[1], m->i_stride[1], pix, 8 ); \
333 h->mc.mc_chroma( m->p_fref[5], m->i_stride[1], pix, 8, mx, my, bw/2, bh/2 ); \
334 cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], m->i_stride[1], pix, 8 ); \
346 static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel )
348 const int bw = x264_pixel_size[m->i_pixel].w;
349 const int bh = x264_pixel_size[m->i_pixel].h;
350 const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
351 const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];
352 const int i_pixel = m->i_pixel;
353 const int b_chroma_me = h->mb.b_chroma_me && i_pixel <= PIXEL_8x8;
355 DECLARE_ALIGNED( uint8_t, pix[16*16], 16 );
365 /* try the subpel component of the predicted mv if it's close to
366 * the result of the fullpel search */
369 int mx = x264_clip3( m->mvp[0], h->mb.mv_min[0], h->mb.mv_max[0] );
370 int my = x264_clip3( m->mvp[1], h->mb.mv_min[1], h->mb.mv_max[1] );
371 if( mx != bmx || my != bmy )
372 COST_MV_SAD( mx, my );
376 for( i = hpel_iters; i > 0; i-- )
380 COST_MV_SAD( omx, omy - 2 );
381 COST_MV_SAD( omx, omy + 2 );
382 COST_MV_SAD( omx - 2, omy );
383 COST_MV_SAD( omx + 2, omy );
384 if( bmx == omx && bmy == omy )
391 COST_MV_SATD( bmx, bmy, -1 );
394 /* early termination when examining multiple reference frames */
395 if( p_halfpel_thresh )
397 if( (bcost*7)>>3 > *p_halfpel_thresh )
402 // don't need cost_mv
405 else if( bcost < *p_halfpel_thresh )
406 *p_halfpel_thresh = bcost;
411 for( i = qpel_iters; i > 0; i-- )
416 COST_MV_SATD( omx, omy - 1, 0 );
417 COST_MV_SATD( omx, omy + 1, 1 );
418 COST_MV_SATD( omx - 1, omy, 2 );
419 COST_MV_SATD( omx + 1, omy, 3 );
420 if( bmx == omx && bmy == omy )
427 m->cost_mv = p_cost_mvx[ bmx ] + p_cost_mvy[ bmy ];