From: Fiona Glaser <fiona@x264.com>
Date: Sat, 10 Apr 2010 07:35:50 +0000 (-0700)
Subject: Special case in qpel refine for subme=1
X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=cec7764a9a3749d6f67ea25af3082178e4d70d34;p=x264

Special case in qpel refine for subme=1
~15-20% faster qpel refine with subme=1.
Some minor cleanups in refine_supel.
---

diff --git a/encoder/me.c b/encoder/me.c
index b0e16a56..e85b419b 100644
--- a/encoder/me.c
+++ b/encoder/me.c
@@ -788,13 +788,7 @@ if( b_refine_qpel || (dir^1) != odir ) \
             cost += h->pixf.mbcmp[i_pixel+3]( m->p_fenc[2], FENC_STRIDE, pix[0], 8 ); \
         } \
     } \
-    if( cost < bcost ) \
-    {                  \
-        bcost = cost;  \
-        bmx = mx;      \
-        bmy = my;      \
-        bdir = dir;    \
-    } \
+    COPY4_IF_LT( bcost, cost, bmx, mx, bmy, my, bdir, dir ); \
 }
 
 static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_iters, int *p_halfpel_thresh, int b_refine_qpel )
@@ -865,19 +859,38 @@ static void refine_subpel( x264_t *h, x264_me_t *m, int hpel_iters, int qpel_ite
     }
 
     /* quarterpel diamond search */
-    bdir = -1;
-    for( int i = qpel_iters; i > 0; i-- )
+    if( h->mb.i_subpel_refine > 1 )
     {
-        if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] || bmx <= h->mb.mv_min_spel[0] || bmx >= h->mb.mv_max_spel[0] )
-            break;
-        odir = bdir;
+        bdir = -1;
+        for( int i = qpel_iters; i > 0; i-- )
+        {
+            if( bmy <= h->mb.mv_min_spel[1] || bmy >= h->mb.mv_max_spel[1] || bmx <= h->mb.mv_min_spel[0] || bmx >= h->mb.mv_max_spel[0] )
+                break;
+            odir = bdir;
+            int omx = bmx, omy = bmy;
+            COST_MV_SATD( omx, omy - 1, 0 );
+            COST_MV_SATD( omx, omy + 1, 1 );
+            COST_MV_SATD( omx - 1, omy, 2 );
+            COST_MV_SATD( omx + 1, omy, 3 );
+            if( (bmx == omx) & (bmy == omy) )
+                break;
+        }
+    }
+    /* Special simplified case for subme=1 */
+    else if( bmy > h->mb.mv_min_spel[1] && bmy < h->mb.mv_max_spel[1] && bmx > h->mb.mv_min_spel[0] && bmx < h->mb.mv_max_spel[0] )
+    {
+        int costs[4];
         int omx = bmx, omy = bmy;
-        COST_MV_SATD( omx, omy - 1, 0 );
-        COST_MV_SATD( omx, omy + 1, 1 );
-        COST_MV_SATD( omx - 1, omy, 2 );
-        COST_MV_SATD( omx + 1, omy, 3 );
-        if( bmx == omx && bmy == omy )
-            break;
+        /* We have to use mc_luma because all strides must be the same to use fpelcmp_x4 */
+        h->mc.mc_luma( pix[0]   , 32, m->p_fref, m->i_stride[0], omx, omy-1, bw, bh, &m->weight[0] );
+        h->mc.mc_luma( pix[0]+16, 32, m->p_fref, m->i_stride[0], omx, omy+1, bw, bh, &m->weight[0] );
+        h->mc.mc_luma( pix[1]   , 32, m->p_fref, m->i_stride[0], omx-1, omy, bw, bh, &m->weight[0] );
+        h->mc.mc_luma( pix[1]+16, 32, m->p_fref, m->i_stride[0], omx+1, omy, bw, bh, &m->weight[0] );
+        h->pixf.fpelcmp_x4[i_pixel]( m->p_fenc[0], pix[0], pix[0]+16, pix[1], pix[1]+16, 32, costs );
+        COPY2_IF_LT( bcost, costs[0] + p_cost_mvx[omx  ] + p_cost_mvy[omy-1], bmy, omy-1 );
+        COPY2_IF_LT( bcost, costs[1] + p_cost_mvx[omx  ] + p_cost_mvy[omy+1], bmy, omy+1 );
+        COPY3_IF_LT( bcost, costs[2] + p_cost_mvx[omx-1] + p_cost_mvy[omy  ], bmx, omx-1, bmy, omy );
+        COPY3_IF_LT( bcost, costs[3] + p_cost_mvx[omx+1] + p_cost_mvy[omy  ], bmx, omx+1, bmy, omy );
     }
 
     m->cost = bcost;