git.sesse.net Git - x264/blob - common/opencl/motionsearch.cl

   1 /* Hierarchical (iterative) OpenCL lowres motion search */
   2
   3 inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
   4 {
   5     /* edge macroblocks might not have a direct descendant, use nearest */
   6     x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 );
   7     y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 );
   8     return (mb_width>>1) * y + x;
   9 }
  10
  11 /* Four threads calculate an 8x8 SAD.  Each does two rows */
  12 int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
  13 {
  14     frefpos.y += idx << 1;
  15     fencpos.y += idx << 1;
  16     int cost = 0;
  17     if( frefpos.x < 0 )
  18     {
  19         /* slow path when MV goes past left edge.  The GPU clamps reads from
  20          * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really
  21          * want are [0, 0, 1, 2]
  22          */
  23         for( int y = 0; y < 2; y++ )
  24         {
  25             for( int x = 0; x < 8; x++ )
  26             {
  27                 pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
  28                 pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
  29                 cost += abs_diff( enc, ref );
  30             }
  31         }
  32     }
  33     else
  34     {
  35         uint4 enc, ref, costs = 0;
  36         enc = read_imageui( fenc, sampler, fencpos );
  37         ref = read_imageui( fref, sampler, frefpos );
  38         costs += abs_diff( enc, ref );
  39         enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
  40         ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
  41         costs += abs_diff( enc, ref );
  42         enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
  43         ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
  44         costs += abs_diff( enc, ref );
  45         enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
  46         ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
  47         costs += abs_diff( enc, ref );
  48         cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
  49     }
  50     costs[idx] = cost;
  51     return costs[0] + costs[1] + costs[2] + costs[3];
  52 }
  53
  54 /* One thread performs 8x8 SAD */
  55 int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
  56 {
  57     if( frefpos.x < 0 )
  58     {
  59         /* slow path when MV goes past left edge */
  60         int cost = 0;
  61         for( int y = 0; y < 8; y++ )
  62         {
  63             for( int x = 0; x < 8; x++ )
  64             {
  65                 uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
  66                 uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
  67                 cost += abs_diff( enc, ref );
  68             }
  69         }
  70         return cost;
  71     }
  72     else
  73     {
  74         uint4 enc, ref, cost = 0;
  75         for( int y = 0; y < 8; y++ )
  76         {
  77             for( int x = 0; x < 8; x += 4 )
  78             {
  79                 enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
  80                 ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
  81                 cost += abs_diff( enc, ref );
  82             }
  83         }
  84         return cost.s0 + cost.s1 + cost.s2 + cost.s3;
  85     }
  86 }
  87 /*
  88  * hierarchical motion estimation
  89  *
  90  * Each kernel launch is a single iteration
  91  *
  92  * MB per work group is determined by lclx / 4 * lcly
  93  *
  94  * global launch dimensions:  [mb_width * 4, mb_height]
  95  */
  96 kernel void hierarchical_motion( read_only image2d_t  fenc,
  97                                  read_only image2d_t  fref,
  98                                  const global short2 *in_mvs,
  99                                  global short2       *out_mvs,
 100                                  global int16_t      *out_mv_costs,
 101                                  global short2       *mvp_buffer,
 102                                  local int16_t       *cost_local,
 103                                  local short2        *mvc_local,
 104                                  int                  mb_width,
 105                                  int                  lambda,
 106                                  int                  me_range,
 107                                  int                  scale,
 108                                  int                  b_shift_index,
 109                                  int                  b_first_iteration,
 110                                  int                  b_reverse_references )
 111 {
 112     int mb_x = get_global_id( 0 ) >> 2;
 113     if( mb_x >= mb_width )
 114         return;
 115     int mb_height = get_global_size( 1 );
 116     int mb_i = get_global_id( 0 ) & 3;
 117     int mb_y = get_global_id( 1 );
 118     int mb_xy = mb_y * mb_width + mb_x;
 119     const int mb_size = 8;
 120     int2 coord = (int2)(mb_x, mb_y) * mb_size;
 121
 122     const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
 123     cost_local += 4 * mb_in_group;
 124
 125     int i_mvc = 0;
 126     mvc_local += 4 * mb_in_group;
 127     mvc_local[mb_i] = 0;
 128     int2 mvp =0;
 129
 130     if( !b_first_iteration )
 131     {
 132 #define MVC( DX, DY )\
 133     {\
 134         int px = mb_x + DX;\
 135         int py = mb_y + DY;\
 136         mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \
 137                                            in_mvs[mb_width * py + px];\
 138         mvc_local[i_mvc] >>= (short) scale;\
 139         i_mvc++;\
 140     }
 141         /* Find MVP from median of MVCs */
 142         if( b_reverse_references )
 143         {
 144             /* odd iterations: derive MVP from down and right */
 145             if( mb_x < mb_width - 1 )
 146                 MVC( 1, 0 );
 147             if( mb_y < mb_height - 1 )
 148             {
 149                 MVC( 0, 1 );
 150                 if( mb_x > b_shift_index )
 151                     MVC( -1, 1 );
 152                 if( mb_x < mb_width - 1 )
 153                     MVC( 1, 1 );
 154             }
 155         }
 156         else
 157         {
 158             /* even iterations: derive MVP from up and left */
 159             if( mb_x > 0 )
 160                 MVC( -1, 0 );
 161             if( mb_y > 0 )
 162             {
 163                 MVC( 0, -1 );
 164                 if( mb_x < mb_width - 1 )
 165                     MVC( 1, -1 );
 166                 if( mb_x > b_shift_index )
 167                     MVC( -1, -1 );
 168             }
 169         }
 170 #undef MVC
 171         mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
 172     }
 173     /* current mvp matches the previous mvp and we have not changed scale.  We know
 174      * we're going to arrive at the same MV again, so just copy the previous
 175      * result to our output. */
 176     if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y )
 177     {
 178         out_mvs[mb_xy] = in_mvs[mb_xy];
 179         return;
 180     }
 181     mvp_buffer[mb_xy] = convert_short2_sat(mvp);
 182     int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4;
 183     int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4;
 184
 185     int2 bestmv = clamp(mvp, mv_min, mv_max);
 186     int2 refcrd = coord + bestmv;
 187
 188     /* measure cost at bestmv */
 189     int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
 190                 lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) );
 191
 192     do
 193     {
 194         /* measure costs at offsets from bestmv */
 195         refcrd = coord + bestmv + dia_offs[mb_i];
 196         int2 trymv = bestmv + dia_offs[mb_i];
 197         int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
 198                    lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );
 199
 200         cost_local[mb_i] = (cost<<2) | mb_i;
 201         cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
 202
 203         if( (cost >> 2) >= bcost )
 204             break;
 205
 206         bestmv += dia_offs[cost&3];
 207         bcost = cost>>2;
 208
 209         if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
 210             break;
 211     }
 212     while( --me_range > 0 );
 213
 214     int2 trymv = 0, diff = 0;
 215
 216 #define COST_MV_NO_PAD( L )\
 217     trymv = clamp( trymv, mv_min, mv_max );\
 218     diff = convert_int2_sat(abs_diff( mvp, trymv ));\
 219     if( diff.x > 1 || diff.y > 1 ) {\
 220         int2 refcrd = coord + trymv;\
 221         int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
 222                    L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\
 223         if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
 224
 225     COST_MV_NO_PAD( 0 );
 226
 227     if( !b_first_iteration )
 228     {
 229         /* try cost at previous iteration's MV, if MVP was too far away */
 230         int2 prevmv = b_shift_index ? convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]);
 231         prevmv >>= scale;
 232         trymv = prevmv;
 233         COST_MV_NO_PAD( lambda );
 234     }
 235
 236     for( int i = 0; i < i_mvc; i++ )
 237     {
 238         /* try cost at each candidate MV, if MVP was too far away */
 239         trymv = convert_int2_sat( mvc_local[i] );
 240         COST_MV_NO_PAD( lambda );
 241     }
 242
 243     if( mb_i == 0 )
 244     {
 245         bestmv <<= scale;
 246         out_mvs[mb_xy] = convert_short2_sat(bestmv);
 247         out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
 248     }
 249 }