git.sesse.net Git - x264/blob - common/opencl/bidir.cl

   1 /* Mode selection routines, select the least SATD cost mode for each lowres
   2  * macroblock.  When measuring B slices, this includes measuring the cost of
   3  * three bidir modes.  */
   4
   5 /* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
   6 int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres,
   7                              int2 fencpos,
   8                              read_only image2d_t fref0_planes,
   9                              int2 qpos0,
  10                              read_only image2d_t fref1_planes,
  11                              int2 qpos1,
  12                              int weight,
  13                              local sum2_t *tmpp,
  14                              int idx )
  15 {
  16     volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
  17     sum2_t b0, b1, b2, b3;
  18     sum2_t sum = 0;
  19
  20     // fencpos is full-pel position of original MB
  21     // qpos0 is qpel position within reference frame 0
  22     // qpos1 is qpel position within reference frame 1
  23
  24     int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
  25     int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
  26
  27     int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
  28     int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
  29     int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
  30
  31     int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
  32     int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
  33
  34     int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
  35     int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
  36     int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
  37
  38     uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B;
  39     uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B;
  40
  41     uint vA, vB;
  42     uint enc, ref0, ref1;
  43     uint a0, a1;
  44     const int weight2 = 64 - weight;
  45
  46 #define READ_BIDIR_DIFF( OUT, X )\
  47     enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
  48     vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\
  49     vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\
  50     ref0 = rhadd( vA, vB );\
  51     vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\
  52     vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\
  53     ref1 = rhadd( vA, vB );\
  54     OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
  55
  56 #define READ_DIFF_EX( OUT, a, b )\
  57     READ_BIDIR_DIFF( a0, a );\
  58     READ_BIDIR_DIFF( a1, b );\
  59     OUT = a0 + (a1<<BITS_PER_SUM);
  60
  61 #define ROW_8x4_SATD( a, b, c )\
  62     fencpos.y += a;\
  63     fref0Apos.y += b;\
  64     fref0Bpos.y += b;\
  65     fref1Apos.y += c;\
  66     fref1Bpos.y += c;\
  67     READ_DIFF_EX( b0, 0, 4 );\
  68     READ_DIFF_EX( b1, 1, 5 );\
  69     READ_DIFF_EX( b2, 2, 6 );\
  70     READ_DIFF_EX( b3, 3, 7 );\
  71     HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
  72     HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
  73     sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
  74
  75     ROW_8x4_SATD( 0, 0, 0 );
  76     ROW_8x4_SATD( 4, 4, 4 );
  77
  78 #undef READ_BIDIR_DIFF
  79 #undef READ_DIFF_EX
  80 #undef ROW_8x4_SATD
  81
  82     return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
  83 }
  84
  85 /*
  86  * mode selection - pick the least cost partition type for each 8x8 macroblock.
  87  * Intra, list0 or list1.  When measuring a B slice, also test three bidir
  88  * possibilities.
  89  *
  90  * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that
  91  * hold many frames worth of motion vectors.  We must offset into the correct
  92  * location for this frame's vectors:
  93  *
  94  *   CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
  95  *   GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count]
  96  *
  97  * global launch dimensions for P slice estimate:  [mb_width, mb_height]
  98  * global launch dimensions for B slice estimate:  [mb_width * 4, mb_height]
  99  */
 100 kernel void mode_selection( read_only image2d_t   fenc_lowres,
 101                             read_only image2d_t   fref0_planes,
 102                             read_only image2d_t   fref1_planes,
 103                             const global short2  *fenc_lowres_mvs0,
 104                             const global short2  *fenc_lowres_mvs1,
 105                             const global short2  *fref1_lowres_mvs0,
 106                             const global int16_t *fenc_lowres_mv_costs0,
 107                             const global int16_t *fenc_lowres_mv_costs1,
 108                             const global uint16_t *fenc_intra_cost,
 109                             global uint16_t      *lowres_costs,
 110                             global int           *frame_stats,
 111                             local int16_t        *cost_local,
 112                             local sum2_t         *satd_local,
 113                             int                   mb_width,
 114                             int                   bipred_weight,
 115                             int                   dist_scale_factor,
 116                             int                   b,
 117                             int                   p0,
 118                             int                   p1,
 119                             int                   lambda )
 120 {
 121     int mb_x = get_global_id( 0 );
 122     int b_bidir = b < p1;
 123     if( b_bidir )
 124     {
 125         /* when mode_selection is run for B frames, it must perform BIDIR SATD
 126          * measurements, so it is launched with four times as many threads in
 127          * order to spread the work around more of the GPU.  And it can add
 128          * padding threads in the X direction. */
 129         mb_x >>= 2;
 130         if( mb_x >= mb_width )
 131             return;
 132     }
 133     int mb_y = get_global_id( 1 );
 134     int mb_height = get_global_size( 1 );
 135     int mb_count = mb_width * mb_height;
 136     int mb_xy = mb_x + mb_y * mb_width;
 137
 138     /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */
 139     if( mb_x < 4 && mb_y == 0 )
 140         frame_stats[mb_x] = 0;
 141
 142     int bcost = COST_MAX;
 143     int list_used = 0;
 144
 145     if( !b_bidir )
 146     {
 147         int icost = fenc_intra_cost[mb_xy];
 148         COPY2_IF_LT( bcost, icost, list_used, 0 );
 149     }
 150     if( b != p0 )
 151     {
 152         int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
 153         COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
 154     }
 155     if( b != p1 )
 156     {
 157         int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
 158         COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
 159     }
 160
 161     if( b_bidir )
 162     {
 163         int2 coord = (int2)(mb_x, mb_y) << 3;
 164         int mb_i = get_global_id( 0 ) & 3;
 165         int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
 166         cost_local += mb_in_group * 4;
 167         satd_local += mb_in_group * 16;
 168
 169 #define TRY_BIDIR( mv0, mv1, penalty )\
 170 {\
 171     int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
 172     int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
 173     cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
 174     int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
 175     COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
 176 }
 177
 178         /* temporal prediction */
 179         short2 dmv0, dmv1;
 180         short2 mvr = fref1_lowres_mvs0[mb_xy];
 181         dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8;
 182         dmv1 = dmv0 - mvr;
 183         TRY_BIDIR( dmv0, dmv1, 0 )
 184
 185         if( as_uint( dmv0 ) || as_uint( dmv1 ) )
 186         {
 187             /* B-direct prediction */
 188             dmv0 = 0; dmv1 = 0;
 189             TRY_BIDIR( dmv0, dmv1, 0 );
 190         }
 191
 192         /* L0+L1 prediction */
 193         dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
 194         dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
 195         TRY_BIDIR( dmv0, dmv1, 5 );
 196 #undef TRY_BIDIR
 197     }
 198
 199     lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
 200 }
 201
 202 /*
 203  * parallel sum inter costs
 204  *
 205  * global launch dimensions: [256, mb_height]
 206  */
 207 kernel void sum_inter_cost( const global uint16_t *fenc_lowres_costs,
 208                             const global uint16_t *inv_qscale_factor,
 209                             global int           *fenc_row_satds,
 210                             global int           *frame_stats,
 211                             int                   mb_width,
 212                             int                   bframe_bias,
 213                             int                   b,
 214                             int                   p0,
 215                             int                   p1 )
 216 {
 217     int y = get_global_id( 1 );
 218     int mb_height = get_global_size( 1 );
 219
 220     int row_satds = 0;
 221     int cost_est = 0;
 222     int cost_est_aq = 0;
 223     int intra_mbs = 0;
 224
 225     for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
 226     {
 227         int mb_xy = x + y * mb_width;
 228         int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
 229         int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
 230         int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
 231
 232         if( list == 0 && b_frame_score_mb )
 233             intra_mbs++;
 234
 235         int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
 236
 237         row_satds += cost_aq;
 238
 239         if( b_frame_score_mb )
 240         {
 241             cost_est += cost;
 242             cost_est_aq += cost_aq;
 243         }
 244     }
 245
 246     local int buffer[256];
 247     int x = get_global_id( 0 );
 248
 249     row_satds   = parallel_sum( row_satds, x, buffer );
 250     cost_est    = parallel_sum( cost_est, x, buffer );
 251     cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
 252     intra_mbs   = parallel_sum( intra_mbs, x, buffer );
 253
 254     if( b != p1 )
 255         // Use floating point math to avoid 32bit integer overflow conditions
 256         cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias));
 257
 258     if( get_global_id( 0 ) == 0 )
 259     {
 260         fenc_row_satds[y] = row_satds;
 261         atomic_add( frame_stats + COST_EST, cost_est );
 262         atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
 263         atomic_add( frame_stats + INTRA_MBS, intra_mbs );
 264     }
 265 }