git.sesse.net Git - x264/blob - common/opencl/subpel.cl

   1 /* OpenCL lowres subpel Refine */
   2
   3 /* Each thread performs 8x8 SAD.  4 threads per MB, so the 4 DIA HPEL offsets are
   4  * calculated simultaneously */
   5 int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
   6 {
   7     int2 frefpos = qpos >> 2;
   8     int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
   9     uint mask_shift = 8 * hpel_idx;
  10
  11     uint4 cost4 = 0;
  12
  13     for( int y = 0; y < 8; y++ )
  14     {
  15         uint4 enc, val4;
  16         enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
  17         val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF;
  18         val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF;
  19         val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF;
  20         val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF;
  21         cost4 += abs_diff( enc, val4 );
  22
  23         enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
  24         val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF;
  25         val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF;
  26         val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF;
  27         val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF;
  28         cost4 += abs_diff( enc, val4 );
  29     }
  30
  31     return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
  32 }
  33
  34 /* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
  35 int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
  36 {
  37     int2 frefApos = qpos >> 2;
  38     int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
  39
  40     int2 qposB = qpos + ((qpos & 1) << 1);
  41     int2 frefBpos = qposB >> 2;
  42     int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
  43
  44     uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
  45
  46     int cost = 0;
  47
  48     for( int y = 0; y < 8; y++ )
  49     {
  50         for( int x = 0; x < 8; x++ )
  51         {
  52             uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
  53             uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF;
  54             uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF;
  55             cost += abs_diff( enc, rhadd( vA, vB ) );
  56         }
  57     }
  58
  59     return cost;
  60 }
  61
  62 /* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
  63  *
  64  * Each thread collects 1/4 of the rows of diffs and processes one quarter of
  65  * the transforms
  66  */
  67 int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc,
  68                             int2 fencpos,
  69                             read_only image2d_t fref_planes,
  70                             int2 qpos,
  71                             local sum2_t *tmpp,
  72                             int idx )
  73 {
  74     volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
  75     sum2_t b0, b1, b2, b3;
  76
  77     // fencpos is full-pel position of original MB
  78     // qpos is qpel position within reference frame
  79     int2 frefApos = qpos >> 2;
  80     int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);
  81
  82     int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
  83     int2 frefBpos = qposB >> 2;
  84     int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);
  85
  86     uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
  87
  88     uint vA, vB;
  89     uint a0, a1;
  90     uint enc;
  91     sum2_t sum = 0;
  92
  93 #define READ_DIFF( OUT, X )\
  94     enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
  95     vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\
  96     vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\
  97     OUT = enc - rhadd( vA, vB );
  98
  99 #define READ_DIFF_EX( OUT, a, b )\
 100     {\
 101         READ_DIFF( a0, a );\
 102         READ_DIFF( a1, b );\
 103         OUT = a0 + (a1<<BITS_PER_SUM);\
 104     }
 105 #define ROW_8x4_SATD( a, b )\
 106     {\
 107         fencpos.y += a;\
 108         frefApos.y += b;\
 109         frefBpos.y += b;\
 110         READ_DIFF_EX( b0, 0, 4 );\
 111         READ_DIFF_EX( b1, 1, 5 );\
 112         READ_DIFF_EX( b2, 2, 6 );\
 113         READ_DIFF_EX( b3, 3, 7 );\
 114         HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
 115         HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
 116         sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
 117     }
 118     ROW_8x4_SATD( 0, 0 );
 119     ROW_8x4_SATD( 4, 4 );
 120
 121 #undef READ_DIFF
 122 #undef READ_DIFF_EX
 123 #undef ROW_8x4_SATD
 124     return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
 125 }
 126
 127 constant int2 hpoffs[4] =
 128 {
 129     {0, -2}, {-2, 0}, {2, 0}, {0, 2}
 130 };
 131
 132 /* sub pixel refinement of motion vectors, output MVs and costs are moved from
 133  * temporary buffers into final per-frame buffer
 134  *
 135  * global launch dimensions:  [mb_width * 4, mb_height]
 136  *
 137  * With X being the source 16x16 pixels, F is the lowres pixel used by the
 138  * motion search.  We will now utilize the H V and C pixels (stored in separate
 139  * planes) to search at half-pel increments.
 140  *
 141  * X X X X X X
 142  *  F H F H F
 143  * X X X X X X
 144  *  V C V C V
 145  * X X X X X X
 146  *  F H F H F
 147  * X X X X X X
 148  *
 149  * The YX HPEL bits of the motion vector selects the plane we search in.  The
 150  * four planes are packed in the fref_planes 2D image buffer.  Each sample
 151  * returns:  s0 = F, s1 = H, s2 = V, s3 = C */
 152 kernel void subpel_refine( read_only image2d_t   fenc,
 153                            read_only image2d_t   fref_planes,
 154                            const global short2  *in_mvs,
 155                            const global int16_t *in_sad_mv_costs,
 156                            local int16_t        *cost_local,
 157                            local sum2_t         *satd_local,
 158                            local short2         *mvc_local,
 159                            global short2        *fenc_lowres_mv,
 160                            global int16_t       *fenc_lowres_mv_costs,
 161                            int                   mb_width,
 162                            int                   lambda,
 163                            int                   b,
 164                            int                   ref,
 165                            int                   b_islist1 )
 166 {
 167     int mb_x = get_global_id( 0 ) >> 2;
 168     if( mb_x >= mb_width )
 169         return;
 170     int mb_height = get_global_size( 1 );
 171
 172     int mb_i = get_global_id( 0 ) & 3;
 173     int mb_y = get_global_id( 1 );
 174     int mb_xy = mb_y * mb_width + mb_x;
 175
 176     /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that
 177      * hold many frames worth of motion vectors.  We must offset into the correct
 178      * location for this frame's vectors.  The kernel will be passed the correct
 179      * directional buffer for the direction of the search: list1 or list0
 180      *
 181      *   CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
 182      *   GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */
 183     fenc_lowres_mv +=       (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
 184     fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
 185
 186     /* Adjust pointers into local memory buffers for this thread's data */
 187     int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
 188     cost_local += mb_in_group * 4;
 189     satd_local += mb_in_group * 16;
 190     mvc_local += mb_in_group * 4;
 191
 192     int i_mvc = 0;
 193
 194     mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0;
 195
 196 #define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
 197     if( mb_x > 0 )
 198         MVC( -1, 0 );
 199     if( mb_y > 0 )
 200     {
 201         MVC( 0, -1 );
 202         if( mb_x < mb_width - 1 )
 203             MVC( 1, -1 );
 204         if( mb_x > 0 )
 205             MVC( -1, -1 );
 206     }
 207 #undef MVC
 208     int2 mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
 209
 210     int bcost =  in_sad_mv_costs[mb_xy];
 211     int2 coord = (int2)(mb_x, mb_y) << 3;
 212     int2 bmv = convert_int2_sat( in_mvs[mb_xy] );
 213
 214     /* Make mvp and bmv QPEL MV */
 215     mvp <<= 2; bmv <<= 2;
 216
 217 #define HPEL_QPEL( ARR, FUNC )\
 218     {\
 219         int2 trymv = bmv + ARR[mb_i];\
 220         int2 qpos = (coord << 2) + trymv;\
 221         int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\
 222         cost_local[mb_i] = (cost<<2) + mb_i;\
 223         cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
 224         if( (cost>>2) < bcost )\
 225         {\
 226             bmv += ARR[cost&3];\
 227             bcost = cost>>2;\
 228         }\
 229     }
 230
 231     HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
 232     HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
 233     fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv );
 234
 235     /* remeasure cost of bmv using SATD */
 236     int2 qpos = (coord << 2) + bmv;
 237     cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
 238     bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
 239     bcost += lambda * mv_cost( abs_diff( bmv, mvp ) );
 240
 241     fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
 242 }