1 /* Hierarchical (iterative) OpenCL lowres motion search */
3 inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
5 /* edge macroblocks might not have a direct descendant, use nearest */
6 x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 );
7 y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 );
8 return (mb_width>>1) * y + x;
11 /* Four threads calculate an 8x8 SAD. Each does two rows */
12 int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
14 frefpos.y += idx << 1;
15 fencpos.y += idx << 1;
19 /* slow path when MV goes past left edge. The GPU clamps reads from
20 * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really
21 * want are [0, 0, 1, 2]
23 for( int y = 0; y < 2; y++ )
25 for( int x = 0; x < 8; x++ )
27 pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
28 pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
29 cost += abs_diff( enc, ref );
35 uint4 enc, ref, costs = 0;
36 enc = read_imageui( fenc, sampler, fencpos );
37 ref = read_imageui( fref, sampler, frefpos );
38 costs += abs_diff( enc, ref );
39 enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
40 ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
41 costs += abs_diff( enc, ref );
42 enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
43 ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
44 costs += abs_diff( enc, ref );
45 enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
46 ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
47 costs += abs_diff( enc, ref );
48 cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
51 return costs[0] + costs[1] + costs[2] + costs[3];
54 /* One thread performs 8x8 SAD */
55 int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
59 /* slow path when MV goes past left edge */
61 for( int y = 0; y < 8; y++ )
63 for( int x = 0; x < 8; x++ )
65 uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
66 uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
67 cost += abs_diff( enc, ref );
74 uint4 enc, ref, cost = 0;
75 for( int y = 0; y < 8; y++ )
77 for( int x = 0; x < 8; x += 4 )
79 enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
80 ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
81 cost += abs_diff( enc, ref );
84 return cost.s0 + cost.s1 + cost.s2 + cost.s3;
88 * hierarchical motion estimation
90 * Each kernel launch is a single iteration
92 * MB per work group is determined by lclx / 4 * lcly
94 * global launch dimensions: [mb_width * 4, mb_height]
96 kernel void hierarchical_motion( read_only image2d_t fenc,
97 read_only image2d_t fref,
98 const global short2 *in_mvs,
99 global short2 *out_mvs,
100 global int16_t *out_mv_costs,
101 global short2 *mvp_buffer,
102 local int16_t *cost_local,
103 local short2 *mvc_local,
109 int b_first_iteration,
110 int b_reverse_references )
112 int mb_x = get_global_id( 0 ) >> 2;
113 if( mb_x >= mb_width )
115 int mb_height = get_global_size( 1 );
116 int mb_i = get_global_id( 0 ) & 3;
117 int mb_y = get_global_id( 1 );
118 int mb_xy = mb_y * mb_width + mb_x;
119 const int mb_size = 8;
120 int2 coord = (int2)(mb_x, mb_y) * mb_size;
122 const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
123 cost_local += 4 * mb_in_group;
126 mvc_local += 4 * mb_in_group;
130 if( !b_first_iteration )
132 #define MVC( DX, DY )\
136 mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \
137 in_mvs[mb_width * py + px];\
138 mvc_local[i_mvc] >>= (short) scale;\
141 /* Find MVP from median of MVCs */
142 if( b_reverse_references )
144 /* odd iterations: derive MVP from down and right */
145 if( mb_x < mb_width - 1 )
147 if( mb_y < mb_height - 1 )
150 if( mb_x > b_shift_index )
152 if( mb_x < mb_width - 1 )
158 /* even iterations: derive MVP from up and left */
164 if( mb_x < mb_width - 1 )
166 if( mb_x > b_shift_index )
171 mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
173 /* current mvp matches the previous mvp and we have not changed scale. We know
174 * we're going to arrive at the same MV again, so just copy the previous
175 * result to our output. */
176 if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y )
178 out_mvs[mb_xy] = in_mvs[mb_xy];
181 mvp_buffer[mb_xy] = convert_short2_sat(mvp);
182 int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4;
183 int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4;
185 int2 bestmv = clamp(mvp, mv_min, mv_max);
186 int2 refcrd = coord + bestmv;
188 /* measure cost at bestmv */
189 int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
190 lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) );
194 /* measure costs at offsets from bestmv */
195 refcrd = coord + bestmv + dia_offs[mb_i];
196 int2 trymv = bestmv + dia_offs[mb_i];
197 int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
198 lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );
200 cost_local[mb_i] = (cost<<2) | mb_i;
201 cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
203 if( (cost >> 2) >= bcost )
206 bestmv += dia_offs[cost&3];
209 if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
212 while( --me_range > 0 );
214 int2 trymv = 0, diff = 0;
216 #define COST_MV_NO_PAD( L )\
217 trymv = clamp( trymv, mv_min, mv_max );\
218 diff = convert_int2_sat(abs_diff( mvp, trymv ));\
219 if( diff.x > 1 || diff.y > 1 ) {\
220 int2 refcrd = coord + trymv;\
221 int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
222 L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\
223 if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
227 if( !b_first_iteration )
229 /* try cost at previous iteration's MV, if MVP was too far away */
230 int2 prevmv = b_shift_index ? convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]);
233 COST_MV_NO_PAD( lambda );
236 for( int i = 0; i < i_mvc; i++ )
238 /* try cost at each candidate MV, if MVP was too far away */
239 trymv = convert_int2_sat( mvc_local[i] );
240 COST_MV_NO_PAD( lambda );
246 out_mvs[mb_xy] = convert_short2_sat(bestmv);
247 out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );