1 /*****************************************************************************
2 * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
3 *****************************************************************************
4 * Copyright (C) 2012-2013 x264 project
6 * Authors: Steve Borho <sborho@multicorewareinc.com>
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22 * This program is also available under a commercial proprietary license.
23 * For more information, contact us at licensing@x264.com.
24 *****************************************************************************/
26 #include "common/common.h"
27 #include "macroblock.h"
35 void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
37 /* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
38 * in the OpenCL headers shipped with NVIDIA drivers. We need to be
39 * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
40 #define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
/* OCLCHECK( method, ... ): invoke an OpenCL entry point through the ocl
 * function table, and on any non-CL_SUCCESS status disable OpenCL for this
 * encode (h->param.b_opencl = 0), latch a fatal-error flag, and log the
 * failing method name with its status code.
 * NOTE(review): the macro body visible in this excerpt is truncated (no
 * closing brace / early-return) -- confirm against the full source file. */
42 #define OCLCHECK( method, ... )\
43 status = ocl->method( __VA_ARGS__ );\
44 if( status != CL_SUCCESS ) {\
45 h->param.b_opencl = 0;\
46 h->opencl.b_fatal_error = 1;\
47 x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
/* Block until all queued GPU work has completed (clFinish), then drain the
 * list of pending copies from the page-locked staging buffer into their
 * final host destinations, and reset both the copy list and the staging
 * buffer occupancy so they can be reused.
 * NOTE(review): braces appear elided in this excerpt of the function. */
51 void x264_opencl_flush( x264_t *h )
53 x264_opencl_function_t *ocl = h->opencl.ocl;
55 ocl->clFinish( h->opencl.queue );
57 /* Finish copies from the GPU by copying from the page-locked buffer to
58 * their final destination */
59 for( int i = 0; i < h->opencl.num_copies; i++ )
60 memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
61 h->opencl.num_copies = 0;
62 h->opencl.pl_occupancy = 0;
/* Bump-allocate 'bytes' from the page-locked staging buffer used for async
 * GPU transfers.  If the request would not fit in the remaining space, the
 * queue is flushed first (which resets pl_occupancy to 0).  The assert
 * guards against a single request larger than the whole staging buffer.
 * NOTE(review): the return statement is elided in this excerpt; presumably
 * the function returns 'ptr' -- confirm against the full source. */
65 static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
67 if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
68 x264_opencl_flush( h );
69 assert( bytes < PAGE_LOCKED_BUF_SIZE );
70 char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
71 h->opencl.pl_occupancy += bytes;
/* Upload fenc's luma plane to the GPU, build its scaled lowres images and
 * half-pel plane, and enqueue intra cost analysis plus row-sum reduction.
 * Results are read back asynchronously via the page-locked copy list and
 * land in fenc->lowres_costs[0][0], fenc->i_row_satds[0][0],
 * fenc->i_cost_est[0][0] and fenc->i_cost_est_aq[0][0] after the next
 * x264_opencl_flush().  Allocates the encoder-shared and per-frame OpenCL
 * buffers lazily on first use.  Returns -1 on buffer-allocation failure
 * (which also disables OpenCL); other elided paths presumably return 0.
 * NOTE(review): this excerpt elides braces and several lines (e.g. the
 * declarations of 'status', 'arg', 'gdim', 'ldim', 'value') -- confirm
 * against the full source file. */
75 int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
77 if( fenc->b_intra_calculated )
79 fenc->b_intra_calculated = 1;
81 x264_opencl_function_t *ocl = h->opencl.ocl;
82 int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
/* Local helpers: allocate a cl_mem buffer / 2D image, bailing out with -1
 * (and disabling OpenCL) on failure. */
84 #define CREATEBUF( out, flags, size )\
85 out = ocl->clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
86 if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
87 #define CREATEIMAGE( out, flags, pf, width, height )\
88 out = ocl->clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
89 if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
91 int mb_count = h->mb.i_mb_count;
/* Lazily allocate buffers shared across all frames (keyed off the first
 * shared buffer being NULL). */
94 if( !h->opencl.lowres_mv_costs )
96 /* Allocate shared memory buffers */
97 int width = h->mb.i_mb_width * 8 * sizeof(pixel);
98 int height = h->mb.i_mb_height * 8 * sizeof(pixel);
100 cl_image_format pixel_format;
101 pixel_format.image_channel_order = CL_R;
102 pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
103 CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
105 for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
107 pixel_format.image_channel_order = CL_RGBA;
108 pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
109 CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
114 CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
115 CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
116 CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
117 CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
118 CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
119 CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
120 CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
121 CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
122 CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
123 CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
124 CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
125 CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
/* Lazily allocate this frame's private GPU buffers. */
128 if( !fenc->opencl.intra_cost )
130 /* Allocate per-frame buffers */
131 int width = h->mb.i_mb_width * 8 * sizeof(pixel);
132 int height = h->mb.i_mb_height * 8 * sizeof(pixel);
134 cl_image_format pixel_format;
135 pixel_format.image_channel_order = CL_R;
136 pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
137 CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
139 for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
141 pixel_format.image_channel_order = CL_RGBA;
142 pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
143 CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
/* One MV/cost slot per possible reference distance (i_bframe + 1). */
147 CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
148 CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
149 CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
150 CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
151 CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
152 CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
156 /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
158 char *locked = x264_opencl_alloc_locked( h, luma_length );
159 memcpy( locked, fenc->plane[0], luma_length );
160 OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
/* Upload per-MB AQ factors when available; otherwise fill the GPU buffer
 * with a neutral value via the memset kernel. */
163 if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
165 int size = h->mb.i_mb_count * sizeof(int16_t);
166 locked = x264_opencl_alloc_locked( h, size );
167 memcpy( locked, fenc->i_inv_qscale_factor, size );
168 OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
172 /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
175 OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
176 OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
177 gdim[0] = h->mb.i_mb_count;
178 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
/* Downscale full-res luma to the 8x8-per-MB lowres image and generate the
 * half-pel plane in one kernel; grid is one work-item per lowres pixel. */
181 int stride = fenc->i_stride[0];
183 OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
184 OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
185 OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
186 OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
187 gdim[0] = 8 * h->mb.i_mb_width;
188 gdim[1] = 8 * h->mb.i_mb_height;
189 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
/* Build the remaining pyramid levels, halving each time. */
191 for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
193 /* Workaround for AMD Southern Island:
195 * Alternate kernel instances. No perf impact to this, so we do it for
196 * all GPUs. It prevents the same kernel from being enqueued
197 * back-to-back, avoiding a dependency calculation bug in the driver.
199 cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
202 OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
203 OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
206 if( gdim[0] < 16 || gdim[1] < 16 )
208 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
/* Intra analysis: width rounded up to a multiple of 32 work-items. */
212 gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
213 gdim[1] = 8*h->mb.i_mb_height;
218 /* For presets slow, slower, and placebo, check all 10 intra modes that the
219 * C lookahead supports. For faster presets, only check the most frequent 8
222 int slow = h->param.analyse.i_subpel_refine > 7;
223 OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
224 OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
225 OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
226 OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
227 OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
228 OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
229 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
/* Reduce intra costs to per-row SATD sums and frame-level stats. */
232 gdim[1] = h->mb.i_mb_height;
236 OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
237 OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
238 OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
239 OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
240 OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
241 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
/* We are about to register up to 4 deferred copies; flush early if the
 * copy list is nearly full. */
243 if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
244 x264_opencl_flush( h );
246 int size = h->mb.i_mb_count * sizeof(int16_t);
247 locked = x264_opencl_alloc_locked( h, size );
248 OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
249 h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
250 h->opencl.copies[h->opencl.num_copies].src = locked;
251 h->opencl.copies[h->opencl.num_copies].bytes = size;
252 h->opencl.num_copies++;
254 size = h->mb.i_mb_height * sizeof(int);
255 locked = x264_opencl_alloc_locked( h, size );
256 OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
257 h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
258 h->opencl.copies[h->opencl.num_copies].src = locked;
259 h->opencl.copies[h->opencl.num_copies].bytes = size;
260 h->opencl.num_copies++;
/* frame_stats holds 4 ints; the first two feed i_cost_est / i_cost_est_aq. */
262 size = sizeof(int) * 4;
263 locked = x264_opencl_alloc_locked( h, size );
264 OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
265 h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
266 h->opencl.copies[h->opencl.num_copies].src = locked;
267 h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
268 h->opencl.num_copies++;
269 h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
270 h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
271 h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
272 h->opencl.num_copies++;
/* Double-buffer swap for the shared luma/stat buffers. */
274 h->opencl.last_buf = !h->opencl.last_buf;
278 /* This function was tested empirically on a number of AMD and NV GPUs. Making a
279 * function which returns perfect launch dimensions is impossible; some
280 * applications will have self-tuning code to try many possible variables and
281 * measure the runtime. Here we simply make an educated guess based on what we
282 * know GPUs typically prefer. */
/* Heuristically choose 2D local (ldims) and global (gdims) work sizes for
 * 'kernel' on 'device': query the device limits, start from the preferred
 * work-group multiple, then shrink/adjust until the local size divides the
 * global size, fits the device maximum, and spreads across compute units.
 * NOTE(review): several lines are elided in this excerpt (e.g. the 'num_cus'
 * declaration and loop bodies) -- confirm against the full source. */
283 static void x264_optimal_launch_dims( x264_t *h, size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
285 x264_opencl_function_t *ocl = h->opencl.ocl;
286 size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
287 size_t preferred_multiple = 64;
/* Return values of these queries are deliberately ignored; the defaults
 * above are used if they fail. */
290 ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
291 ocl->clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
292 ocl->clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );
294 ldims[0] = preferred_multiple;
297 /* make ldims[1] an even divisor of gdims[1] */
298 while( gdims[1] & (ldims[1] - 1) )
303 /* make total ldims fit under the max work-group dimensions for the device */
304 while( ldims[0] * ldims[1] > max_work_group )
306 if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
312 if( ldims[0] > gdims[0] )
314 /* remove preferred multiples until we're close to gdims[0] */
315 while( gdims[0] + preferred_multiple < ldims[0] )
316 ldims[0] -= preferred_multiple;
321 /* make gdims an even multiple of ldims */
322 gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
323 gdims[0] *= ldims[0];
326 /* make ldims smaller to spread work across compute units */
327 while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
329 if( ldims[0] > preferred_multiple )
331 else if( ldims[1] > 1 )
336 /* for smaller GPUs, try not to abuse their texture cache */
337 if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
/* Run hierarchical motion estimation of frames[b] against frames[ref] on
 * the GPU: optionally apply weighted prediction to the reference planes,
 * run the HME kernel coarse-to-fine over the image pyramid, refine to
 * sub-pel, and queue an async read-back of the final lowres MVs into
 * fenc->lowres_mvs[b_islist1][...].  'w' (may be NULL) supplies weightp
 * parameters; 'lambda' is the motion cost multiplier.
 * NOTE(review): this excerpt elides braces and several declarations
 * ('status', 'arg', 'gdims', 'ldims', 'A', 'mb_width') -- confirm against
 * the full source file. */
341 int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
343 x264_opencl_function_t *ocl = h->opencl.ocl;
344 x264_frame_t *fenc = frames[b];
345 x264_frame_t *fref = frames[ref];
347 cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
348 cl_mem ref_luma_hpel;
/* WeightP path: filter the reference pyramid and hpel plane into the
 * shared weighted images, then search against those. */
351 if( w && w->weightfn )
355 gdims[0] = 8 * h->mb.i_mb_width;
356 gdims[1] = 8 * h->mb.i_mb_height;
358 /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
359 for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
362 OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
363 OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
364 OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
365 OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
366 OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
367 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
371 if( gdims[0] < 16 || gdims[1] < 16 )
376 gdims[0] = 8 * h->mb.i_mb_width;
377 gdims[1] = 8 * h->mb.i_mb_height;
379 OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
380 OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
381 OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
382 OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
383 OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
384 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
386 /* Use weighted reference planes for motion search */
387 for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
388 ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
389 ref_luma_hpel = h->opencl.weighted_luma_hpel;
393 /* Use unweighted reference planes for motion search */
394 for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
395 ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
396 ref_luma_hpel = fref->opencl.luma_hpel;
/* More refinement iterations at finer pyramid levels. */
399 const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
400 int b_first_iteration = 1;
401 int b_reverse_references = 1;
405 int mb_per_group = 0;
406 int cost_local_size = 0;
407 int mvc_local_size = 0;
/* Coarse-to-fine HME over the pyramid; skip scales too small to search. */
414 for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
416 mb_width = h->mb.i_mb_width >> scale;
418 gdims[1] = h->mb.i_mb_height >> scale;
419 if( gdims[0] < 2 || gdims[1] < 2 )
422 x264_optimal_launch_dims( h, gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
/* 4 work-items cooperate per MB; size the local scratch accordingly. */
424 mb_per_group = (ldims[0] >> 2) * ldims[1];
425 cost_local_size = 4 * mb_per_group * sizeof(int16_t);
426 mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
427 int scaled_me_range = h->param.analyse.i_me_range >> scale;
428 int b_shift_index = 1;
431 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
432 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
433 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
434 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
435 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
436 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
437 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
438 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
439 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
440 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
441 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
442 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
443 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
444 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
445 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
447 for( int iter = 0; iter < num_iterations[scale]; iter++ )
449 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
/* After each iteration: ping-pong MV buffers and update the flag args
 * in place (arg indices computed relative to the final 'arg' value). */
452 b_first_iteration = 0;
454 /* alternate top-left vs bot-right MB references at lower scales, so
455 * motion field smooths more quickly. */
457 b_reverse_references ^= 1;
459 b_reverse_references = 0;
461 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
462 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
463 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
464 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
465 OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
/* Sub-pel refinement at full lowres; writes the refined MVs and costs
 * into list0 or list1 slots depending on b_islist1. */
469 int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
471 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
472 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
473 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
474 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
475 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
476 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
477 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
481 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
482 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
486 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
487 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
490 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
491 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
492 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
493 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
494 OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
496 if( h->opencl.b_device_AMD_SI )
498 /* workaround for AMD Southern Island driver scheduling bug (fixed in
499 * July 2012), perform meaningless small copy to add a data dependency */
500 OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
503 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
505 int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
507 if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
508 x264_opencl_flush( h );
/* Queue async read-back of the refined MVs for the CPU-side lookahead;
 * the buffer offset selects the slot for this reference distance. */
510 char *locked = x264_opencl_alloc_locked( h, mvlen );
511 h->opencl.copies[h->opencl.num_copies].src = locked;
512 h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
516 int mvs_offset = mvlen * (ref - b - 1);
517 OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
518 h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
522 int mvs_offset = mvlen * (b - ref - 1);
523 OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
524 h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
527 h->opencl.num_copies++;
/* Select the cheapest prediction mode per MB (intra / ref0 / ref1 / bidir
 * for B frames), then reduce costs per row and per frame.  Queues async
 * read-backs into fenc->lowres_costs[b-p0][p1-b], i_row_satds, i_cost_est,
 * i_cost_est_aq, and (P frames only) i_intra_mbs.
 * NOTE(review): this excerpt elides braces and some declarations
 * ('status', 'arg', and the branch that sets 'ldims' to 'ldim_bidir' for
 * B frames) -- confirm against the full source file. */
532 int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
534 x264_opencl_function_t *ocl = h->opencl.ocl;
536 x264_frame_t *fenc = frames[b];
537 x264_frame_t *fref0 = frames[p0];
538 x264_frame_t *fref1 = frames[p1];
540 int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
542 /* Tasks for this kernel:
543 * 1. Select least cost mode (intra, ref0, ref1)
544 * list_used 0, 1, 2, or 3. if B frame, do not allow intra
545 * 2. if B frame, try bidir predictions.
546 * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
547 size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
548 size_t ldim_bidir[2];
549 size_t *ldims = NULL;
550 int cost_local_size = 4;
551 int satd_local_size = 4;
554 /* For B frames, use 4 threads per MB for BIDIR checks */
557 x264_optimal_launch_dims( h, gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
558 int mb_per_group = (ldims[0] >> 2) * ldims[1];
559 cost_local_size = 4 * mb_per_group * sizeof(int16_t);
560 satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
564 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
565 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
566 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
567 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
568 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
569 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
570 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
571 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
572 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
573 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
574 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
575 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
576 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
577 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
578 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
579 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
580 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
581 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
582 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
583 OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
584 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
586 /* Sum costs across rows, atomicAdd down frame */
587 size_t gdim[2] = { 256, h->mb.i_mb_height };
588 size_t ldim[2] = { 256, 1 };
591 OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
592 OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
593 OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
594 OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
595 OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
596 OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
597 OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
598 OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
599 OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
600 OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
/* About to register up to 5 deferred copies (4 always + 1 for P frames);
 * flush if the copy list is nearly full. */
602 if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
603 x264_opencl_flush( h );
605 int size = h->mb.i_mb_count * sizeof(int16_t);
606 char *locked = x264_opencl_alloc_locked( h, size );
607 h->opencl.copies[h->opencl.num_copies].src = locked;
608 h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
609 h->opencl.copies[h->opencl.num_copies].bytes = size;
610 OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
611 h->opencl.num_copies++;
613 size = h->mb.i_mb_height * sizeof(int);
614 locked = x264_opencl_alloc_locked( h, size );
615 h->opencl.copies[h->opencl.num_copies].src = locked;
616 h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
617 h->opencl.copies[h->opencl.num_copies].bytes = size;
618 OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
619 h->opencl.num_copies++;
/* frame_stats: int[0] = cost_est, int[1] = cost_est_aq, int[2] = intra MB
 * count (the latter only consumed for P frames below). */
621 size = 4 * sizeof(int);
622 locked = x264_opencl_alloc_locked( h, size );
623 OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
624 h->opencl.last_buf = !h->opencl.last_buf;
626 h->opencl.copies[h->opencl.num_copies].src = locked;
627 h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
628 h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
629 h->opencl.num_copies++;
630 h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
631 h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
632 h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
633 h->opencl.num_copies++;
635 if( b == p1 ) // P frames only
637 h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
638 h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
639 h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
640 h->opencl.num_copies++;
/* Prepare the GPU lookahead for a slicetype decision pass: boost thread
 * priorities (Windows/AMD-specific code visible here), run lowres init +
 * intra analysis for every frame in the window, and -- for trellis
 * B-adapt -- precompute the exhaustive motion searches the trellis will
 * need.  Priorities are restored in x264_opencl_slicetype_end().
 * NOTE(review): braces, the p0/p1 derivation inside the trellis loops, and
 * any non-Windows priority path are elided in this excerpt -- confirm
 * against the full source file. */
645 void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
647 if( h->param.b_opencl )
650 /* Temporarily boost priority of this lookahead thread and the OpenCL
651 * driver's thread until the end of this function. On AMD GPUs this
652 * greatly reduces the latency of enqueuing kernels and getting results
654 HANDLE id = GetCurrentThread();
655 h->opencl.lookahead_thread_pri = GetThreadPriority( id );
656 SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
657 x264_opencl_function_t *ocl = h->opencl.ocl;
/* AMD extension query: on success 'id' now holds the driver thread. */
658 cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
659 if( status == CL_SUCCESS )
661 h->opencl.opencl_thread_pri = GetThreadPriority( id );
662 SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
666 /* precalculate intra and I frames */
667 for( int i = 0; i <= num_frames; i++ )
668 x264_opencl_lowres_init( h, frames[i], lambda );
669 x264_opencl_flush( h );
671 if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
673 /* For trellis B-Adapt, precompute exhaustive motion searches */
674 for( int b = 0; b <= num_frames; b++ )
676 for( int j = 1; j < h->param.i_bframe; j++ )
/* 0x7FFF in mv[0][0] marks a search not yet performed; clear it and run. */
679 if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
681 const x264_weight_t *w = x264_weight_none;
683 if( h->param.analyse.i_weighted_pred )
686 x264_weights_analyse( h, frames[b], frames[p0], 1 );
687 w = frames[b]->weight[0];
689 frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
690 x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
693 if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
695 frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
696 x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
701 x264_opencl_flush( h );
/* Restore the thread priorities that x264_opencl_slicetype_prep() boosted:
 * first the lookahead thread's own priority, then (if the AMD queue-thread
 * query succeeds) the OpenCL driver thread's priority. */
707 void x264_opencl_slicetype_end( x264_t *h )
710 if( h->param.b_opencl )
712 HANDLE id = GetCurrentThread();
713 SetThreadPriority( id, h->opencl.lookahead_thread_pri );
714 x264_opencl_function_t *ocl = h->opencl.ocl;
715 cl_int status = ocl->clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
716 if( status == CL_SUCCESS )
717 SetThreadPriority( id, h->opencl.opencl_thread_pri );
/* GPU analogue of the C lookahead's frame-cost estimation for the frame
 * triple (p0, p1, b): skip if already computed, decide which list0/list1
 * searches are needed (0x7FFF sentinel = not yet searched), run lowres
 * init and motion search for the needed references, then finalize costs.
 * NOTE(review): this function runs to the end of the visible excerpt and
 * elides braces and several lines (e.g. the 'do_search' declaration and
 * the guards around the searches) -- confirm against the full source. */
722 int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
724 if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
729 int dist_scale_factor = 128;
730 const x264_weight_t *w = x264_weight_none;
732 // avoid duplicating work
733 frames[b]->i_cost_est[b-p0][p1-b] = 0;
735 do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
736 do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
/* WeightP analysis only applies to P frames (b == p1). */
739 if( h->param.analyse.i_weighted_pred && b == p1 )
742 x264_weights_analyse( h, frames[b], frames[p0], 1 );
743 w = frames[b]->weight[0];
745 frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
748 frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
750 frames[b]->i_intra_mbs[b-p0] = 0;
/* B frame: temporal distance ratio in 8.8 fixed point, rounded. */
752 dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
754 frames[b]->i_cost_est[b-p0][p1-b] = 0;
755 frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
757 x264_opencl_lowres_init( h, frames[b], lambda );
761 x264_opencl_lowres_init( h, frames[p0], lambda );
762 x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
766 x264_opencl_lowres_init( h, frames[p1], lambda );
767 x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
769 x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );