.digress_x264
dataDec.txt
log.dec
+common/oclobj.h
+x264_lookahead.clbin
vpath %.asm $(SRCPATH)
vpath %.rc $(SRCPATH)
+GENERATED =
+
all: default
default:
endif
endif
+QUOTED_CFLAGS := $(CFLAGS)
+
+ifeq ($(HAVE_OPENCL),yes)
+# Helper variables used to backslash-escape spaces and parentheses in
+# user-supplied OpenCL SDK paths (common on Windows, e.g.
+# "C:/Program Files (x86)/AMD APP/include").
+empty:=
+space:=$(empty) $(empty)
+escaped:=\ $(empty)
+open:=(
+escopen:=\(
+close:=)
+escclose:=\)
+SAFE_INC_DIR := $(subst $(space),$(escaped),$(OPENCL_INC_DIR))
+SAFE_INC_DIR := $(subst $(open),$(escopen),$(SAFE_INC_DIR))
+SAFE_INC_DIR := $(subst $(close),$(escclose),$(SAFE_INC_DIR))
+SAFE_LIB_DIR := $(subst $(space),$(escaped),$(OPENCL_LIB_DIR))
+SAFE_LIB_DIR := $(subst $(open),$(escopen),$(SAFE_LIB_DIR))
+SAFE_LIB_DIR := $(subst $(close),$(escclose),$(SAFE_LIB_DIR))
+# For normal CFLAGS and LDFLAGS, we must escape spaces with a backslash to
+# make gcc happy
+CFLAGS += -I$(SAFE_INC_DIR) -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
+LDFLAGS += -l$(OPENCL_LIB) -L$(SAFE_LIB_DIR)
+# For the CFLAGS used by the .depend rule, we must add quotes because
+# the rule does an extra level of shell expansions
+QUOTED_CFLAGS += -I"$(OPENCL_INC_DIR)" -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
+# Concatenate the OpenCL kernel sources into a C byte-array header so they
+# can be embedded into the binary; regenerated whenever any .cl file changes.
+common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
+ cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@
+GENERATED += common/oclobj.h
+SRCS += common/opencl.c encoder/slicetype-cl.c
+endif
+
OBJS += $(SRCS:%.c=%.o)
OBJCLI += $(SRCCLI:%.c=%.o)
OBJSO += $(SRCSO:%.c=%.o)
lib-static: $(LIBX264)
lib-shared: $(SONAME)
-$(LIBX264): .depend $(OBJS) $(OBJASM)
+# Generated files (common/oclobj.h) must exist before objects that include
+# them are compiled, so $(GENERATED) is ordered ahead of the object lists.
+$(LIBX264): $(GENERATED) .depend $(OBJS) $(OBJASM)
	rm -f $(LIBX264)
	$(AR)$@ $(OBJS) $(OBJASM)
	$(if $(RANLIB), $(RANLIB) $@)
-$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+$(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
	$(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
ifneq ($(EXE),)
checkasm: checkasm$(EXE)
endif
-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
	$(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
-checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
	$(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
$(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
.depend: config.mak
	@rm -f .depend
- @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
+# .depend uses QUOTED_CFLAGS because its recipe undergoes an extra level of
+# shell expansion (see the QUOTED_CFLAGS definition above).
+ @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(QUOTED_CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
config.mak:
	./configure
clean:
	rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
- rm -f checkasm checkasm.exe $(OBJCHK)
+# clean also removes the generated OpenCL header and the cached kernel binary
+ rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
	rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
distclean: clean
distclean: clean
param->b_pic_struct = 0;
param->b_fake_interlaced = 0;
param->i_frame_packing = -1;
+ param->b_opencl = 0;
+ param->i_opencl_device = 0;
+ param->opencl_device_id = NULL;
+ param->psz_clbin_file = NULL;
}
static int x264_param_apply_preset( x264_param_t *param, const char *preset )
p->b_fake_interlaced = atobool(value);
OPT("frame-packing")
p->i_frame_packing = atoi(value);
+ OPT("opencl")
+ p->b_opencl = atobool( value );
+ OPT("opencl-clbin")
+ p->psz_clbin_file = strdup( value );
+ OPT("opencl-device")
+ p->i_opencl_device = atoi( value );
else
return X264_PARAM_BAD_NAME;
#undef OPT
s += sprintf( s, "bitdepth=%d ", BIT_DEPTH );
}
- s += sprintf( s, "cabac=%d", p->b_cabac );
+ if( p->b_opencl )
+ s += sprintf( s, "opencl=%d", p->b_opencl );
+ s += sprintf( s, " cabac=%d", p->b_cabac );
s += sprintf( s, " ref=%d", p->i_frame_reference );
s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter,
p->i_deblocking_filter_alphac0, p->i_deblocking_filter_beta );
memset( var, 0, size );\
} while( 0 )
+#define ARRAY_SIZE(array) (sizeof(array)/sizeof(array[0]))
+
#define X264_BFRAME_MAX 16
#define X264_REF_MAX 16
#define X264_THREAD_MAX 128
#include <assert.h>
#include <limits.h>
+#if HAVE_OPENCL
+#include "opencl.h"
+#endif
+
#if HAVE_INTERLACED
# define MB_INTERLACED h->mb.b_interlaced
# define SLICE_MBAFF h->sh.b_mbaff
struct visualize_t *visualize;
#endif
x264_lookahead_t *lookahead;
+
+#if HAVE_OPENCL
+ x264_opencl_t opencl;
+#endif
};
// included at the end because it needs x264_t
}
x264_pthread_mutex_destroy( &frame->mutex );
x264_pthread_cond_destroy( &frame->cv );
+#if HAVE_OPENCL
+ x264_opencl_frame_delete( frame );
+#endif
}
x264_free( frame );
}
/* user frame properties */
uint8_t *mb_info;
void (*mb_info_free)( void* );
+
+#if HAVE_OPENCL
+ x264_frame_opencl_t opencl;
+#endif
} x264_frame_t;
/* synchronized frame list */
--- /dev/null
+/*****************************************************************************
+ * opencl.c: OpenCL initialization and kernel compilation
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common.h"
+#if _WIN32
+#include <windows.h>
+#else
+#include <dlfcn.h> //dlopen, dlsym, dlclose
+#endif
+
+/* define from recent cl_ext.h, copied here in case headers are old */
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD 0x4042
+
+/* Requires full include path in case of out-of-tree builds */
+#include "common/oclobj.h"
+
+/* Forward declaration; a proper C prototype uses (void), since empty parens
+ * declare a function with unspecified parameters. */
+static int x264_detect_switchable_graphics( void );
+
+/* Try to load the cached compiled program binary, verify the device context is
+ * still valid before reuse */
+static cl_program x264_opencl_cache_load( x264_t *h, char *devname, char *devvendor, char *driverversion )
+{
+ cl_program program = NULL;
+ cl_int status;
+
+ /* try to load cached program binary */
+ FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
+ if( !fp )
+ return NULL;
+
+ fseek( fp, 0L, SEEK_END );
+ size_t size = ftell( fp );
+ rewind( fp );
+ uint8_t *binary;
+ CHECKED_MALLOC( binary, size );
+
+ fread( binary, 1, size, fp );
+ const uint8_t *ptr = (const uint8_t*)binary;
+
+#define CHECK_STRING( STR )\
+ do {\
+ size_t len = strlen( STR );\
+ if( size <= len || strncmp( (char*)ptr, STR, len ) )\
+ goto fail;\
+ else {\
+ size -= (len+1); ptr += (len+1);\
+ }\
+ } while( 0 )
+
+ CHECK_STRING( devname );
+ CHECK_STRING( devvendor );
+ CHECK_STRING( driverversion );
+ CHECK_STRING( x264_opencl_source_hash );
+#undef CHECK_STRING
+
+ program = clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
+ if( status != CL_SUCCESS )
+ program = NULL;
+
+fail:
+ fclose( fp );
+ x264_free( binary );
+ return program;
+}
+
+/* Save the compiled program binary to a file for later reuse. Device context
+ * is also saved in the cache file so we do not reuse stale binaries */
+static void x264_opencl_cache_save( x264_t *h, cl_program program, char *devname, char *devvendor, char *driverversion )
+{
+ FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
+ if( !fp )
+ {
+ x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write");
+ return;
+ }
+
+ size_t size;
+ cl_int status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
+ if( status == CL_SUCCESS )
+ {
+ uint8_t *binary;
+ CHECKED_MALLOC( binary, size );
+ status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL );
+ if( status == CL_SUCCESS )
+ {
+ fputs( devname, fp );
+ fputc( '\n', fp );
+ fputs( devvendor, fp );
+ fputc( '\n', fp );
+ fputs( driverversion, fp );
+ fputc( '\n', fp );
+ fputs( x264_opencl_source_hash, fp );
+ fputc( '\n', fp );
+ fwrite( binary, 1, size, fp );
+ }
+ else
+ x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated");
+ x264_free( binary );
+ }
+ else
+ x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated");
+ fclose( fp );
+
+fail:
+ return;
+}
+
+/* The OpenCL source under common/opencl will be merged into common/oclobj.h by
+ * the Makefile. It defines a x264_opencl_source byte array which we will pass
+ * to clCreateProgramWithSource(). We also attempt to use a cache file for the
+ * compiled binary, stored in the current working folder. */
+static cl_program x264_opencl_compile( x264_t *h )
+{
+ cl_program program;
+ cl_int status;
+
+ char devname[64];
+ char devvendor[64];
+ char driverversion[64];
+ status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
+ status |= clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR, sizeof(devvendor), devvendor, NULL );
+ status |= clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driverversion), driverversion, NULL );
+ if( status != CL_SUCCESS )
+ return NULL;
+
+ // Most AMD GPUs have vector registers
+ int vectorize = !strcmp( devvendor, "Advanced Micro Devices, Inc." );
+ h->opencl.b_device_AMD_SI = 0;
+
+ if( vectorize )
+ {
+ /* Disable OpenCL on Intel/AMD switchable graphics devices */
+ if( x264_detect_switchable_graphics() )
+ {
+ x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" );
+ return NULL;
+ }
+
+ /* Detect AMD SouthernIsland or newer device (single-width registers) */
+ cl_uint simdwidth = 4;
+ status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL );
+ if( status == CL_SUCCESS && simdwidth == 1 )
+ {
+ vectorize = 0;
+ h->opencl.b_device_AMD_SI = 1;
+ }
+ }
+
+ x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", devvendor, devname, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
+
+ program = x264_opencl_cache_load( h, devname, devvendor, driverversion );
+ if( !program )
+ {
+ /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
+ x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
+ const char *strptr = (const char*)x264_opencl_source;
+ size_t size = sizeof(x264_opencl_source);
+ program = clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
+ if( status != CL_SUCCESS || !program )
+ {
+ x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" );
+ return NULL;
+ }
+ }
+
+ /* Build the program binary for the OpenCL device */
+ const char *buildopts = vectorize ? "-DVECTORIZE=1" : "";
+ status = clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
+ if( status == CL_SUCCESS )
+ {
+ x264_opencl_cache_save( h, program, devname, devvendor, driverversion );
+ return program;
+ }
+
+ /* Compile failure, should not happen with production code. */
+
+ size_t build_log_len = 0;
+
+ status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, NULL, &build_log_len );
+ if( status != CL_SUCCESS )
+ {
+ x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" );
+ return NULL;
+ }
+
+ char *build_log;
+ CHECKED_MALLOC( build_log, build_log_len );
+ if( !build_log )
+ {
+ x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" );
+ return NULL;
+ }
+
+ status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
+ if( status != CL_SUCCESS )
+ {
+ x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" );
+ x264_free( build_log );
+ return NULL;
+ }
+
+ FILE *lg = fopen( "x264_kernel_build_log.txt", "w" );
+ if( lg )
+ {
+ fwrite( build_log, 1, build_log_len, lg );
+ fclose( lg );
+ x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
+ }
+
+ x264_free( build_log );
+fail:
+ return NULL;
+}
+
+/* Release all lookahead kernels, the program, and GPU buffers.  Each handle
+ * is reset to NULL after release so the function is idempotent: it runs both
+ * when x264_opencl_init_lookahead() fails partway and again from
+ * x264_opencl_free(), and must not double-release stale handles. */
+static void x264_opencl_free_lookahead( x264_t *h )
+{
+#define RELEASE( a, f ) do { if( a ) { f( a ); a = NULL; } } while( 0 )
+    RELEASE( h->opencl.intra_kernel, clReleaseKernel );
+    RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel );
+    RELEASE( h->opencl.downscale_kernel1, clReleaseKernel );
+    RELEASE( h->opencl.downscale_kernel2, clReleaseKernel );
+    RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel );
+    RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel );
+    RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel );
+    RELEASE( h->opencl.memset_kernel, clReleaseKernel );
+    RELEASE( h->opencl.hme_kernel, clReleaseKernel );
+    RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel );
+    RELEASE( h->opencl.mode_select_kernel, clReleaseKernel );
+    RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel );
+    RELEASE( h->opencl.lookahead_program, clReleaseProgram );
+    RELEASE( h->opencl.row_satds[0], clReleaseMemObject );
+    RELEASE( h->opencl.row_satds[1], clReleaseMemObject );
+    RELEASE( h->opencl.frame_stats[0], clReleaseMemObject );
+    RELEASE( h->opencl.frame_stats[1], clReleaseMemObject );
+    RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject );
+    RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject );
+    RELEASE( h->opencl.mvp_buffer, clReleaseMemObject );
+    RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject );
+    RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject );
+    RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject );
+    RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject );
+    RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject );
+    RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject );
+    RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject );
+    for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+        RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject );
+#undef RELEASE
+}
+
+/* Compile the lookahead OpenCL program, create one kernel object per entry
+ * point, and allocate/map the page-locked PCIe transfer buffer.
+ * Returns 0 on success, -1 on failure; partially-created resources are
+ * released via x264_opencl_free_lookahead() before returning. */
+int x264_opencl_init_lookahead( x264_t *h )
+{
+    if( !h->param.rc.i_lookahead )
+        return -1;
+
+    /* `const char *const`: the previous `const char const *` merely duplicated
+     * the qualifier on the pointed-to type instead of making the pointers
+     * themselves const */
+    static const char *const kernelnames[] = {
+        "mb_intra_cost_satd_8x8",
+        "sum_intra_cost",
+        "downscale_hpel",
+        "downscale1",
+        "downscale2",
+        "memset_int16",
+        "weightp_scaled_images",
+        "weightp_hpel",
+        "hierarchical_motion",
+        "subpel_refine",
+        "mode_selection",
+        "sum_inter_cost"
+    };
+    /* parallel to kernelnames[]: destination handle for each created kernel */
+    cl_kernel *kernels[] = {
+        &h->opencl.intra_kernel,
+        &h->opencl.rowsum_intra_kernel,
+        &h->opencl.downscale_hpel_kernel,
+        &h->opencl.downscale_kernel1,
+        &h->opencl.downscale_kernel2,
+        &h->opencl.memset_kernel,
+        &h->opencl.weightp_scaled_images_kernel,
+        &h->opencl.weightp_hpel_kernel,
+        &h->opencl.hme_kernel,
+        &h->opencl.subpel_refine_kernel,
+        &h->opencl.mode_select_kernel,
+        &h->opencl.rowsum_inter_kernel
+    };
+    cl_int status;
+
+    h->opencl.lookahead_program = x264_opencl_compile( h );
+    if( !h->opencl.lookahead_program )
+    {
+        x264_opencl_free_lookahead( h );
+        return -1;
+    }
+
+    for( int i = 0; i < ARRAY_SIZE(kernelnames); i++ )
+    {
+        *kernels[i] = clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
+        if( status != CL_SUCCESS )
+        {
+            x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
+            x264_opencl_free_lookahead( h );
+            return -1;
+        }
+    }
+
+    /* CL_MEM_ALLOC_HOST_PTR buffer, mapped once and kept mapped for the
+     * lifetime of the encoder for host<->GPU staging */
+    h->opencl.page_locked_buffer = clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status );
+        x264_opencl_free_lookahead( h );
+        return -1;
+    }
+    h->opencl.page_locked_ptr = clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+                                                    0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status );
+        x264_opencl_free_lookahead( h );
+        return -1;
+    }
+
+    return 0;
+}
+
+/* Context-error callback registered with clCreateContext().
+ * errinfo:      human-readable error string from the driver
+ * private_info: driver-private binary data (unused)
+ * cb:           size of private_info in bytes (unused)
+ * user_data:    the x264_t handle passed at context creation
+ * NOTE(review): the driver may invoke this asynchronously from its own
+ * thread -- confirm that flag writes here are observed safely. */
+static void x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
+{
+    /* Any error notification can be assumed to be fatal to the OpenCL context.
+     * We need to stop using it immediately to prevent further damage. */
+    x264_t *h = (x264_t*)user_data;
+    h->param.b_opencl = 0;
+    h->opencl.b_fatal_error = 1;
+    x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
+    x264_log( h, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" );
+}
+
+/* Enumerate installed OpenCL platforms and select the first GPU device that
+ * supports the image formats the lookahead kernels require (CL_R/UINT32 and
+ * CL_RGBA/UINT8).  On success a context and in-order command queue are
+ * created and kernel compilation is started.
+ * Returns 0 on success, -1 on failure. */
+int x264_opencl_init( x264_t *h )
+{
+    cl_int status;
+    cl_uint numPlatforms;
+    int ret = -1;
+
+    /* first call counts platforms, second call fetches them */
+    status = clGetPlatformIDs( 0, NULL, &numPlatforms );
+    if( status != CL_SUCCESS || numPlatforms == 0 )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
+        return -1;
+    }
+
+    cl_platform_id *platforms = (cl_platform_id*)x264_malloc( numPlatforms * sizeof(cl_platform_id) );
+    status = clGetPlatformIDs( numPlatforms, platforms, NULL );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
+        x264_free( platforms );
+        return -1;
+    }
+
+    /* Select the first OpenCL platform with a GPU device that supports our
+     * required image (texture) formats */
+    for( cl_uint i = 0; i < numPlatforms; ++i )
+    {
+        cl_uint gpu_count = 0;
+        status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
+        if( status != CL_SUCCESS || !gpu_count )
+            continue;
+
+        cl_device_id *devices = x264_malloc( sizeof(cl_device_id) * gpu_count );
+        if( !devices )
+            continue;
+
+        status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL );
+        if( status != CL_SUCCESS )
+        {
+            x264_free( devices );
+            continue;
+        }
+
+        /* Find a GPU device that supports our image formats */
+        for( cl_uint gpu = 0; gpu < gpu_count; gpu++ )
+        {
+            h->opencl.device = devices[gpu];
+
+            /* if the user has specified an exact device ID, skip all other
+             * GPUs. If this device matches, allow it to continue through the
+             * checks for supported images, etc. */
+            if( h->param.opencl_device_id && devices[gpu] != (cl_device_id) h->param.opencl_device_id )
+                continue;
+
+            /* NOTE(review): image_support stays uninitialized if this query
+             * fails -- consider initializing it to CL_FALSE */
+            cl_bool image_support;
+            clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
+            if( !image_support )
+                continue;
+
+            /* install x264_opencl_error_notify as the context error callback */
+            cl_context context = clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status );
+            if( status != CL_SUCCESS )
+                continue;
+
+            cl_uint imagecount = 0;
+            clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount );
+            if( !imagecount )
+            {
+                clReleaseContext( context );
+                continue;
+            }
+
+            cl_image_format *imageType = x264_malloc( sizeof(cl_image_format) * imagecount );
+            if( !imageType )
+            {
+                clReleaseContext( context );
+                continue;
+            }
+
+            clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL );
+
+            /* scan the supported formats for the two the kernels rely on */
+            int b_has_r = 0;
+            int b_has_rgba = 0;
+            for( cl_uint j = 0; j < imagecount; j++ )
+            {
+                if( imageType[j].image_channel_order == CL_R &&
+                    imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 )
+                    b_has_r = 1;
+                else if( imageType[j].image_channel_order == CL_RGBA &&
+                    imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 )
+                    b_has_rgba = 1;
+            }
+            x264_free( imageType );
+            if( !b_has_r || !b_has_rgba )
+            {
+                char devname[64];
+                status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
+                if( status == CL_SUCCESS )
+                {
+                    /* emit warning if we are discarding the user's explicit choice */
+                    int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG;
+                    x264_log( h, level, "OpenCL: %s does not support required image formats\n", devname);
+                }
+                clReleaseContext( context );
+                continue;
+            }
+
+            /* user selection of GPU device, skip N first matches */
+            if( h->param.i_opencl_device )
+            {
+                h->param.i_opencl_device--;
+                clReleaseContext( context );
+                continue;
+            }
+
+            h->opencl.queue = clCreateCommandQueue( context, h->opencl.device, 0, &status );
+            if( status != CL_SUCCESS )
+            {
+                clReleaseContext( context );
+                continue;
+            }
+
+            h->opencl.context = context;
+
+            ret = 0;
+            break;
+        }
+
+        x264_free( devices );
+
+        /* ret == 0 means a usable device was found; stop searching platforms */
+        if( !ret )
+            break;
+    }
+
+    x264_free( platforms );
+
+    /* default cache-file name.  NOTE(review): this assigns a string literal
+     * while the "opencl-clbin" option strdup()s its value -- confirm
+     * psz_clbin_file is never free()d */
+    if( !h->param.psz_clbin_file )
+        h->param.psz_clbin_file = "x264_lookahead.clbin";
+
+    if( ret )
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n");
+    else
+        ret = x264_opencl_init_lookahead( h );
+
+    return ret;
+}
+
+/* Release all per-frame OpenCL buffers.  Only non-NULL handles are released;
+ * assumes handles of never-allocated buffers are zero-initialized by the
+ * frame allocator -- confirm against frame allocation. */
+void x264_opencl_frame_delete( x264_frame_t *frame )
+{
+#define RELEASEBUF(mem) if( mem ) clReleaseMemObject( mem );
+    /* downscaled image pyramid */
+    for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
+        RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
+    RELEASEBUF( frame->opencl.luma_hpel );
+    RELEASEBUF( frame->opencl.inv_qscale_factor );
+    RELEASEBUF( frame->opencl.intra_cost );
+    RELEASEBUF( frame->opencl.lowres_mvs0 );
+    RELEASEBUF( frame->opencl.lowres_mvs1 );
+    RELEASEBUF( frame->opencl.lowres_mv_costs0 );
+    RELEASEBUF( frame->opencl.lowres_mv_costs1 );
+#undef RELEASEBUF
+}
+
+/* Tear down the global OpenCL state: drain the command queue, release the
+ * lookahead kernels and buffers, then the queue and context themselves. */
+void x264_opencl_free( x264_t *h )
+{
+    /* wait for in-flight kernels/transfers before releasing their resources */
+    if( h->opencl.queue )
+        clFinish(h->opencl.queue );
+
+    x264_opencl_free_lookahead( h );
+
+    if( h->opencl.queue )
+        clReleaseCommandQueue( h->opencl.queue );
+    if( h->opencl.context )
+        clReleaseContext( h->opencl.context );
+}
+
+/* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so
+ * we consult AMD's ADL interface to detect this situation and disable
+ * OpenCL on these machines (Linux and Windows) */
+#ifndef _WIN32
+/* map the Win32-isms used below onto POSIX so one code path serves both */
+#define __stdcall
+#define HINSTANCE void *
+#endif
+/* function-pointer types for the ADL entry points resolved at runtime */
+typedef void* ( __stdcall *ADL_MAIN_MALLOC_CALLBACK )( int );
+typedef int ( *ADL_MAIN_CONTROL_CREATE )(ADL_MAIN_MALLOC_CALLBACK, int );
+typedef int ( *ADL_ADAPTER_NUMBEROFADAPTERS_GET ) ( int* );
+typedef int ( *ADL_POWERXPRESS_SCHEME_GET ) ( int, int *, int *, int * );
+typedef int ( *ADL_MAIN_CONTROL_DESTROY )();
+#define ADL_OK 0
+#define ADL_PX_SCHEME_DYNAMIC 2
+
+/* memory allocator handed to ADL_Main_Control_Create */
+void* __stdcall adl_malloc_wrapper( int iSize ) { return x264_malloc( iSize ); }
+
+/* Query AMD's ADL library (if present) for a PowerXpress dynamic-switching
+ * scheme on any adapter.  Returns 1 if switchable graphics is detected,
+ * 0 otherwise (including when ADL is not installed). */
+static int x264_detect_switchable_graphics( void )
+{
+    ADL_MAIN_CONTROL_CREATE ADL_Main_Control_Create;
+    ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
+    ADL_POWERXPRESS_SCHEME_GET ADL_PowerXpress_Scheme_Get;
+    ADL_MAIN_CONTROL_DESTROY ADL_Main_Control_Destroy;
+    HINSTANCE hDLL;
+    int ret = 0;
+
+#if _WIN32
+    hDLL = LoadLibrary( "atiadlxx.dll" );
+    if( !hDLL )
+        hDLL = LoadLibrary( "atiadlxy.dll" );
+#else
+    hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL );
+#define GetProcAddress dlsym
+#endif
+    if( !hDLL )
+        return ret;
+
+    ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE) GetProcAddress(hDLL, "ADL_Main_Control_Create");
+    ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY) GetProcAddress(hDLL, "ADL_Main_Control_Destroy");
+    ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET) GetProcAddress(hDLL, "ADL_Adapter_NumberOfAdapters_Get");
+    ADL_PowerXpress_Scheme_Get = (ADL_POWERXPRESS_SCHEME_GET) GetProcAddress(hDLL, "ADL_PowerXpress_Scheme_Get");
+    /* bug fix: the original test checked ADL_Main_Control_Destroy twice and
+     * never validated ADL_Main_Control_Create */
+    if( !ADL_Main_Control_Create || !ADL_Main_Control_Destroy || !ADL_Adapter_NumberOfAdapters_Get ||
+        !ADL_PowerXpress_Scheme_Get )
+        goto bail;
+
+    if( ADL_OK != ADL_Main_Control_Create( adl_malloc_wrapper, 1) )
+        goto bail;
+
+    int numAdapters = 0;
+    if( ADL_OK != ADL_Adapter_NumberOfAdapters_Get( &numAdapters ) )
+    {
+        ADL_Main_Control_Destroy();
+        goto bail;
+    }
+
+    for( int i = 0; i < numAdapters; i++ )
+    {
+        int PXSchemeRange, PXSchemeCurrentState, PXSchemeDefaultState;
+        if( ADL_OK != ADL_PowerXpress_Scheme_Get( i, &PXSchemeRange, &PXSchemeCurrentState, &PXSchemeDefaultState) )
+            break;
+
+        if( PXSchemeRange >= ADL_PX_SCHEME_DYNAMIC )
+        {
+            ret = 1;
+            break;
+        }
+    }
+
+    ADL_Main_Control_Destroy();
+
+bail:
+#if _WIN32
+    FreeLibrary( hDLL );
+#else
+    dlclose( hDLL );
+#endif
+
+    return ret;
+}
--- /dev/null
+/*****************************************************************************
+ * opencl.h: OpenCL structures and defines
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_OPENCL_H
+#define X264_OPENCL_H
+
+#include "x264.h"
+#include "common/common.h"
+
+#include <CL/cl.h>
+
+/* Number of downscale resolutions to use for motion search */
+#define NUM_IMAGE_SCALES 4
+
+/* Number of PCIe copies that can be queued before requiring a flush */
+#define MAX_FINISH_COPIES 1024
+
+/* Size (in bytes) of the page-locked buffer used for PCIe xfers.
+ * Parenthesized so the macro expands safely inside larger expressions
+ * (e.g. division or a wider multiplication). */
+#define PAGE_LOCKED_BUF_SIZE (32 * 1024 * 1024)
+
+/* Global OpenCL state owned by the encoder (one per x264_t). */
+typedef struct
+{
+    /* core handles, created in x264_opencl_init() */
+    cl_context context;
+    cl_device_id device;
+    cl_command_queue queue;
+
+    cl_program lookahead_program;
+    cl_int last_buf; /* most recently used double-buffer index -- TODO confirm against users */
+
+    /* page-locked staging buffer for PCIe transfers, mapped once at init */
+    cl_mem page_locked_buffer;
+    char *page_locked_ptr; /* host pointer from clEnqueueMapBuffer() */
+    int pl_occupancy; /* presumably bytes in use within the buffer -- verify */
+
+    /* deferred host-side copies queued until the GPU work finishes */
+    struct
+    {
+        void *src;
+        void *dest;
+        int bytes;
+    } copies[MAX_FINISH_COPIES];
+    int num_copies;
+
+    int b_device_AMD_SI; /* AMD Southern Islands or newer detected */
+    int b_fatal_error; /* set by the context error callback */
+    int lookahead_thread_pri;
+    int opencl_thread_pri;
+
+    /* downscale lowres luma */
+    cl_kernel downscale_hpel_kernel;
+    cl_kernel downscale_kernel1;
+    cl_kernel downscale_kernel2;
+    cl_mem luma_16x16_image[2];
+
+    /* weightp filtering */
+    cl_kernel weightp_hpel_kernel;
+    cl_kernel weightp_scaled_images_kernel;
+    cl_mem weighted_scaled_images[NUM_IMAGE_SCALES];
+    cl_mem weighted_luma_hpel;
+
+    /* intra */
+    cl_kernel memset_kernel;
+    cl_kernel intra_kernel;
+    cl_kernel rowsum_intra_kernel;
+    cl_mem row_satds[2];
+
+    /* hierarchical motion estimation */
+    cl_kernel hme_kernel;
+    cl_kernel subpel_refine_kernel;
+    cl_mem mv_buffers[2];
+    cl_mem lowres_mv_costs;
+    cl_mem mvp_buffer;
+
+    /* bidir */
+    cl_kernel mode_select_kernel;
+    cl_kernel rowsum_inter_kernel;
+    cl_mem lowres_costs[2];
+    cl_mem frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */
+} x264_opencl_t;
+
+/* Per-frame OpenCL buffers; released in x264_opencl_frame_delete(). */
+typedef struct
+{
+    cl_mem scaled_image2Ds[NUM_IMAGE_SCALES]; /* downscaled luma pyramid */
+    cl_mem luma_hpel; /* interpolated half-pel luma planes */
+    cl_mem inv_qscale_factor;
+    cl_mem intra_cost;
+    /* buffers holding many frames' worth of lowres MVs / MV costs,
+     * one per reference list (see the kernel-side indexing comments) */
+    cl_mem lowres_mvs0;
+    cl_mem lowres_mvs1;
+    cl_mem lowres_mv_costs0;
+    cl_mem lowres_mv_costs1;
+} x264_frame_opencl_t;
+
+/* Forward declaration avoids pulling frame.h into this header; presumably
+ * names the same `struct x264_frame` that x264_frame_t aliases -- confirm. */
+typedef struct x264_frame x264_frame;
+
+int x264_opencl_init( x264_t *h );
+int x264_opencl_init_lookahead( x264_t *h );
+void x264_opencl_free( x264_t *h );
+void x264_opencl_frame_delete( x264_frame *frame );
+
+#endif
--- /dev/null
+/* Mode selection routines, select the least SATD cost mode for each lowres
+ * macroblock. When measuring B slices, this includes measuring the cost of
+ * three bidir modes. */
+
+/* Four threads cooperatively measure 8x8 BIDIR cost with SATD.
+ * Thread `idx` (0..3) processes one row of each 8x4 half-block; HADAMARD4 is
+ * applied first within the row, then across the four threads' rows via the
+ * `tmpp` scratch area in local memory. */
+int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres,
+                             int2 fencpos,
+                             read_only image2d_t fref0_planes,
+                             int2 qpos0,
+                             read_only image2d_t fref1_planes,
+                             int2 qpos1,
+                             int weight,
+                             local sum2_t *tmpp,
+                             int idx )
+{
+    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
+    sum2_t b0, b1, b2, b3;
+    sum2_t sum = 0;
+
+    // fencpos is full-pel position of original MB
+    // qpos0 is qpel position within reference frame 0
+    // qpos1 is qpel position within reference frame 1
+
+    /* For each reference, derive two nearest half-pel sample positions (A and
+     * B); averaging them below approximates the quarter-pel sample. */
+    int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
+    int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
+
+    int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
+    int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
+    int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
+
+    int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
+    int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
+
+    int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
+    int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
+    int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
+
+    /* each 32-bit texel of the fref planes packs four 8-bit hpel variants;
+     * the shift (8 * hpel index) selects which byte to extract */
+    uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B;
+    uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B;
+
+    uint vA, vB;
+    uint enc, ref0, ref1;
+    uint a0, a1;
+    const int weight2 = 64 - weight;
+
+/* rhadd() of the A/B hpel samples gives the qpel sample; the bipred value is
+ * the weighted average (ref0*weight + ref1*(64-weight) + 32) >> 6 */
+#define READ_BIDIR_DIFF( OUT, X )\
+    enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
+    vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\
+    vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\
+    ref0 = rhadd( vA, vB );\
+    vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\
+    vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\
+    ref1 = rhadd( vA, vB );\
+    OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
+
+/* two diffs are packed per sum2_t lane (high half shifted by BITS_PER_SUM) */
+#define READ_DIFF_EX( OUT, a, b )\
+    READ_BIDIR_DIFF( a0, a );\
+    READ_BIDIR_DIFF( a1, b );\
+    OUT = a0 + (a1<<BITS_PER_SUM);
+
+#define ROW_8x4_SATD( a, b, c )\
+    fencpos.y += a;\
+    fref0Apos.y += b;\
+    fref0Bpos.y += b;\
+    fref1Apos.y += c;\
+    fref1Bpos.y += c;\
+    READ_DIFF_EX( b0, 0, 4 );\
+    READ_DIFF_EX( b1, 1, 5 );\
+    READ_DIFF_EX( b2, 2, 6 );\
+    READ_DIFF_EX( b3, 3, 7 );\
+    HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
+    HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
+    sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
+
+    /* top 8x4 half, then bottom 8x4 half (rows advanced by 4) */
+    ROW_8x4_SATD( 0, 0, 0 );
+    ROW_8x4_SATD( 4, 4, 4 );
+
+#undef READ_BIDIR_DIFF
+#undef READ_DIFF_EX
+#undef ROW_8x4_SATD
+
+    /* fold the two packed 16-bit partial sums together, then halve */
+    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
+}
+
+/*
+ * mode selection - pick the least cost partition type for each 8x8 macroblock.
+ * Intra, list0 or list1. When measuring a B slice, also test three bidir
+ * possibilities.
+ *
+ * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that
+ * hold many frames worth of motion vectors. We must offset into the correct
+ * location for this frame's vectors:
+ *
+ * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
+ * GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count]
+ *
+ * global launch dimensions for P slice estimate: [mb_width, mb_height]
+ * global launch dimensions for B slice estimate: [mb_width * 4, mb_height]
+ *
+ * Output: lowres_costs[mb_xy] packs the winning cost (low bits, saturated to
+ * LOWRES_COST_MASK) and the winning list (0 = intra, 1 = L0, 2 = L1,
+ * 3 = bidir) shifted by LOWRES_COST_SHIFT. frame_stats[0..3] is zeroed here
+ * for the following sum_inter_cost kernel.
+ */
+kernel void mode_selection( read_only image2d_t fenc_lowres,
+ read_only image2d_t fref0_planes,
+ read_only image2d_t fref1_planes,
+ const global short2 *fenc_lowres_mvs0,
+ const global short2 *fenc_lowres_mvs1,
+ const global short2 *fref1_lowres_mvs0,
+ const global int16_t *fenc_lowres_mv_costs0,
+ const global int16_t *fenc_lowres_mv_costs1,
+ const global uint16_t *fenc_intra_cost,
+ global uint16_t *lowres_costs,
+ global int *frame_stats,
+ local int16_t *cost_local,
+ local sum2_t *satd_local,
+ int mb_width,
+ int bipred_weight,
+ int dist_scale_factor,
+ int b,
+ int p0,
+ int p1,
+ int lambda )
+{
+ int mb_x = get_global_id( 0 );
+ /* b < p1 means the measured frame lies before its forward reference: a B frame */
+ int b_bidir = b < p1;
+ if( b_bidir )
+ {
+ /* when mode_selection is run for B frames, it must perform BIDIR SATD
+ * measurements, so it is launched with four times as many threads in
+ * order to spread the work around more of the GPU. And it can add
+ * padding threads in the X direction. */
+ mb_x >>= 2;
+ if( mb_x >= mb_width )
+ return;
+ }
+ int mb_y = get_global_id( 1 );
+ int mb_height = get_global_size( 1 );
+ int mb_count = mb_width * mb_height;
+ int mb_xy = mb_x + mb_y * mb_width;
+
+ /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */
+ if( mb_x < 4 && mb_y == 0 )
+ frame_stats[mb_x] = 0;
+
+ int bcost = COST_MAX;
+ int list_used = 0;
+
+ if( !b_bidir )
+ {
+ int icost = fenc_intra_cost[mb_xy];
+ COPY2_IF_LT( bcost, icost, list_used, 0 );
+ }
+ if( b != p0 )
+ {
+ /* list0 candidate: MV cost for this frame's distance to its L0 reference */
+ int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
+ COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
+ }
+ if( b != p1 )
+ {
+ int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
+ COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
+ }
+
+ if( b_bidir )
+ {
+ /* 8x8 block origin in lowres pixel coordinates */
+ int2 coord = (int2)(mb_x, mb_y) << 3;
+ /* mb_i selects which of the four cooperating lanes this thread is;
+ * each group of four shares one cost_local[4] / satd_local[16] slice */
+ int mb_i = get_global_id( 0 ) & 3;
+ int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
+ cost_local += mb_in_group * 4;
+ satd_local += mb_in_group * 16;
+
+/* qpos0/qpos1 are quarter-pel positions: block origin scaled by 4 plus the MV */
+#define TRY_BIDIR( mv0, mv1, penalty )\
+{\
+ int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
+ int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
+ cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
+ int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
+ COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
+}
+
+ /* temporal prediction: scale the co-located list0 MV of the forward
+ * reference by temporal distance, (mv * scale + 128) >> 8 */
+ short2 dmv0, dmv1;
+ short2 mvr = fref1_lowres_mvs0[mb_xy];
+ dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8;
+ dmv1 = dmv0 - mvr;
+ TRY_BIDIR( dmv0, dmv1, 0 )
+
+ /* as_uint reinterprets the 4-byte short2 as one uint: a single nonzero test */
+ if( as_uint( dmv0 ) || as_uint( dmv1 ) )
+ {
+ /* B-direct prediction */
+ dmv0 = 0; dmv1 = 0;
+ TRY_BIDIR( dmv0, dmv1, 0 );
+ }
+
+ /* L0+L1 prediction */
+ dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
+ dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
+ TRY_BIDIR( dmv0, dmv1, 5 );
+#undef TRY_BIDIR
+ }
+
+ /* pack winning cost (saturated) and winning list into one uint16 */
+ lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
+}
+
+/*
+ * parallel sum inter costs
+ *
+ * Reduces the per-macroblock costs produced by mode_selection into per-row
+ * SATD totals and whole-frame statistics (COST_EST, COST_EST_AQ, INTRA_MBS).
+ *
+ * global launch dimensions: [256, mb_height]
+ */
+kernel void sum_inter_cost( const global uint16_t *fenc_lowres_costs,
+ const global uint16_t *inv_qscale_factor,
+ global int *fenc_row_satds,
+ global int *frame_stats,
+ int mb_width,
+ int bframe_bias,
+ int b,
+ int p0,
+ int p1 )
+{
+ int y = get_global_id( 1 );
+ int mb_height = get_global_size( 1 );
+
+ int row_satds = 0;
+ int cost_est = 0;
+ int cost_est_aq = 0;
+ int intra_mbs = 0;
+
+ /* grid-wide strided loop: 256 threads cover all MBs of row y */
+ for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
+ {
+ int mb_xy = x + y * mb_width;
+ int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
+ int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
+ /* border macroblocks are excluded from frame scoring unless the frame
+ * is too small (<= 2 MBs wide or tall) to have an interior */
+ int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
+
+ /* list 0 encodes "intra won" in mode_selection's packing */
+ if( list == 0 && b_frame_score_mb )
+ intra_mbs++;
+
+ /* adaptive-quant weighting with round-to-nearest */
+ int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
+
+ row_satds += cost_aq;
+
+ if( b_frame_score_mb )
+ {
+ cost_est += cost;
+ cost_est_aq += cost_aq;
+ }
+ }
+
+ local int buffer[256];
+ int x = get_global_id( 0 );
+
+ /* buffer is reused across reductions; parallel_sum is assumed to barrier
+ * internally so the reuse is safe — defined elsewhere in this file */
+ row_satds = parallel_sum( row_satds, x, buffer );
+ cost_est = parallel_sum( cost_est, x, buffer );
+ cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
+ intra_mbs = parallel_sum( intra_mbs, x, buffer );
+
+ /* b != p1 (i.e. b < p1): B frame — de-rate its estimated cost by bframe_bias */
+ if( b != p1 )
+ // Use floating point math to avoid 32bit integer overflow conditions
+ cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias));
+
+ if( get_global_id( 0 ) == 0 )
+ {
+ fenc_row_satds[y] = row_satds;
+ atomic_add( frame_stats + COST_EST, cost_est );
+ atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
+ atomic_add( frame_stats + INTRA_MBS, intra_mbs );
+ }
+}
--- /dev/null
+/*
+ * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
+ *
+ * --
+ *
+ * fenc_img is an output image (area of memory referenced through a texture
+ * cache). A read of any pixel location (x,y) returns four pixel values:
+ *
+ * val.s0 = P(x,y)
+ * val.s1 = P(x+1,y)
+ * val.s2 = P(x+2,y)
+ * val.s3 = P(x+3,y)
+ *
+ * This is a 4x replication of the lowres pixels, a trade-off between memory
+ * size and read latency.
+ *
+ * --
+ *
+ * hpel_planes is an output image that contains the four HPEL planes used for
+ * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with
+ * the four planar values C | V | H | F
+ *
+ * launch dimensions: [lowres-width, lowres-height]
+ */
+kernel void downscale_hpel( const global pixel *fenc,
+ write_only image2d_t fenc_img,
+ write_only image2d_t hpel_planes,
+ int stride )
+{
+ int x = get_global_id( 0 );
+ int y = get_global_id( 1 );
+ uint4 values;
+
+ /* advance to the pair of full-res rows that produce lowres row y */
+ fenc += y * stride * 2;
+ const global pixel *src1 = fenc + stride;
+ /* clamp at the bottom edge: last lowres row reuses src1 as its lower row */
+ const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
+ int2 pos = (int2)(x, y);
+ pixel right, left;
+
+ /* NOTE(review): the 'right'/'left' names look swapped relative to the column
+ * offsets they read; the rounded 2x2 average in values.s0 is unaffected */
+ right = rhadd( fenc[x*2], src1[x*2] );
+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
+ values.s0 = rhadd( right, left ); // F
+
+ right = rhadd( fenc[2*x+1], src1[2*x+1] );
+ left = rhadd( fenc[2*x+2], src1[2*x+2] );
+ values.s1 = rhadd( right, left ); // H
+
+ right = rhadd( src1[2*x], src2[2*x] );
+ left = rhadd( src1[2*x+1], src2[2*x+1] );
+ values.s2 = rhadd( right, left ); // V
+
+ right = rhadd( src1[2*x+1], src2[2*x+1] );
+ left = rhadd( src1[2*x+2], src2[2*x+2] );
+ values.s3 = rhadd( right, left ); // C
+
+ /* pack C|V|H|F into one 32-bit word; the scalar ORs broadcast, so every
+ * component of val holds the same packed value */
+ uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff);
+ write_imageui( hpel_planes, pos, val );
+
+ /* build the 4x replicated quad (s0 already holds F for pixel x); each step
+ * advances x by one lowres pixel, clamped at the right edge via select */
+ x = select( x, x+1, x+1 < get_global_size( 0 ) );
+ right = rhadd( fenc[x*2], src1[x*2] );
+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
+ values.s1 = rhadd( right, left );
+
+ x = select( x, x+1, x+1 < get_global_size( 0 ) );
+ right = rhadd( fenc[x*2], src1[x*2] );
+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
+ values.s2 = rhadd( right, left );
+
+ x = select( x, x+1, x+1 < get_global_size( 0 ) );
+ right = rhadd( fenc[x*2], src1[x*2] );
+ left = rhadd( fenc[x*2+1], src1[x*2+1] );
+ values.s3 = rhadd( right, left );
+
+ write_imageui( fenc_img, pos, values );
+}
+
+/*
+ * downscale lowres hierarchical motion search image, copy from one image to
+ * another decimated image. This kernel is called iteratively to generate all
+ * of the downscales.
+ *
+ * Both images use the packed-quad layout produced by downscale_hpel: a read
+ * at (x,y) yields pixels x..x+3 of row y. Each output pixel is the rounded
+ * 2x2 average of the higher-res image.
+ *
+ * launch dimensions: [lower_res width, lower_res height]
+ */
+kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
+{
+ int x = get_global_id( 0 );
+ int y = get_global_id( 1 );
+ int2 pos = (int2)(x, y);
+ int gs = get_global_size( 0 );
+ uint4 top, bot, values;
+ top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
+ bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
+ values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
+
+ /* these select statements appear redundant, and they should be, but tests break when
+ * they are not here. I believe this was caused by a driver bug
+ */
+ values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
+ /* fetch the next packed quad (4 pixels to the right) for output pixels x+2, x+3 */
+ top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
+ bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
+ values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
+ values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
+ write_imageui( lower_res, pos, (uint4)(values) );
+}
+
+/*
+ * Second copy of downscale kernel, no differences. This is a (no perf loss)
+ * workaround for a scheduling bug in current Tahiti drivers. This bug has
+ * theoretically been fixed in the July 2012 driver release from AMD.
+ *
+ * Keep this body byte-for-byte identical to downscale1 above.
+ */
+kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
+{
+ int x = get_global_id( 0 );
+ int y = get_global_id( 1 );
+ int2 pos = (int2)(x, y);
+ int gs = get_global_size( 0 );
+ uint4 top, bot, values;
+ top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
+ bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
+ values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
+
+ // see comment in above function copy
+ values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
+ top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
+ bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
+ values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
+ values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
+ write_imageui( lower_res, pos, (uint4)(values) );
+}
+
+/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2.
+ * One work-item writes one int16_t element; launch with one thread per element. */
+kernel void memset_int16( global int16_t *buf, int16_t value )
+{
+ size_t idx = get_global_id( 0 );
+ buf[idx] = value;
+}
--- /dev/null
+/* Lookahead lowres intra analysis
+ *
+ * Each intra analysis function has been implemented twice, once for scalar GPUs
+ * (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects
+ * the GPU type and sets the -DVECTORIZE compile flag accordingly.
+ *
+ * All the intra analysis functions were based on their C versions in pixel.c
+ * and produce the exact same results.
+ */
+
+/* Force all clamp() arguments and the return value to int: OpenCL's clamp is
+ * overloaded per element type, so mixed argument types (e.g. pixel vs. int
+ * literals) would otherwise make the call ambiguous. */
+#define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) )
+
+#if VECTORIZE
+/* SATD (Hadamard-transformed SAD) of an 8x4 block of source pixels against
+ * four 8-wide predicted rows pr0..pr3, vectorized for VLIW/SIMD GPUs.
+ * The .s04152637 shuffle interleaves the residual so each int2 lane carries
+ * a pair of columns; HADAMARD4V then transforms two 4-sample columns at once.
+ * Result is halved at the end, matching the C reference SATD. */
+int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
+{
+ int8 a_v, d_v;
+ int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
+ int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
+
+ /* horizontal pass, one row at a time: residual = data - prediction */
+ d_v = convert_int8( vload8( 0, data ) );
+ a_v.s01234567 = (d_v - pr0).s04152637;
+ HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
+
+ data += data_stride;
+ d_v = convert_int8( vload8( 0, data ) );
+ a_v.s01234567 = (d_v - pr1).s04152637;
+ HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
+
+ data += data_stride;
+ d_v = convert_int8( vload8( 0, data ) );
+ a_v.s01234567 = (d_v - pr2).s04152637;
+ HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
+
+ data += data_stride;
+ d_v = convert_int8( vload8( 0, data ) );
+ a_v.s01234567 = (d_v - pr3).s04152637;
+ HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
+
+ uint8 sum_v;
+
+ /* vertical pass over the transposed intermediates; accumulate |coeff| */
+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
+ sum_v = abs( a_v );
+
+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
+ sum_v += abs( a_v );
+
+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
+ sum_v += abs( a_v );
+
+ HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
+ sum_v += abs( a_v );
+
+ /* lane reduction: 8 -> 4 -> 2 -> 1, then >>1 per SATD convention */
+ uint4 sum2 = sum_v.hi + sum_v.lo;
+ uint2 sum3 = sum2.hi + sum2.lo;
+ return ( sum3.hi + sum3.lo ) >> 1;
+}
+#else
+/* scalar GPUs: instantiate the C-style SATD template (defined elsewhere) */
+SATD_C_8x4_Q( satd_8x4_lp, const local, private )
+#endif
+
+/****************************************************************************
+ * 8x8 prediction for intra luma block
+ ****************************************************************************/
+
+/* F1: rounding average of two samples (half-pel filter).
+ * F2: 3-tap (1,2,1) smoothing filter with rounding.
+ * Arguments and the full expansion are parenthesized so the macros remain
+ * correct inside larger expressions; the original unparenthesized form was
+ * safe only because every call site used the result as a complete RHS. */
+#define F1 rhadd
+#define F2( a, b, c ) (( (a) + 2*(b) + (c) + 2 ) >> 2)
+
+#if VECTORIZE
+/* Diagonal-down-left 8x8 intra prediction cost: builds the predicted block
+ * row-by-row from the top neighbors (top[0..15], (1,2,1)-filtered) and returns
+ * the SATD against the source, computed as two 8x4 halves. Each pr row is
+ * shifted one sample further along the diagonal than the previous. */
+int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
+{
+ int8 pr0, pr1, pr2, pr3;
+
+ // Upper half of pred[]
+ pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
+ pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
+ pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
+ pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
+ pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+ pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+ pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+ pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+
+ pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
+ pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
+ pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
+ pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+ pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+ pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+ pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+ pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+
+ pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
+ pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
+ pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+ pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+ pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+ pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+ pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+ pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+
+ pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
+ pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+ pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+ pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+ pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+ pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+ pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+ pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+ // Lower half of pred[]
+ pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+ pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+ pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+ pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+ pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+ pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+ pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+ pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
+
+ pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+ pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+ pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+ pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+ pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+ pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+ pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
+ pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
+
+ pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+ pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+ pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+ pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+ pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+ pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
+ pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
+ pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
+
+ pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+ pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+ pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+ pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+ pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
+ pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
+ pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
+ /* last diagonal sample: edge is replicated, hence the 3*top[15] weight */
+ pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
+
+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+/* Diagonal-down-right 8x8 intra prediction cost: the prediction is built from
+ * the left column, the top-left corner and the top row, (1,2,1)-filtered along
+ * the diagonal. Chained assignments fill each anti-diagonal of the 8x4 half
+ * with a single filtered value; SATD is accumulated over both halves. */
+int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+ int8 pr0, pr1, pr2, pr3;
+
+ // Upper half of pred[]
+ pr3.s0 = F2( left[1], left[2], left[3] );
+ pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
+ pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
+ pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
+ pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
+ pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
+ pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
+ pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
+ pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
+ pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
+ pr0.s7 = F2( top[5], top[6], top[7] );
+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+ // Lower half of pred[]
+ pr3.s0 = F2( left[5], left[6], left[7] );
+ pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
+ pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
+ pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
+ pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
+ pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
+ pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
+ pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
+ pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
+ pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
+ pr0.s7 = F2( top[1], top[2], top[3] );
+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+/* Vertical-right 8x8 intra prediction cost: even rows use half-pel averages
+ * (F1) of the top neighbors, odd rows use (1,2,1)-filtered values (F2); the
+ * left column feeds the lower-left samples. SATD over two 8x4 halves.
+ * (Removed a stray "#undef PRED" left over from an earlier macro-based
+ * version — no PRED macro is defined in this translation unit.) */
+int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+ int8 pr0, pr1, pr2, pr3;
+
+ // Upper half of pred[]
+ pr2.s0 = F2( left[1], left[0], left_top );
+ pr3.s0 = F2( left[2], left[1], left[0] );
+ pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
+ pr0.s0 = pr2.s1 = F1( left_top, top[0] );
+ pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
+ pr0.s1 = pr2.s2 = F1( top[0], top[1] );
+ pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
+ pr0.s2 = pr2.s3 = F1( top[1], top[2] );
+ pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
+ pr0.s3 = pr2.s4 = F1( top[2], top[3] );
+ pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
+ pr0.s4 = pr2.s5 = F1( top[3], top[4] );
+ pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
+ pr0.s5 = pr2.s6 = F1( top[4], top[5] );
+ pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
+ pr0.s6 = pr2.s7 = F1( top[5], top[6] );
+ pr1.s7 = F2( top[5], top[6], top[7] );
+ pr0.s7 = F1( top[6], top[7] );
+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+ // Lower half of pred[]
+ pr2.s0 = F2( left[5], left[4], left[3] );
+ pr3.s0 = F2( left[6], left[5], left[4] );
+ pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
+ pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
+ pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
+ pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
+ pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
+ pr0.s2 = pr2.s3 = F1( left_top, top[0] );
+ pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
+ pr0.s3 = pr2.s4 = F1( top[0], top[1] );
+ pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
+ pr0.s4 = pr2.s5 = F1( top[1], top[2] );
+ pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
+ pr0.s5 = pr2.s6 = F1( top[2], top[3] );
+ pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
+ pr0.s6 = pr2.s7 = F1( top[3], top[4] );
+ pr1.s7 = F2( top[3], top[4], top[5] );
+ pr0.s7 = F1( top[4], top[5] );
+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+/* Horizontal-down 8x8 intra prediction cost: each row interleaves half-pel
+ * averages (F1) of the left column with (1,2,1)-filtered values, sliding down
+ * the left neighbors one sample per row; the first rows also draw from the
+ * top row through the top-left corner. SATD over two 8x4 halves. */
+int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+ int8 pr0, pr1, pr2, pr3;
+
+ // Upper half of pred[]
+ pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
+ pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
+ pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
+ pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );
+
+ pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
+ pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
+ pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
+ pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );
+
+ pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+ pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
+ pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
+ pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );
+
+ pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+ pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+ pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
+ pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+ // Lower half of pred[]
+ pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+ pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+ pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+ pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
+
+ pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+ pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+ pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+ pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+
+ pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+ pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+ pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+ pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+
+ pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+ pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+ pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+ pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+ return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+/* Vertical-left 8x8 intra prediction cost: even rows take half-pel averages
+ * (F1) of adjacent top neighbors, odd rows take (1,2,1)-filtered values (F2),
+ * each row pair stepping one sample to the right. SATD over two 8x4 halves. */
+int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
+{
+ int8 pr0, pr1, pr2, pr3;
+
+ // Upper half of pred[]
+ pr0.s0 = F1( top[0], top[1] );
+ pr1.s0 = F2( top[0], top[1], top[2] );
+ pr2.s0 = pr0.s1 = F1( top[1], top[2] );
+ pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
+ pr2.s1 = pr0.s2 = F1( top[2], top[3] );
+ pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
+ pr2.s2 = pr0.s3 = F1( top[3], top[4] );
+ pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
+ pr2.s3 = pr0.s4 = F1( top[4], top[5] );
+ pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
+ pr2.s4 = pr0.s5 = F1( top[5], top[6] );
+ pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
+ pr2.s5 = pr0.s6 = F1( top[6], top[7] );
+ pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
+ pr2.s6 = pr0.s7 = F1( top[7], top[8] );
+ pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
+ pr2.s7 = F1( top[8], top[9] );
+ pr3.s7 = F2( top[8], top[9], top[10] );
+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+ // Lower half of pred[]
+ pr0.s0 = F1( top[2], top[3] );
+ pr1.s0 = F2( top[2], top[3], top[4] );
+ pr2.s0 = pr0.s1 = F1( top[3], top[4] );
+ pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
+ pr2.s1 = pr0.s2 = F1( top[4], top[5] );
+ pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
+ pr2.s2 = pr0.s3 = F1( top[5], top[6] );
+ pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
+ pr2.s3 = pr0.s4 = F1( top[6], top[7] );
+ pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
+ pr2.s4 = pr0.s5 = F1( top[7], top[8] );
+ pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
+ pr2.s5 = pr0.s6 = F1( top[8], top[9] );
+ pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
+ pr2.s6 = pr0.s7 = F1( top[9], top[10] );
+ pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
+ pr2.s7 = F1( top[10], top[11] );
+ pr3.s7 = F2( top[10], top[11], top[12] );
+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+/* Horizontal-up 8x8 intra prediction cost: built from the left column only,
+ * interleaving half-pel averages (F1) with (1,2,1)-filtered values; past the
+ * last left neighbor the prediction flattens to left[7]. SATD over two 8x4
+ * halves. */
+int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
+{
+ int8 pr0, pr1, pr2, pr3;
+
+ // Upper half of pred[]
+ pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+ pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+ pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+ pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+
+ pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+ pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+ pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+ pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+
+ pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+ pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+ pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+ pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+
+ pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+ pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+ pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+ pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+ // Lower half of pred[]
+ pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+ pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+ pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+ pr0.s6 = left[7]; pr0.s7 = left[7];
+
+ pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+ pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+ pr1.s4 = left[7]; pr1.s5 = left[7];
+ pr1.s6 = left[7]; pr1.s7 = left[7];
+
+ pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+ pr2.s2 = left[7]; pr2.s3 = left[7];
+ pr2.s4 = left[7]; pr2.s5 = left[7];
+ pr2.s6 = left[7]; pr2.s7 = left[7];
+
+ /* final row is entirely the replicated bottom-left neighbor */
+ pr3 = (int8)left[7];
+
+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+/* Horizontal 8x8 chroma prediction cost: each predicted row replicates the
+ * pixel immediately to its left (src[-1]). src walks down the rows while
+ * src_l keeps the block origin for the SATD calls. */
+int x264_predict_8x8c_h( const local pixel *src, int src_stride )
+{
+ const local pixel *src_l = src;
+ int8 pr0, pr1, pr2, pr3;
+
+ // Upper half of pred[]
+ pr0 = (int8)src[-1]; src += src_stride;
+ pr1 = (int8)src[-1]; src += src_stride;
+ pr2 = (int8)src[-1]; src += src_stride;
+ pr3 = (int8)src[-1]; src += src_stride;
+ int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
+
+ //Lower half of pred[]
+ pr0 = (int8)src[-1]; src += src_stride;
+ pr1 = (int8)src[-1]; src += src_stride;
+ pr2 = (int8)src[-1]; src += src_stride;
+ pr3 = (int8)src[-1];
+ return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+/* Vertical 8x8 chroma prediction cost: every predicted row repeats the row of
+ * pixels directly above the block; SATD is accumulated over the upper and
+ * lower 8x4 halves. */
+int x264_predict_8x8c_v( const local pixel *src, int src_stride )
+{
+ int8 row_above = convert_int8( vload8( 0, &src[-src_stride] ) );
+ int satd = satd_8x4_intra_lr( src, src_stride, row_above, row_above, row_above, row_above );
+ satd += satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, row_above, row_above, row_above, row_above );
+ return satd;
+}
+
+/* Planar 8x8 chroma prediction cost: fits a plane through the top row and
+ * left column. H and V are the weighted horizontal/vertical gradients; a is
+ * the corner anchor, b and c the per-column and per-row slopes. i00 tracks
+ * the row-start accumulator (advanced by c each row), pix steps across each
+ * row by b, and x264_clip_pixel (defined elsewhere) clamps each >>5 sample
+ * to pixel range. SATD over two 8x4 halves. */
+int x264_predict_8x8c_p( const local pixel *src, int src_stride )
+{
+ int H = 0, V = 0;
+ for( int i = 0; i < 4; i++ )
+ {
+ H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
+ V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
+ }
+
+ int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
+ int b = (17 * H + 16) >> 5;
+ int c = (17 * V + 16) >> 5;
+ int i00 = a - 3 * b - 3 * c + 16;
+
+ // Upper half of pred[]
+ int pix = i00;
+ int8 pr0, pr1, pr2, pr3;
+ pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+ pix = i00;
+ pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+ pix = i00;
+ pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+ pix = i00;
+ pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+ int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+ //Lower half of pred[]
+ pix = i00;
+ pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+ pix = i00;
+ pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+ pix = i00;
+ pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+ pix = i00;
+ pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+ pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+ return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+/* SATD cost of 8x8 DC prediction (vectorized variant).
+ *
+ * s0/s1 sum the left/right 4-pixel halves of the top edge; s2/s3 sum the
+ * upper/lower halves of the left edge.  Each 4x4 quadrant gets its own DC
+ * value per the H.264 chroma-DC rules: corner quadrants average both edges,
+ * the other two use the single nearer edge. */
+int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
+{
+    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+    for( int i = 0; i < 4; i++ )
+    {
+        s0 += src[i - src_stride];
+        s1 += src[i + 4 - src_stride];
+        s2 += src[-1 + i * src_stride];
+        s3 += src[-1 + (i+4)*src_stride];
+    }
+
+    // Upper half of pred[]: left quadrant (s0+s2+4)>>3, right quadrant (s1+2)>>2
+    int8 dc0;
+    dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
+    dc0.hi = (int4)( (s1 + 2) >> 2 );
+    int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
+
+    // Lower half of pred[]: left quadrant (s3+2)>>2, right quadrant (s1+s3+4)>>3
+    dc0.lo = (int4)( (s3 + 2) >> 2 );
+    dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
+    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
+}
+
+#else /* not vectorized: private memory is cheap, but registers are scarce */
+
+/* SATD cost of Intra_8x8 diagonal-down-left prediction.
+ *
+ * top[] holds 16 filtered above-neighbour pixels.  Each predicted pixel is a
+ * (1,2,1)/4 filter over top[x+y .. x+y+2]; x+y is clamped to 13 so the 3-tap
+ * window stays inside top[0..15]. */
+int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
+{
+    private pixel pred[32];
+
+    // Upper half of pred[] (block rows 0-3)
+    for( int y = 0; y < 4; y++ )
+    {
+        for( int x = 0; x < 8; x++ )
+        {
+            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
+            pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
+        }
+    }
+    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    // Lower half of pred[] (block rows 4-7, reusing the same 8x4 buffer)
+    for( int y = 4; y < 8; y++ )
+    {
+        for( int x = 0; x < 8; x++ )
+        {
+            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
+            pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
+        }
+    }
+    // Bottom-right pixel: only the last two top samples contribute
+    pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+}
+
+/* SATD cost of Intra_8x8 diagonal-down-right prediction.
+ *
+ * Predicted values are constant along down-right anti-diagonals; each value
+ * is a 3-tap (1,2,1)/4 filter (F2) over the filtered left column, the corner
+ * pixel (left_top) and the filtered top row. */
+int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+    private pixel pred[32];
+#define PRED( x, y ) pred[(x) + (y)*8]
+    // Upper half of pred[] (rows 0-3): diagonals run from left[3] up to top[7]
+    PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
+    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
+    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
+    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
+    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
+    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
+    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
+    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
+    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
+    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
+    PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
+    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
+
+    // Lower half of pred[] (rows 4-7): same diagonals shifted 4 rows down the left column
+    PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
+    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
+    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
+    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
+    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
+    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
+    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
+    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
+    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
+    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
+    PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+#undef PRED
+}
+
+/* SATD cost of Intra_8x8 vertical-right prediction.
+ *
+ * Values repeat with a (1 right, 2 down) step; positions reached at full-pel
+ * offsets use 2-tap averages (F1), the in-between positions use 3-tap
+ * (1,2,1)/4 filters (F2). */
+int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+    private pixel pred[32];
+#define PRED( x, y ) pred[(x) + (y)*8]
+    // Upper half of pred[] (rows 0-3)
+    PRED( 0, 2 ) = F2( left[1], left[0], left_top );
+    PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
+    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
+    PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
+    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
+    PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
+    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
+    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
+    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
+    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
+    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
+    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
+    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
+    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
+    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
+    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
+    PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
+    PRED( 7, 0 ) = F1( top[6], top[7] );
+    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
+
+    // Lower half of pred[] (rows 4-7): sourced further down the left column
+    PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
+    PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
+    PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
+    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
+    PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
+    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
+    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
+    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
+    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
+    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
+    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
+    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
+    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
+    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
+    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
+    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
+    PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
+    PRED( 7, 0 ) = F1( top[4], top[5] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+#undef PRED
+}
+
+/* Join two 16-bit quantities into one 32-bit word: a in the low half, b in the high half. */
+inline uint32_t pack16to32( uint32_t a, uint32_t b )
+{
+    uint32_t hi = b << 16;
+    return hi + a;
+}
+
+/* Join two 8-bit quantities into one 16-bit value: a in the low byte, b in the high byte. */
+inline uint32_t pack8to16( uint32_t a, uint32_t b )
+{
+    uint32_t hi = b << 8;
+    return hi + a;
+}
+
+/* SATD cost of Intra_8x8 horizontal-down prediction.
+ *
+ * Each pN packs two adjacent predicted pixels into one int (pack8to16: a
+ * 2-tap then a 3-tap filtered value); pack16to32 joins two pairs so each
+ * vstore4 writes four pixels of a row at once.  Values repeat down-left by
+ * one pair per row, so consecutive rows reuse overlapping pN pairs. */
+int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+    private pixel pred[32];
+    int satd;
+    int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
+    int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
+    int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
+    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
+    int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
+    int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
+    int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
+    int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
+    int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
+    int p10 = pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
+    int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
+    // Upper half of pred[] (rows 0-3): two 4-pixel stores per row
+    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
+    vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
+    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
+    vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
+    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
+    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
+    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
+    satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    // Lower half of pred[] (rows 4-7): continue shifting down the pair sequence
+    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
+    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
+    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
+    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
+    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
+    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
+    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
+    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+}
+
+/* SATD cost of Intra_8x8 vertical-left prediction.
+ *
+ * Even rows are 2-tap averages (F1) of the top row, odd rows are 3-tap
+ * (1,2,1)/4 filters (F2); the sampled window shifts right by one every two
+ * rows, so the lower half starts two samples further along top[]. */
+int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
+{
+    private pixel pred[32];
+    int satd;
+#define PRED( x, y ) pred[(x) + (y)*8]
+    // Upper half of pred[] (rows 0-3)
+    PRED( 0, 0 ) = F1( top[0], top[1] );
+    PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
+    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
+    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
+    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
+    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
+    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
+    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
+    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
+    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
+    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
+    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
+    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
+    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
+    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
+    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
+    PRED( 7, 2 ) = F1( top[8], top[9] );
+    PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
+    satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    // Lower half of pred[] (rows 4-7): same pattern, window advanced by 2
+    PRED( 0, 0 ) = F1( top[2], top[3] );
+    PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
+    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
+    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
+    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
+    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
+    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
+    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
+    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
+    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
+    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
+    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
+    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
+    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
+    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
+    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
+    PRED( 7, 2 ) = F1( top[10], top[11] );
+    PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+#undef PRED
+}
+
+/* SATD cost of Intra_8x8 horizontal-up prediction.
+ *
+ * pN are pairs of predicted pixels built from the left column (pack8to16:
+ * 2-tap average then 3-tap filter).  p7's second tap reuses left[7] and p8
+ * replicates left[7] outright -- positions past the last left neighbour are
+ * edge-extended.  pack16to32 + vstore4 write four pixels per store. */
+int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
+{
+    private pixel pred[32];
+    int satd;
+    int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
+    int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
+    int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
+    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
+    int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
+    int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
+    int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
+    int p8 = pack8to16( left[7], left[7] );
+    // Upper half of pred[] (rows 0-3): each row starts one pair further up
+    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
+    satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    // Lower half of pred[] (rows 4-7): tail rows saturate to the replicated p8
+    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+}
+
+/* SATD cost of horizontal prediction: every row of the 8x8 block is filled
+ * with the reconstructed pixel immediately to its left (row[-1]). */
+int x264_predict_8x8c_h( const local pixel *src, int src_stride )
+{
+    private pixel pred[32];
+    const local pixel *row = src;
+
+    // Upper half of pred[]: rows 0-3 each replicate their left neighbour
+    for( int i = 0; i < 4; i++, row += src_stride )
+        vstore8( (uchar8)(row[-1]), i, pred );
+    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
+
+    // Lower half of pred[]: rows 4-7
+    for( int i = 0; i < 4; i++, row += src_stride )
+        vstore8( (uchar8)(row[-1]), i, pred );
+    return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+}
+
+/* SATD cost of vertical prediction: all eight rows replicate the row of
+ * pixels directly above the block. */
+int x264_predict_8x8c_v( const local pixel *src, int src_stride )
+{
+    private pixel pred[32];
+    uchar8 above = vload8( 0, &src[-src_stride] );
+    uchar16 two_rows = (uchar16)( above, above );
+
+    // pred[] holds four identical rows; both 8x4 halves reuse the same buffer
+    vstore16( two_rows, 0, pred );
+    vstore16( two_rows, 1, pred );
+
+    return satd_8x4_lp( src, src_stride, pred, 8 ) +
+           satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
+}
+
+/* SATD cost of 8x8 planar (plane) prediction.
+ *
+ * H and V are weighted gradients of the top row and left column; b and c are
+ * the per-pixel horizontal/vertical steps, a the DC anchor from the two far
+ * corners, and i00 the running value at the start of each row.  Each pixel
+ * is clipped after a >>5 descale. */
+int x264_predict_8x8c_p( const local pixel *src, int src_stride )
+{
+    int H = 0, V = 0;
+    private pixel pred[32];
+    int satd;
+
+    for( int i = 0; i < 4; i++ )
+    {
+        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
+        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
+    }
+
+    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
+    int b = (17 * H + 16) >> 5;
+    int c = (17 * V + 16) >> 5;
+    int i00 = a - 3 * b - 3 * c + 16;
+
+    // Upper half of pred[] (rows 0-3)
+    for( int y = 0; y < 4; y++ )
+    {
+        int pix = i00;
+        for( int x = 0; x < 8; x++ )
+        {
+            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
+            pix += b;
+        }
+        i00 += c;
+    }
+    satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    // Lower half of pred[]: i00 has already advanced 4 rows, so reuse the buffer
+    for( int y = 0; y < 4; y++ )
+    {
+        int pix = i00;
+        for( int x = 0; x < 8; x++ )
+        {
+            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
+            pix += b;
+        }
+        i00 += c;
+    }
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+}
+
+/* SATD cost of 8x8 DC prediction (scalar variant).
+ *
+ * s0/s1 sum the left/right halves of the top edge, s2/s3 the upper/lower
+ * halves of the left edge.  Each 4x4 quadrant gets its own DC value: corner
+ * quadrants average both adjacent edges, the others use the nearer edge. */
+int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
+{
+    private pixel pred[32];
+    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+    for( int i = 0; i < 4; i++ )
+    {
+        s0 += src[i - src_stride];
+        s1 += src[i + 4 - src_stride];
+        s2 += src[-1 + i * src_stride];
+        s3 += src[-1 + (i+4)*src_stride];
+    }
+
+    // Upper half of pred[]: left quadrant (s0+s2+4)>>3, right quadrant (s1+2)>>2
+    uchar8 dc0;
+    dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
+    dc0.hi = (uchar4)( (s1 + 2) >> 2 );
+    vstore8( dc0, 0, pred );
+    vstore8( dc0, 1, pred );
+    vstore8( dc0, 2, pred );
+    vstore8( dc0, 3, pred );
+    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
+
+    // Lower half of pred[]: left quadrant (s3+2)>>2, right quadrant (s1+s3+4)>>3
+    dc0.lo = (uchar4)( (s3 + 2) >> 2 );
+    dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
+    vstore8( dc0, 0, pred );
+    vstore8( dc0, 1, pred );
+    vstore8( dc0, 2, pred );
+    vstore8( dc0, 3, pred );
+    return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+}
+#endif
+
+/* Find the least cost intra mode for 32 8x8 macroblocks per workgroup
+ *
+ * Loads 33 macroblocks plus the pixels directly above them into local memory,
+ * padding where necessary with edge pixels. It then cooperatively calculates
+ * smoothed top and left pixels for use in some of the analysis.
+ *
+ * Then groups of 32 threads each calculate a single intra mode for each 8x8
+ * block. Since consecutive threads are calculating the same intra mode there
+ * is no code-path divergence. 8 intra costs are calculated simultaneously. If
+ * the "slow" argument is not zero, the final two (least likely) intra modes are
+ * tested in a second pass. The slow mode is only enabled for presets slow,
+ * slower, and placebo.
+ *
+ * This allows all of the pixel functions to read pixels from local memory, and
+ * avoids re-fetching edge pixels from global memory. And it allows us to
+ * calculate all of the intra mode costs simultaneously without branch divergence.
+ *
+ * Local dimension: [ 32, 8 ]
+ * Global dimensions: [ paddedWidth, height ] */
+kernel void mb_intra_cost_satd_8x8( read_only image2d_t fenc,
+                                    global uint16_t *fenc_intra_cost,
+                                    global int *frame_stats,
+                                    int lambda,
+                                    int mb_width,
+                                    int slow )
+{
+/* one cache row = 1 left-edge pixel + 33 MBs * 8 pixels = 265; BLOCK_OFFSET is
+ * the first pixel of the first MB (one row down, one column right of cache[0]) */
+#define CACHE_STRIDE 265
+#define BLOCK_OFFSET 266
+    local pixel cache[2385];    // 9 rows * CACHE_STRIDE pixel cache
+    local int cost_buf[32];     // running best intra cost, one per MB
+    local pixel top[32 * 16];   // filtered top edge, 16 samples per MB
+    local pixel left[32 * 8];   // filtered left edge, 8 samples per MB
+    local pixel left_top[32];   // filtered top-left corner, one per MB
+
+    int lx = get_local_id( 0 );
+    int ly = get_local_id( 1 );
+    int gx = get_global_id( 0 );
+    int gy = get_global_id( 1 );
+    int gidx = get_group_id( 0 );
+    int gidy = get_group_id( 1 );
+    int linear_id = ly * get_local_size( 0 ) + lx;
+    int satd = COST_MAX;
+    int basex = gidx << 8;          // 32 MBs * 8 pixels per work-group
+    int basey = (gidy << 3) - 1;    // start one row above this MB row
+
+    /* Load 33 8x8 macroblocks and the pixels above them into local cache.
+     * 66 threads ((33<<3)>>2) each load 4 consecutive pixels per row. */
+    for( int y = 0; y < 9 && linear_id < (33<<3)>>2; y++ )
+    {
+        int x = linear_id << 2;
+        uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
+        cache[y * CACHE_STRIDE + 1 + x] = data.s0;
+        cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
+        cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
+        cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
+    }
+    /* load pixels on left edge (column 0 of each cache row) */
+    if( linear_id < 9 )
+        cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;
+
+    barrier( CLK_LOCAL_MEM_FENCE );
+
+    // Cooperatively build the top edge for the macroblock using lowpass filter;
+    // each of the 8 threads per MB (ly) computes samples j and j+8 of the 16.
+    // clamp_int keeps the 3-tap window inside [-1..15] relative to the MB.
+    int j = ly;
+    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
+                     2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
+                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
+    j += 8;
+    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
+                     2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
+                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
+    // Cooperatively build the left edge for the macroblock using lowpass filter.
+    // For ly == 0 the (ly - 1) term reads the row above the block (the corner row).
+    left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
+                      2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
+                        cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2;
+    // One left_top per macroblock; thread row 0 also initializes the cost buffer
+    if( 0 == ly )
+    {
+        left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
+                         cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
+        cost_buf[lx] = COST_MAX;
+    }
+    barrier( CLK_LOCAL_MEM_FENCE );
+
+    // each warp/wavefront generates a different prediction type; no divergence.
+    // H/V/DC/P reuse the 8x8c predictors; directional modes use the filtered edges.
+    switch( ly )
+    {
+        case 0:
+            satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
+            break;
+        case 1:
+            satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
+            break;
+        case 2:
+            satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
+            break;
+        case 3:
+            satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
+            break;
+        case 4:
+            satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
+            break;
+        case 5:
+            satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
+            break;
+        case 6:
+            satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
+            break;
+        case 7:
+            satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
+            break;
+        default:
+            break;
+    }
+    /* fold this mode's cost into the MB's running minimum */
+    atom_min( &cost_buf[lx], satd );
+    if( slow )
+    {
+        // Do the remaining two (least likely) prediction modes
+        switch( ly )
+        {
+            case 0: // DDL
+                satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
+                atom_min( &cost_buf[lx], satd );
+                break;
+            case 1: // VL
+                satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
+                atom_min( &cost_buf[lx], satd );
+                break;
+            default:
+                break;
+        }
+    }
+    barrier( CLK_LOCAL_MEM_FENCE );
+
+    // one thread per MB writes the best cost; 5*lambda is a flat mode-signalling
+    // bias -- NOTE(review): presumably mirrors the CPU lookahead's constant; confirm
+    if( (0 == ly) && (gx < mb_width) )
+        fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
+
+    // initialize the frame_stats[2] buffer for kernel sum_intra_cost().
+    if( gx < 2 && gy == 0 )
+        frame_stats[gx] = 0;
+#undef CACHE_STRIDE
+#undef BLOCK_OFFSET
+}
+
+/*
+ * parallel sum intra costs
+ *
+ * global launch dimensions: [256, mb_height]
+ */
+/* Reduce per-MB intra costs into per-row SATD totals and frame-wide cost
+ * estimates (raw and AQ-weighted), accumulated via atomics. */
+kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost,
+                            const global uint16_t *inv_qscale_factor,
+                            global int *fenc_row_satds,
+                            global int *frame_stats,
+                            int mb_width )
+{
+    int y = get_global_id( 1 );
+    int mb_height = get_global_size( 1 );
+
+    int satd_sum = 0;
+    int est = 0;
+    int est_aq = 0;
+
+    /* grid-stride walk across this MB row */
+    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
+    {
+        int idx = x + y * mb_width;
+        int c = fenc_intra_cost[idx];
+        int c_aq = (c * inv_qscale_factor[idx] + 128) >> 8;
+        /* border MBs are excluded from frame scoring unless the frame is tiny */
+        int in_score_region = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
+
+        satd_sum += c_aq;
+        if( in_score_region )
+        {
+            est += c;
+            est_aq += c_aq;
+        }
+    }
+
+    local int scratch[256];
+    int gid = get_global_id( 0 );
+
+    /* three sequential work-group reductions share one scratch buffer */
+    satd_sum = parallel_sum( satd_sum, gid, scratch );
+    est = parallel_sum( est, gid, scratch );
+    est_aq = parallel_sum( est_aq, gid, scratch );
+
+    if( gid == 0 )
+    {
+        fenc_row_satds[y] = satd_sum;
+        atomic_add( frame_stats + COST_EST, est );
+        atomic_add( frame_stats + COST_EST_AQ, est_aq );
+    }
+}
--- /dev/null
+/* Hierarchical (iterative) OpenCL lowres motion search */
+
+/* Map a macroblock coordinate to its index in the half-resolution grid.
+ * Edge macroblocks might not have a direct descendant, so the last column/row
+ * falls back to the nearest valid downscaled MB. */
+inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
+{
+    int dx = (x == mb_width - 1) ? (x - (mb_width & 1)) >> 1 : x >> 1;
+    int dy = (y == mb_height - 1) ? (y - (mb_height & 1)) >> 1 : y >> 1;
+    return (mb_width >> 1) * dy + dx;
+}
+
+/* Four threads cooperatively calculate an 8x8 SAD; each thread (idx 0-3)
+ * handles two rows, publishes its partial sum in costs[idx], and returns the
+ * combined total.
+ * NOTE(review): there is no barrier between the costs[] write and the final
+ * read -- this relies on the four threads executing in the same SIMD group. */
+int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
+{
+    frefpos.y += idx << 1;
+    fencpos.y += idx << 1;
+    int cost = 0;
+    if( frefpos.x < 0 )
+    {
+        /* slow path when MV goes past left edge. The GPU clamps reads from
+         * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really
+         * want are [0, 0, 1, 2]
+         */
+        for( int y = 0; y < 2; y++ )
+        {
+            for( int x = 0; x < 8; x++ )
+            {
+                pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
+                pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
+                cost += abs_diff( enc, ref );
+            }
+        }
+    }
+    else
+    {
+        /* fast path: 4-pixel-wide vector reads.  This uint4 `costs` deliberately
+         * shadows the local-memory parameter within this scope only. */
+        uint4 enc, ref, costs = 0;
+        enc = read_imageui( fenc, sampler, fencpos );
+        ref = read_imageui( fref, sampler, frefpos );
+        costs += abs_diff( enc, ref );
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
+        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
+        costs += abs_diff( enc, ref );
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
+        ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
+        costs += abs_diff( enc, ref );
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
+        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
+        costs += abs_diff( enc, ref );
+        cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
+    }
+    /* publish this thread's partial sum (the parameter, not the shadow above),
+     * then sum all four threads' contributions */
+    costs[idx] = cost;
+    return costs[0] + costs[1] + costs[2] + costs[3];
+}
+
+/* One thread computes a full 8x8 SAD between fenc and fref at the given
+ * integer-pel positions. */
+int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
+{
+    if( frefpos.x >= 0 )
+    {
+        /* fast path: 4-pixel-wide vector reads, accumulated per lane */
+        uint4 enc, ref, acc = 0;
+        for( int row = 0; row < 8; row++ )
+        {
+            for( int col = 0; col < 8; col += 4 )
+            {
+                enc = read_imageui( fenc, sampler, fencpos + (int2)(col, row) );
+                ref = read_imageui( fref, sampler, frefpos + (int2)(col, row) );
+                acc += abs_diff( enc, ref );
+            }
+        }
+        return acc.s0 + acc.s1 + acc.s2 + acc.s3;
+    }
+    else
+    {
+        /* slow path when MV goes past left edge: one pixel at a time */
+        int total = 0;
+        for( int row = 0; row < 8; row++ )
+        {
+            for( int col = 0; col < 8; col++ )
+            {
+                uint e = read_imageui( fenc, sampler, fencpos + (int2)(col, row) ).s0;
+                uint r = read_imageui( fref, sampler, frefpos + (int2)(col, row) ).s0;
+                total += abs_diff( e, r );
+            }
+        }
+        return total;
+    }
+}
+/*
+ * hierarchical motion estimation
+ *
+ * Each kernel launch is a single iteration
+ *
+ * MB per work group is determined by lclx / 4 * lcly
+ *
+ * global launch dimensions: [mb_width * 4, mb_height]
+ */
+kernel void hierarchical_motion( read_only image2d_t fenc,
+                                 read_only image2d_t fref,
+                                 const global short2 *in_mvs,
+                                 global short2 *out_mvs,
+                                 global int16_t *out_mv_costs,
+                                 global short2 *mvp_buffer,
+                                 local int16_t *cost_local,
+                                 local short2 *mvc_local,
+                                 int mb_width,
+                                 int lambda,
+                                 int me_range,
+                                 int scale,
+                                 int b_shift_index,
+                                 int b_first_iteration,
+                                 int b_reverse_references )
+{
+    /* four consecutive work-items (mb_i 0-3) cooperate on one macroblock */
+    int mb_x = get_global_id( 0 ) >> 2;
+    if( mb_x >= mb_width )
+        return;
+    int mb_height = get_global_size( 1 );
+    int mb_i = get_global_id( 0 ) & 3;
+    int mb_y = get_global_id( 1 );
+    int mb_xy = mb_y * mb_width + mb_x;
+    const int mb_size = 8;
+    int2 coord = (int2)(mb_x, mb_y) * mb_size;
+
+    /* point cost_local/mvc_local at this MB's private 4-entry slices */
+    const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
+    cost_local += 4 * mb_in_group;
+
+    int i_mvc = 0;
+    mvc_local += 4 * mb_in_group;
+    mvc_local[mb_i] = 0;
+    int2 mvp =0;
+
+    if( !b_first_iteration )
+    {
+/* Gather one neighbour MV candidate.  When b_shift_index, candidates come from
+ * the coarser (half-resolution) level's output; MVs are rescaled by `scale`. */
+#define MVC( DX, DY )\
+    {\
+        int px = mb_x + DX;\
+        int py = mb_y + DY;\
+        mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \
+                           in_mvs[mb_width * py + px];\
+        mvc_local[i_mvc] >>= (short) scale;\
+        i_mvc++;\
+    }
+        /* Find MVP from median of MVCs */
+        if( b_reverse_references )
+        {
+            /* odd iterations: derive MVP from down and right */
+            if( mb_x < mb_width - 1 )
+                MVC( 1, 0 );
+            if( mb_y < mb_height - 1 )
+            {
+                MVC( 0, 1 );
+                if( mb_x > b_shift_index )
+                    MVC( -1, 1 );
+                if( mb_x < mb_width - 1 )
+                    MVC( 1, 1 );
+            }
+        }
+        else
+        {
+            /* even iterations: derive MVP from up and left */
+            if( mb_x > 0 )
+                MVC( -1, 0 );
+            if( mb_y > 0 )
+            {
+                MVC( 0, -1 );
+                if( mb_x < mb_width - 1 )
+                    MVC( 1, -1 );
+                if( mb_x > b_shift_index )
+                    MVC( -1, -1 );
+            }
+        }
+#undef MVC
+        /* with 0 or 1 candidates use it directly, otherwise the component median */
+        mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
+    }
+    /* If the current MVP matches the previous iteration's MVP and the scale has
+     * not changed, we know we will arrive at the same MV again, so just copy
+     * the previous result to our output. */
+    if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y )
+    {
+        out_mvs[mb_xy] = in_mvs[mb_xy];
+        return;
+    }
+    mvp_buffer[mb_xy] = convert_short2_sat(mvp);
+    /* allow MVs to reach up to 4 pixels beyond the frame edge */
+    int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4;
+    int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4;
+
+    int2 bestmv = clamp(mvp, mv_min, mv_max);
+    int2 refcrd = coord + bestmv;
+
+    /* measure cost at bestmv: cooperative SAD plus lambda-weighted MV bits
+     * (the shift converts to quarter-pel at full lowres resolution) */
+    int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
+                lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) );
+
+    /* diamond refinement: each of the 4 threads tries one diamond offset */
+    do
+    {
+        /* measure costs at offsets from bestmv */
+        refcrd = coord + bestmv + dia_offs[mb_i];
+        int2 trymv = bestmv + dia_offs[mb_i];
+        int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
+                   lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );
+
+        /* pack the offset index into the two low bits so a plain min() also
+         * selects the winning direction.
+         * NOTE(review): cost_local is int16_t, so (cost<<2) assumes cost fits
+         * in 13 bits -- larger SAD+lambda costs would wrap; confirm headroom.
+         * Also relies on the 4 threads being in lockstep (no barrier). */
+        cost_local[mb_i] = (cost<<2) | mb_i;
+        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
+
+        if( (cost >> 2) >= bcost )
+            break;
+
+        bestmv += dia_offs[cost&3];
+        bcost = cost>>2;
+
+        if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
+            break;
+    }
+    while( --me_range > 0 );
+
+    int2 trymv = 0, diff = 0;
+
+/* Re-test a candidate MV only when it differs from the MVP by more than one
+ * pel in either axis (closer candidates were covered by the diamond search). */
+#define COST_MV_NO_PAD( L )\
+    trymv = clamp( trymv, mv_min, mv_max );\
+    diff = convert_int2_sat(abs_diff( mvp, trymv ));\
+    if( diff.x > 1 || diff.y > 1 ) {\
+        int2 refcrd = coord + trymv;\
+        int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
+                   L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\
+        if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
+
+    /* zero MV, tested without a lambda penalty */
+    COST_MV_NO_PAD( 0 );
+
+    if( !b_first_iteration )
+    {
+        /* try cost at previous iteration's MV, if MVP was too far away */
+        int2 prevmv = b_shift_index ? convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]);
+        prevmv >>= scale;
+        trymv = prevmv;
+        COST_MV_NO_PAD( lambda );
+    }
+
+    for( int i = 0; i < i_mvc; i++ )
+    {
+        /* try cost at each candidate MV, if MVP was too far away */
+        trymv = convert_int2_sat( mvc_local[i] );
+        COST_MV_NO_PAD( lambda );
+    }
+
+    /* lane 0 publishes the result, rescaled back to this level's units */
+    if( mb_i == 0 )
+    {
+        bestmv <<= scale;
+        out_mvs[mb_xy] = convert_short2_sat(bestmv);
+        out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
+    }
+}
--- /dev/null
+/* OpenCL lowres subpel Refine */
+
+/* Each thread performs an 8x8 SAD at one half-pel position; 4 threads per MB
+ * cover the 4 DIA HPEL offsets simultaneously.
+ *
+ * Each 32-bit texel of fref_planes packs one byte from each of the four HPEL
+ * planes; hpel_idx (from the half-pel bits of qpos) selects the byte via
+ * mask_shift.  qpos is in quarter-pel units, so >>2 yields the full-pel
+ * texel coordinate. */
+int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
+{
+    int2 frefpos = qpos >> 2;
+    int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
+    uint mask_shift = 8 * hpel_idx;
+
+    uint4 cost4 = 0;
+
+    for( int y = 0; y < 8; y++ )
+    {
+        /* two 4-wide accumulations per row: unpack the selected plane byte
+         * from each reference texel, then SAD against the encode pixels */
+        uint4 enc, val4;
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
+        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF;
+        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF;
+        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF;
+        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF;
+        cost4 += abs_diff( enc, val4 );
+
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
+        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF;
+        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF;
+        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF;
+        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF;
+        cost4 += abs_diff( enc, val4 );
+    }
+
+    return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
+}
+
+/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
+int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
+{
+ int2 frefApos = qpos >> 2;
+ int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
+
+ int2 qposB = qpos + ((qpos & 1) << 1);
+ int2 frefBpos = qposB >> 2;
+ int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
+
+ uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
+
+ int cost = 0;
+
+ for( int y = 0; y < 8; y++ )
+ {
+ for( int x = 0; x < 8; x++ )
+ {
+ uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
+ uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF;
+ uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF;
+ cost += abs_diff( enc, rhadd( vA, vB ) );
+ }
+ }
+
+ return cost;
+}
+
+/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
+ *
+ * Each thread collects 1/4 of the rows of diffs and processes one quarter of
+ * the transforms
+ */
+int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc,
+ int2 fencpos,
+ read_only image2d_t fref_planes,
+ int2 qpos,
+ local sum2_t *tmpp,
+ int idx )
+{
+ volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
+ sum2_t b0, b1, b2, b3;
+
+ // fencpos is full-pel position of original MB
+ // qpos is qpel position within reference frame
+ int2 frefApos = qpos >> 2;
+ int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);
+
+ int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
+ int2 frefBpos = qposB >> 2;
+ int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);
+
+ uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
+
+ uint vA, vB;
+ uint a0, a1;
+ uint enc;
+ sum2_t sum = 0;
+
+#define READ_DIFF( OUT, X )\
+ enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
+ vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\
+ vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\
+ OUT = enc - rhadd( vA, vB );
+
+#define READ_DIFF_EX( OUT, a, b )\
+ {\
+ READ_DIFF( a0, a );\
+ READ_DIFF( a1, b );\
+ OUT = a0 + (a1<<BITS_PER_SUM);\
+ }
+#define ROW_8x4_SATD( a, b )\
+ {\
+ fencpos.y += a;\
+ frefApos.y += b;\
+ frefBpos.y += b;\
+ READ_DIFF_EX( b0, 0, 4 );\
+ READ_DIFF_EX( b1, 1, 5 );\
+ READ_DIFF_EX( b2, 2, 6 );\
+ READ_DIFF_EX( b3, 3, 7 );\
+ HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
+ HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
+ sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
+ }
+ ROW_8x4_SATD( 0, 0 );
+ ROW_8x4_SATD( 4, 4 );
+
+#undef READ_DIFF
+#undef READ_DIFF_EX
+#undef ROW_8x4_SATD
+ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
+}
+
+constant int2 hpoffs[4] =
+{
+ {0, -2}, {-2, 0}, {2, 0}, {0, 2}
+};
+
+/* sub pixel refinement of motion vectors, output MVs and costs are moved from
+ * temporary buffers into the final per-frame buffer
+ *
+ * global launch dimensions: [mb_width * 4, mb_height]
+ *
+ * With X being the source 16x16 pixels, F is the lowres pixel used by the
+ * motion search. We will now utilize the H, V, and C pixels (stored in separate
+ * planes) to search at half-pel increments.
+ *
+ * X X X X X X
+ * F H F H F
+ * X X X X X X
+ * V C V C V
+ * X X X X X X
+ * F H F H F
+ * X X X X X X
+ *
+ * The YX HPEL bits of the motion vector selects the plane we search in. The
+ * four planes are packed in the fref_planes 2D image buffer. Each sample
+ * returns: s0 = F, s1 = H, s2 = V, s3 = C */
+kernel void subpel_refine( read_only image2d_t fenc,
+ read_only image2d_t fref_planes,
+ const global short2 *in_mvs,
+ const global int16_t *in_sad_mv_costs,
+ local int16_t *cost_local,
+ local sum2_t *satd_local,
+ local short2 *mvc_local,
+ global short2 *fenc_lowres_mv,
+ global int16_t *fenc_lowres_mv_costs,
+ int mb_width,
+ int lambda,
+ int b,
+ int ref,
+ int b_islist1 )
+{
+ int mb_x = get_global_id( 0 ) >> 2;
+ if( mb_x >= mb_width )
+ return;
+ int mb_height = get_global_size( 1 );
+
+ int mb_i = get_global_id( 0 ) & 3;
+ int mb_y = get_global_id( 1 );
+ int mb_xy = mb_y * mb_width + mb_x;
+
+ /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that
+ * hold many frames worth of motion vectors. We must offset into the correct
+ * location for this frame's vectors. The kernel will be passed the correct
+ * directional buffer for the direction of the search: list1 or list0
+ *
+ * CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
+ * GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */
+ fenc_lowres_mv += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
+ fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
+
+ /* Adjust pointers into local memory buffers for this thread's data */
+ int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
+ cost_local += mb_in_group * 4;
+ satd_local += mb_in_group * 16;
+ mvc_local += mb_in_group * 4;
+
+ int i_mvc = 0;
+
+ mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0;
+
+#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
+ if( mb_x > 0 )
+ MVC( -1, 0 );
+ if( mb_y > 0 )
+ {
+ MVC( 0, -1 );
+ if( mb_x < mb_width - 1 )
+ MVC( 1, -1 );
+ if( mb_x > 0 )
+ MVC( -1, -1 );
+ }
+#undef MVC
+ int2 mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
+
+ int bcost = in_sad_mv_costs[mb_xy];
+ int2 coord = (int2)(mb_x, mb_y) << 3;
+ int2 bmv = convert_int2_sat( in_mvs[mb_xy] );
+
+ /* Make mvp and bmv QPEL MV */
+ mvp <<= 2; bmv <<= 2;
+
+#define HPEL_QPEL( ARR, FUNC )\
+ {\
+ int2 trymv = bmv + ARR[mb_i];\
+ int2 qpos = (coord << 2) + trymv;\
+ int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\
+ cost_local[mb_i] = (cost<<2) + mb_i;\
+ cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
+ if( (cost>>2) < bcost )\
+ {\
+ bmv += ARR[cost&3];\
+ bcost = cost>>2;\
+ }\
+ }
+
+ HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
+ HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
+ fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv );
+
+ /* remeasure cost of bmv using SATD */
+ int2 qpos = (coord << 2) + bmv;
+ cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
+ bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
+ bcost += lambda * mv_cost( abs_diff( bmv, mvp ) );
+
+ fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
+}
--- /dev/null
+/* Weightp filter a downscaled image into a temporary output buffer.
+ * This kernel is launched once for each scale.
+ *
+ * Launch dimensions: width x height (in pixels)
+ */
+kernel void weightp_scaled_images( read_only image2d_t in_plane,
+ write_only image2d_t out_plane,
+ uint offset,
+ uint scale,
+ uint denom )
+{
+ int gx = get_global_id( 0 );
+ int gy = get_global_id( 1 );
+ uint4 input_val;
+ uint4 output_val;
+
+ input_val = read_imageui( in_plane, sampler, (int2)(gx, gy));
+ output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) );
+ write_imageui( out_plane, (int2)(gx, gy), output_val );
+}
+
+/* Weightp filter for the half-pel interpolated image
+ *
+ * Launch dimensions: width x height (in pixels)
+ */
+kernel void weightp_hpel( read_only image2d_t in_plane,
+ write_only image2d_t out_plane,
+ uint offset,
+ uint scale,
+ uint denom )
+{
+ int gx = get_global_id( 0 );
+ int gy = get_global_id( 1 );
+ uint input_val;
+ uint output_val;
+
+ input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0;
+ //Unpack
+ uint4 temp;
+ temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff;
+ temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff;
+
+ temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) );
+
+ //Pack
+ output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24);
+ write_imageui( out_plane, (int2)(gx, gy), output_val );
+}
--- /dev/null
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+
+constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+/* 7.18.1.1 Exact-width integer types */
+typedef signed char int8_t;
+typedef unsigned char uint8_t;
+typedef short int16_t;
+typedef unsigned short uint16_t;
+typedef int int32_t;
+typedef unsigned uint32_t;
+
+typedef uint8_t pixel;
+typedef uint16_t sum_t;
+typedef uint32_t sum2_t;
+
+#define LOWRES_COST_MASK ((1<<14)-1)
+#define LOWRES_COST_SHIFT 14
+#define COST_MAX (1<<28)
+
+#define PIXEL_MAX 255
+#define BITS_PER_SUM (8 * sizeof(sum_t))
+
+/* Constants for offsets into frame statistics buffer */
+#define COST_EST 0
+#define COST_EST_AQ 1
+#define INTRA_MBS 2
+
+#define COPY2_IF_LT( x, y, a, b )\
+ if((y)<(x))\
+ {\
+ (x) = (y);\
+ (a) = (b);\
+ }
+
+constant int2 dia_offs[4] =
+{
+ {0, -1}, {-1, 0}, {1, 0}, {0, 1},
+};
+
+inline pixel x264_clip_pixel( int x )
+{
+ return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX );
+}
+
+inline int2 x264_median_mv( short2 a, short2 b, short2 c )
+{
+ short2 t1 = min(a, b);
+ short2 t2 = min(max(a, b), c);
+ return convert_int2(max(t1, t2));
+}
+
+inline sum2_t abs2( sum2_t a )
+{
+ sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
+ return (a + s) ^ s;
+}
+
+#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
+ sum2_t t0 = s0 + s1;\
+ sum2_t t1 = s0 - s1;\
+ sum2_t t2 = s2 + s3;\
+ sum2_t t3 = s2 - s3;\
+ d0 = t0 + t2;\
+ d2 = t0 - t2;\
+ d1 = t1 + t3;\
+ d3 = t1 - t3;\
+}
+
+#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
+ int2 t0 = s0 + s1;\
+ int2 t1 = s0 - s1;\
+ int2 t2 = s2 + s3;\
+ int2 t3 = s2 - s3;\
+ d0 = t0 + t2;\
+ d2 = t0 - t2;\
+ d1 = t1 + t3;\
+ d3 = t1 - t3;\
+}
+
+#define SATD_C_8x4_Q( name, q1, q2 )\
+ int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
+ {\
+ sum2_t tmp[4][4];\
+ sum2_t a0, a1, a2, a3;\
+ sum2_t sum = 0;\
+ for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
+ {\
+ a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
+ a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
+ a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
+ a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
+ HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
+ }\
+ for( int i = 0; i < 4; i++ )\
+ {\
+ HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
+ sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
+ }\
+ return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
+ }
+
+/*
+ * Utility function to perform a parallel sum reduction of an array of integers
+ */
+int parallel_sum( int value, int x, volatile local int *array )
+{
+ array[x] = value;
+ barrier( CLK_LOCAL_MEM_FENCE );
+
+ int dim = get_local_size( 0 );
+
+ while( dim > 1 )
+ {
+ dim >>= 1;
+
+ if( x < dim )
+ array[x] += array[x + dim];
+
+ if( dim > 32 )
+ barrier( CLK_LOCAL_MEM_FENCE );
+ }
+
+ return array[0];
+}
+
+int mv_cost( uint2 mvd )
+{
+ float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f;
+ float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) );
+ return (int) (cost.x + cost.y);
+}
--system-libx264 use system libx264 instead of internal
--enable-shared build shared library
--enable-static build static library
+ --disable-opencl disable OpenCL features
--disable-gpl disable GPL-only features
--disable-thread disable multithreaded encoding
--enable-win32thread use win32threads (windows only)
bit_depth="8"
chroma_format="all"
compiler="GNU"
+opencl="yes"
CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
LDFLAGS="$LDFLAGS"
--host=*)
host="$optarg"
;;
+ --disable-opencl)
+ opencl="no"
+ ;;
--cross-prefix=*)
cross_prefix="$optarg"
;;
if [ "$bit_depth" -gt "8" ]; then
define HIGH_BIT_DEPTH
ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1"
+ opencl="no"
else
ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0"
fi
PROF_USE_LD=$PROF_USE_LD
EOF
+if [[ $host_os != mingw* ]]; then
+ # OpenCL support is only well tested on Windows/MinGW. If you
+ # wish to try it on an unsupported platform, swap the lines
+ # below. If OpenCL breaks, you get to keep both halves
+ #opencl="yes"
+ opencl="no"
+fi
+if [ "$opencl" = "yes" ]; then
+ log_check "looking for perl"
+ output=$(perl -v)
+ if [ "$output" = "" ]; then
+ echo 'OpenCL support requires perl to compile.'
+ echo 'use --disable-opencl to compile without OpenCL.'
+ exit 1
+ elif [[ $cross_prefix != "" && $host_os == mingw* ]] ; then
+ if cc_check "CL/cl.h" "-lOpenCL"; then
+ echo 'HAVE_OPENCL=yes' >> config.mak
+ echo 'OPENCL_LIB=OpenCL' >> config.mak
+ echo "OPENCL_INC_DIR=." >> config.mak
+ echo "OPENCL_LIB_DIR=." >> config.mak
+ define HAVE_OPENCL
+ else
+ opencl="no"
+ fi
+ elif [ "$CUDA_PATH" != "" ]; then
+ echo 'HAVE_OPENCL=yes' >> config.mak
+ echo 'OPENCL_LIB=OpenCL' >> config.mak
+ echo 'OPENCL_INC_DIR=$(CUDA_PATH)include' >> config.mak
+ if [ "$ARCH" = "X86" ]; then
+ echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/Win32' >> config.mak
+ else
+ echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/x64' >> config.mak
+ fi
+ define HAVE_OPENCL
+ elif [ -e "$AMDAPPSDKROOT/include/CL/cl.h" ]; then
+ if [[ $host_os = mingw* ]]; then
+ app_path=`echo "/$AMDAPPSDKROOT" | sed 's/\\\/\//g' | sed 's/://'`
+ else
+ app_path='$(AMDAPPSDKROOT)'
+ fi
+ echo 'HAVE_OPENCL=yes' >> config.mak
+ echo 'OPENCL_LIB=OpenCL' >> config.mak
+ echo OPENCL_INC_DIR=$app_path/include >> config.mak
+ if [ "$ARCH" = "X86" ]; then
+ echo OPENCL_LIB_DIR=$app_path/lib/x86 >> config.mak
+ else
+ echo OPENCL_LIB_DIR=$app_path/lib/x86_64 >> config.mak
+ fi
+ define HAVE_OPENCL
+ else
+ opencl="no"
+ fi
+fi
+
if [ $compiler = ICL ]; then
echo '%.o: %.c' >> config.mak
echo ' $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
visualize: $vis
bit depth: $bit_depth
chroma format: $chroma_format
+opencl: $opencl
EOF
echo >> config.log
#include "common/visualize.h"
#endif
+#if HAVE_OPENCL
+#include "common/opencl.h"
+#endif
+
//#define DEBUG_MB_TYPE
#define bs_write_ue bs_write_ue_big
if( h->i_thread_frames > 1 )
h->param.nalu_process = NULL;
+ if( h->param.b_opencl )
+ {
+#if !HAVE_OPENCL
+ x264_log( h, X264_LOG_WARNING, "OpenCL: not compiled with OpenCL support, disabling\n" );
+ h->param.b_opencl = 0;
+#elif BIT_DEPTH > 8
+ x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" );
+ h->param.b_opencl = 0;
+#else
+ if( h->param.i_width < 32 || h->param.i_height < 32 )
+ {
+ x264_log( h, X264_LOG_WARNING, "OpenCL: frame size is too small, disabling opencl\n" );
+ h->param.b_opencl = 0;
+ }
+#endif
+ if( h->param.opencl_device_id && h->param.i_opencl_device )
+ {
+ x264_log( h, X264_LOG_WARNING, "OpenCL: device id and device skip count configured; dropping skip\n" );
+ h->param.i_opencl_device = 0;
+ }
+ }
+
h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
if( h->param.i_keyint_max == 1 )
{
BOOLIFY( b_open_gop );
BOOLIFY( b_bluray_compat );
BOOLIFY( b_full_recon );
+ BOOLIFY( b_opencl );
BOOLIFY( analyse.b_transform_8x8 );
BOOLIFY( analyse.b_weighted_bipred );
BOOLIFY( analyse.b_chroma_me );
goto fail;
}
+#if HAVE_OPENCL
+ if( h->param.b_opencl && x264_opencl_init( h ) < 0 )
+ h->param.b_opencl = 0;
+#endif
+
if( x264_lookahead_init( h, i_slicetype_length ) )
goto fail;
int i_nal_type, i_nal_ref_idc, i_global_qp;
int overhead = NALU_OVERHEAD;
+#if HAVE_OPENCL
+ if( h->opencl.b_fatal_error )
+ return -1;
+#endif
+
if( h->i_thread_frames > 1 )
{
thread_prev = h->thread[ h->i_thread_phase ];
|| h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
|| h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
+#if HAVE_OPENCL
+ x264_opencl_free( h );
+#endif
+
x264_lookahead_delete( h );
if( h->param.b_sliced_threads )
--- /dev/null
+/*****************************************************************************
+ * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "macroblock.h"
+#include "me.h"
+
+#if HAVE_OPENCL
+#if _WIN32
+#include <windows.h>
+#endif
+
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
+
+/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
+ * in the OpenCL headers shipped with NVIDIA drivers. We need to be
+ * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
+#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
+
+#define OCLCHECK( method, ... )\
+ status = method( __VA_ARGS__ );\
+ if( status != CL_SUCCESS ) {\
+ h->param.b_opencl = 0;\
+ h->opencl.b_fatal_error = 1;\
+ x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
+ return status;\
+ }
+
+void x264_opencl_flush( x264_t *h )
+{
+ clFinish( h->opencl.queue );
+
+ /* Finish copies from the GPU by copying from the page-locked buffer to
+ * their final destination */
+ for( int i = 0; i < h->opencl.num_copies; i++ )
+ memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
+ h->opencl.num_copies = 0;
+ h->opencl.pl_occupancy = 0;
+}
+
+static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
+{
+ if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
+ x264_opencl_flush( h );
+ assert( bytes < PAGE_LOCKED_BUF_SIZE );
+ char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
+ h->opencl.pl_occupancy += bytes;
+ return ptr;
+}
+
+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
+{
+ if( fenc->b_intra_calculated )
+ return 0;
+ fenc->b_intra_calculated = 1;
+
+ int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
+
+#define CREATEBUF( out, flags, size )\
+ out = clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
+ if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
+#define CREATEIMAGE( out, flags, pf, width, height )\
+ out = clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
+ if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
+
+ int mb_count = h->mb.i_mb_count;
+ cl_int status;
+
+ if( !h->opencl.lowres_mv_costs )
+ {
+ /* Allocate shared memory buffers */
+ int width = h->mb.i_mb_width * 8 * sizeof(pixel);
+ int height = h->mb.i_mb_height * 8 * sizeof(pixel);
+
+ cl_image_format pixel_format;
+ pixel_format.image_channel_order = CL_R;
+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
+ CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
+
+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+ {
+ pixel_format.image_channel_order = CL_RGBA;
+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
+ CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
+ width >>= 1;
+ height >>= 1;
+ }
+
+ CREATEBUF( h->opencl.lowres_mv_costs, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
+ CREATEBUF( h->opencl.lowres_costs[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
+ CREATEBUF( h->opencl.lowres_costs[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
+ CREATEBUF( h->opencl.mv_buffers[0], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
+ CREATEBUF( h->opencl.mv_buffers[1], CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
+ CREATEBUF( h->opencl.mvp_buffer, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
+ CREATEBUF( h->opencl.frame_stats[0], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
+ CREATEBUF( h->opencl.frame_stats[1], CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
+ CREATEBUF( h->opencl.row_satds[0], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
+ CREATEBUF( h->opencl.row_satds[1], CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
+ CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY, luma_length );
+ CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY, luma_length );
+ }
+
+ if( !fenc->opencl.intra_cost )
+ {
+ /* Allocate per-frame buffers */
+ int width = h->mb.i_mb_width * 8 * sizeof(pixel);
+ int height = h->mb.i_mb_height * 8 * sizeof(pixel);
+
+ cl_image_format pixel_format;
+ pixel_format.image_channel_order = CL_R;
+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
+ CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
+
+ for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+ {
+ pixel_format.image_channel_order = CL_RGBA;
+ pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
+ CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
+ width >>= 1;
+ height >>= 1;
+ }
+ CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY, mb_count * sizeof(int16_t) );
+ CREATEBUF( fenc->opencl.intra_cost, CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
+ CREATEBUF( fenc->opencl.lowres_mvs0, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
+ CREATEBUF( fenc->opencl.lowres_mvs1, CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
+ CREATEBUF( fenc->opencl.lowres_mv_costs0, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
+ CREATEBUF( fenc->opencl.lowres_mv_costs1, CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
+ }
+#undef CREATEBUF
+
+ /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
+
+ char *locked = x264_opencl_alloc_locked( h, luma_length );
+ memcpy( locked, fenc->plane[0], luma_length );
+ OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
+
+ size_t gdim[2];
+ if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
+ {
+ int size = h->mb.i_mb_count * sizeof(int16_t);
+ locked = x264_opencl_alloc_locked( h, size );
+ memcpy( locked, fenc->i_inv_qscale_factor, size );
+ OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
+ }
+ else
+ {
+ /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
+ cl_uint arg = 0;
+ int16_t value = 256;
+ OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
+ OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
+ gdim[0] = h->mb.i_mb_count;
+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
+ }
+
+ int stride = fenc->i_stride[0];
+ cl_uint arg = 0;
+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
+ OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
+ gdim[0] = 8 * h->mb.i_mb_width;
+ gdim[1] = 8 * h->mb.i_mb_height;
+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
+
+ for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
+ {
+ /* Workaround for AMD Southern Island:
+ *
+ * Alternate kernel instances. No perf impact to this, so we do it for
+ * all GPUs. It prevents the same kernel from being enqueued
+ * back-to-back, avoiding a dependency calculation bug in the driver.
+ */
+ cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
+
+ arg = 0;
+ OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
+ OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
+ gdim[0] >>= 1;
+ gdim[1] >>= 1;
+ if( gdim[0] < 16 || gdim[1] < 16 )
+ break;
+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
+ }
+
+ size_t ldim[2];
+ gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
+ gdim[1] = 8*h->mb.i_mb_height;
+ ldim[0] = 32;
+ ldim[1] = 8;
+ arg = 0;
+
+ /* For presets slow, slower, and placebo, check all 10 intra modes that the
+ * C lookahead supports. For faster presets, only check the most frequent 8
+ * modes
+ */
+ int slow = h->param.analyse.i_subpel_refine > 7;
+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+ OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
+
+ gdim[0] = 256;
+ gdim[1] = h->mb.i_mb_height;
+ ldim[0] = 256;
+ ldim[1] = 1;
+ arg = 0;
+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+ OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+ OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
+
+ if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
+ x264_opencl_flush( h );
+
+ int size = h->mb.i_mb_count * sizeof(int16_t);
+ locked = x264_opencl_alloc_locked( h, size );
+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
+ h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
+ h->opencl.copies[h->opencl.num_copies].src = locked;
+ h->opencl.copies[h->opencl.num_copies].bytes = size;
+ h->opencl.num_copies++;
+
+ size = h->mb.i_mb_height * sizeof(int);
+ locked = x264_opencl_alloc_locked( h, size );
+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+ h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
+ h->opencl.copies[h->opencl.num_copies].src = locked;
+ h->opencl.copies[h->opencl.num_copies].bytes = size;
+ h->opencl.num_copies++;
+
+ size = sizeof(int) * 4;
+ locked = x264_opencl_alloc_locked( h, size );
+ OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
+ h->opencl.copies[h->opencl.num_copies].src = locked;
+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+ h->opencl.num_copies++;
+ h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
+ h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
+ h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+ h->opencl.num_copies++;
+
+ h->opencl.last_buf = !h->opencl.last_buf;
+ return 0;
+}
+
+/* This function was tested empirically on a number of AMD and NV GPUs. Making a
+ * function which returns perfect launch dimensions is impossible; some
+ * applications will have self-tuning code to try many possible variables and
+ * measure the runtime. Here we simply make an educated guess based on what we
+ * know GPUs typically prefer. */
+static void x264_optimal_launch_dims( size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
+{
+ size_t max_work_group = 256; /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
+ size_t preferred_multiple = 64;
+ cl_uint num_cus = 6;
+
+ clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
+ clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
+ clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );
+
+ ldims[0] = preferred_multiple;
+ ldims[1] = 8;
+
+ /* make ldims[1] an even divisor of gdims[1] */
+ while( gdims[1] & (ldims[1] - 1) )
+ {
+ ldims[0] <<= 1;
+ ldims[1] >>= 1;
+ }
+ /* make total ldims fit under the max work-group dimensions for the device */
+ while( ldims[0] * ldims[1] > max_work_group )
+ {
+ if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
+ ldims[1] >>= 1;
+ else
+ ldims[0] >>= 1;
+ }
+
+ if( ldims[0] > gdims[0] )
+ {
+ /* remove preferred multiples until we're close to gdims[0] */
+ while( gdims[0] + preferred_multiple < ldims[0] )
+ ldims[0] -= preferred_multiple;
+ gdims[0] = ldims[0];
+ }
+ else
+ {
+ /* make gdims an even multiple of ldims */
+ gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
+ gdims[0] *= ldims[0];
+ }
+
+ /* make ldims smaller to spread work across compute units */
+ while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
+ {
+ if( ldims[0] > preferred_multiple )
+ ldims[0] >>= 1;
+ else if( ldims[1] > 1 )
+ ldims[1] >>= 1;
+ else
+ break;
+ }
+ /* for smaller GPUs, try not to abuse their texture cache */
+ if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
+ ldims[0] = 32;
+}
+
+/* Hierarchical (image pyramid) lowres motion search, host side.
+ * Searches frame b against frame ref (list1 when b_islist1), optionally
+ * running the weighted-prediction filter kernels first, then subpel-refines
+ * and queues an asynchronous read-back of the MV field into
+ * fenc->lowres_mvs.  Returns 0 on success; the OCLCHECK macros perform an
+ * early error return on any OpenCL failure (uses the local 'status'). */
+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
+{
+    x264_frame_t *fenc = frames[b];
+    x264_frame_t *fref = frames[ref];
+
+    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
+    cl_mem ref_luma_hpel;
+    cl_int status;
+
+    if( w && w->weightfn )
+    {
+        size_t gdims[2];
+
+        gdims[0] = 8 * h->mb.i_mb_width;
+        gdims[1] = 8 * h->mb.i_mb_height;
+
+        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+        {
+            cl_uint arg = 0;
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
+            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
+
+            /* each pyramid level is half the dimensions of the previous one */
+            gdims[0] >>= 1;
+            gdims[1] >>= 1;
+            if( gdims[0] < 16 || gdims[1] < 16 )
+                break;
+        }
+
+        cl_uint arg = 0;
+        gdims[0] = 8 * h->mb.i_mb_width;
+        gdims[1] = 8 * h->mb.i_mb_height;
+
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
+        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
+
+        /* Use weighted reference planes for motion search */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
+        ref_luma_hpel = h->opencl.weighted_luma_hpel;
+    }
+    else
+    {
+        /* Use unweighted reference planes for motion search */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
+        ref_luma_hpel = fref->opencl.luma_hpel;
+    }
+
+    /* more refinement iterations at the finer pyramid levels */
+    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
+    int b_first_iteration = 1;
+    int b_reverse_references = 1;
+    int A = 1;  /* ping-pong index into h->opencl.mv_buffers[] */
+
+
+    int mb_per_group = 0;
+    int cost_local_size = 0;
+    int mvc_local_size = 0;
+    int mb_width;
+
+    size_t gdims[2];
+    size_t ldims[2];
+
+    /* scale 0 is 8x8 */
+    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
+    {
+        mb_width = h->mb.i_mb_width >> scale;
+        gdims[0] = mb_width;
+        gdims[1] = h->mb.i_mb_height >> scale;
+        /* NOTE(review): if every scale is skipped here (extremely small
+         * frame), ldims/gdims reach the subpel launch below uninitialized --
+         * confirm x264's minimum frame size guarantees scale 0 always runs. */
+        if( gdims[0] < 2 || gdims[1] < 2 )
+            continue;
+        /* 4 work-items cooperate on each macroblock */
+        gdims[0] <<= 2;
+        x264_optimal_launch_dims( gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
+
+        mb_per_group = (ldims[0] >> 2) * ldims[1];
+        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
+        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
+        int scaled_me_range = h->param.analyse.i_me_range >> scale;
+        int b_shift_index = 1;
+
+        cl_uint arg = 0;
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
+
+        for( int iter = 0; iter < num_iterations[scale]; iter++ )
+        {
+            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
+
+            b_shift_index = 0;
+            b_first_iteration = 0;
+
+            /* alternate top-left vs bot-right MB references at lower scales, so
+             * motion field smooths more quickly. */
+            if( scale > 2 )
+                b_reverse_references ^= 1;
+            else
+                b_reverse_references = 0;
+            A = !A;
+            /* indices 2 and 3 are the ping-pong MV buffers set above; arg-3,
+             * arg-2, arg-1 address the three trailing int args (b_shift_index,
+             * b_first_iteration, b_reverse_references) without re-setting the
+             * whole argument list. */
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
+        }
+    }
+
+    /* Subpel refinement at full lowres resolution, reusing the gdims/ldims
+     * and mb_per_group left over from the scale-0 pass above. */
+    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
+    cl_uint arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
+
+    if( b_islist1 )
+    {
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
+    }
+    else
+    {
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
+    }
+
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
+
+    if( h->opencl.b_device_AMD_SI )
+    {
+        /* workaround for AMD Southern Island driver scheduling bug (fixed in
+         * July 2012), perform meaningless small copy to add a data dependency */
+        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
+    }
+
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
+
+    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
+
+    /* flush before the pending-copy table overflows */
+    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
+        x264_opencl_flush( h );
+
+    /* queue a non-blocking read of the MV field into pinned memory; the
+     * copies[] entry records where to memcpy it once the queue is finished */
+    char *locked = x264_opencl_alloc_locked( h, mvlen );
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
+
+    if( b_islist1 )
+    {
+        int mvs_offset = mvlen * (ref - b - 1);
+        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
+        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
+    }
+    else
+    {
+        int mvs_offset = mvlen * (b - ref - 1);
+        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
+        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
+    }
+
+    h->opencl.num_copies++;
+
+    return 0;
+}
+
+/* Run the mode-select and row-sum kernels for the (p0,p1,b) cost estimate,
+ * then queue asynchronous read-backs of the per-MB costs, per-row SATDs and
+ * frame statistics.  Returns 0 on success; OCLCHECK returns early on any
+ * OpenCL error.  Results land in fenc's cost fields only after the queued
+ * copies are completed by a later x264_opencl_flush(). */
+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
+{
+    cl_int status;
+    x264_frame_t *fenc = frames[b];
+    x264_frame_t *fref0 = frames[p0];
+    x264_frame_t *fref1 = frames[p1];
+
+    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
+
+    /* Tasks for this kernel:
+     * 1. Select least cost mode (intra, ref0, ref1)
+     *    list_used 0, 1, 2, or 3.  if B frame, do not allow intra
+     * 2. if B frame, try bidir predictions.
+     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
+    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
+    size_t ldim_bidir[2];
+    size_t *ldims = NULL;  /* NULL ldims lets the driver choose (P frames) */
+    int cost_local_size = 4;
+    int satd_local_size = 4;
+    if( b < p1 )
+    {
+        /* For B frames, use 4 threads per MB for BIDIR checks */
+        ldims = ldim_bidir;
+        gdims[0] <<= 2;
+        x264_optimal_launch_dims( gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
+        int mb_per_group = (ldims[0] >> 2) * ldims[1];
+        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
+        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
+    }
+
+    cl_uint arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
+
+    /* Sum costs across rows, atomicAdd down frame */
+    size_t gdim[2] = { 256, h->mb.i_mb_height };
+    size_t ldim[2] = { 256, 1 };
+
+    arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
+
+    /* up to 4 copies are queued below; flush first if the table is near full */
+    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
+        x264_opencl_flush( h );
+
+    int size = h->mb.i_mb_count * sizeof(int16_t);
+    char *locked = x264_opencl_alloc_locked( h, size );
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.num_copies++;
+
+    size = h->mb.i_mb_height * sizeof(int);
+    locked = x264_opencl_alloc_locked( h, size );
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.num_copies++;
+
+    size = 4 * sizeof(int);
+    locked = x264_opencl_alloc_locked( h, size );
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    /* flip the double buffer so the next call does not race these reads */
+    h->opencl.last_buf = !h->opencl.last_buf;
+
+    /* frame_stats ints, in order: cost_est, cost_est_aq, intra MB count */
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+
+    if( b == p1 ) // P frames only
+    {
+        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
+        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
+        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+        h->opencl.num_copies++;
+    }
+    return 0;
+}
+
+/* Prepare the OpenCL lookahead for a slicetype decision: warm the lowres
+ * buffers for all frames and, for trellis B-Adapt, pre-enqueue the full set
+ * of motion searches.  No-op unless OpenCL lookahead is enabled.
+ * Pairs with x264_opencl_slicetype_end(), which restores thread priorities. */
+void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
+{
+    if( h->param.b_opencl )
+    {
+/* use #ifdef: _WIN32 is simply undefined off Windows, and testing an
+ * undefined macro with #if trips -Wundef */
+#ifdef _WIN32
+        /* Temporarily boost priority of this lookahead thread and the OpenCL
+         * driver's thread until the end of this function. On AMD GPUs this
+         * greatly reduces the latency of enqueuing kernels and getting results
+         * on Windows. */
+        HANDLE id = GetCurrentThread();
+        h->opencl.lookahead_thread_pri = GetThreadPriority( id );
+        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
+        cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
+        if( status == CL_SUCCESS )
+        {
+            h->opencl.opencl_thread_pri = GetThreadPriority( id );
+            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
+        }
+#endif
+
+        /* precalculate intra and I frames */
+        for( int i = 0; i <= num_frames; i++ )
+            x264_opencl_lowres_init( h, frames[i], lambda );
+        x264_opencl_flush( h );
+
+        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
+        {
+            /* For trellis B-Adapt, precompute exhaustive motion searches */
+            for( int b = 0; b <= num_frames; b++ )
+            {
+                for( int j = 1; j < h->param.i_bframe; j++ )
+                {
+                    int p0 = b - j;
+                    /* 0x7FFF in mv[0][0] marks a search not yet performed */
+                    if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
+                    {
+                        const x264_weight_t *w = x264_weight_none;
+
+                        if( h->param.analyse.i_weighted_pred )
+                        {
+                            /* weights_analyse uses MMX; clear FPU state first */
+                            x264_emms();
+                            x264_weights_analyse( h, frames[b], frames[p0], 1 );
+                            w = frames[b]->weight[0];
+                        }
+                        frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+                        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
+                    }
+                    int p1 = b + j;
+                    if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
+                    {
+                        frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
+                        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
+                    }
+                }
+            }
+
+            x264_opencl_flush( h );
+        }
+    }
+}
+
+
+/* Undo the Windows thread-priority boost applied by
+ * x264_opencl_slicetype_prep().  No-op on non-Windows builds or when
+ * OpenCL lookahead is disabled. */
+void x264_opencl_slicetype_end( x264_t *h )
+{
+/* use #ifdef: _WIN32 is simply undefined off Windows, and testing an
+ * undefined macro with #if trips -Wundef */
+#ifdef _WIN32
+    if( h->param.b_opencl )
+    {
+        HANDLE id = GetCurrentThread();
+        SetThreadPriority( id, h->opencl.lookahead_thread_pri );
+        cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
+        if( status == CL_SUCCESS )
+            SetThreadPriority( id, h->opencl.opencl_thread_pri );
+    }
+#endif
+}
+
+/* Enqueue all GPU work needed to estimate the (p0,p1,b) frame cost, unless
+ * it was already computed.  Returns 1 if new work was enqueued (caller
+ * should flush), 0 if the cost was already available or trivially zero. */
+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
+{
+    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
+        return 0;
+    else
+    {
+        int do_search[2];
+        int dist_scale_factor = 128;
+        const x264_weight_t *w = x264_weight_none;
+
+        // avoid duplicating work
+        frames[b]->i_cost_est[b-p0][p1-b] = 0;
+
+        /* 0x7FFF in mv[0][0] marks a search not yet performed */
+        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
+        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
+        if( do_search[0] )
+        {
+            if( h->param.analyse.i_weighted_pred && b == p1 )
+            {
+                /* weights_analyse uses MMX; clear FPU state first */
+                x264_emms();
+                x264_weights_analyse( h, frames[b], frames[p0], 1 );
+                w = frames[b]->weight[0];
+            }
+            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+        }
+        if( do_search[1] )
+            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
+        if( b == p1 )
+            frames[b]->i_intra_mbs[b-p0] = 0;
+        if( p1 != p0 )
+            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
+
+        /* NOTE(review): i_cost_est was already zeroed above as the
+         * "work claimed" marker; this second store is redundant but harmless */
+        frames[b]->i_cost_est[b-p0][p1-b] = 0;
+        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
+
+        x264_opencl_lowres_init( h, frames[b], lambda );
+
+        if( do_search[0] )
+        {
+            x264_opencl_lowres_init( h, frames[p0], lambda );
+            x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
+        }
+        if( do_search[1] )
+        {
+            x264_opencl_lowres_init( h, frames[p1], lambda );
+            x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
+        }
+        x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
+        return 1;
+    }
+}
+
+#endif
x264_frame_t **frames, int p0, int p1, int b,
int b_intra_penalty );
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
+
+#if HAVE_OPENCL
+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda );
+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w );
+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor );
+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b );
+void x264_opencl_flush( x264_t *h );
+void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda );
+void x264_opencl_slicetype_end( x264_t *h );
+#endif
+
static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
{
a->i_qp = X264_LOOKAHEAD_QP;
return cost;
}
-static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
{
int i_delta_index = fenc->i_frame - ref->i_frame - 1;
/* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
output_inter[0] = h->scratch_buffer2;
output_intra[0] = output_inter[0] + output_buf_size;
- if( h->param.i_lookahead_threads > 1 )
+#if HAVE_OPENCL
+ if( h->param.b_opencl )
{
- x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
+ x264_opencl_lowres_init(h, fenc, a->i_lambda );
+ if( do_search[0] )
+ {
+ x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
+ x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
+ }
+ if( do_search[1] )
+ {
+ x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
+ x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, NULL );
+ }
+ if( b != p0 )
+ x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
+ x264_opencl_flush( h );
- for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+ i_score = fenc->i_cost_est[b-p0][p1-b];
+ }
+ else
+#endif
+ {
+ if( h->param.i_lookahead_threads > 1 )
{
- x264_t *t = h->lookahead_thread[i];
+ x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
- /* FIXME move this somewhere else */
- t->mb.i_me_method = h->mb.i_me_method;
- t->mb.i_subpel_refine = h->mb.i_subpel_refine;
- t->mb.b_chroma_me = h->mb.b_chroma_me;
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+ {
+ x264_t *t = h->lookahead_thread[i];
- s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
- output_inter[i], output_intra[i] };
+ /* FIXME move this somewhere else */
+ t->mb.i_me_method = h->mb.i_me_method;
+ t->mb.i_subpel_refine = h->mb.i_subpel_refine;
+ t->mb.b_chroma_me = h->mb.b_chroma_me;
- t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
- t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+ s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+ output_inter[i], output_intra[i] };
- int thread_height = t->i_threadslice_end - t->i_threadslice_start;
- int thread_output_size = thread_height + NUM_INTS;
- memset( output_inter[i], 0, thread_output_size * sizeof(int) );
- memset( output_intra[i], 0, thread_output_size * sizeof(int) );
- output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
+ t->i_threadslice_start = ((h->mb.i_mb_height * i + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+ t->i_threadslice_end = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
- output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
- output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
+ int thread_height = t->i_threadslice_end - t->i_threadslice_start;
+ int thread_output_size = thread_height + NUM_INTS;
+ memset( output_inter[i], 0, thread_output_size * sizeof(int) );
+ memset( output_intra[i], 0, thread_output_size * sizeof(int) );
+ output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
- x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
- }
- for( int i = 0; i < h->param.i_lookahead_threads; i++ )
- x264_threadpool_wait( h->lookaheadpool, &s[i] );
- }
- else
- {
- h->i_threadslice_start = 0;
- h->i_threadslice_end = h->mb.i_mb_height;
- memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
- memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
- output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
- x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
- output_inter[0], output_intra[0] };
- x264_slicetype_slice_cost( &s );
- }
+ output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
+ output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
- /* Sum up accumulators */
- if( b == p1 )
- fenc->i_intra_mbs[b-p0] = 0;
- if( !fenc->b_intra_calculated )
- {
- fenc->i_cost_est[0][0] = 0;
- fenc->i_cost_est_aq[0][0] = 0;
- }
- fenc->i_cost_est[b-p0][p1-b] = 0;
- fenc->i_cost_est_aq[b-p0][p1-b] = 0;
+ x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
+ }
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+ x264_threadpool_wait( h->lookaheadpool, &s[i] );
+ }
+ else
+ {
+ h->i_threadslice_start = 0;
+ h->i_threadslice_end = h->mb.i_mb_height;
+ memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+ memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+ output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
+ x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+ output_inter[0], output_intra[0] };
+ x264_slicetype_slice_cost( &s );
+ }
- int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
- int *row_satd_intra = fenc->i_row_satds[0][0];
- for( int i = 0; i < h->param.i_lookahead_threads; i++ )
- {
+ /* Sum up accumulators */
if( b == p1 )
- fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
+ fenc->i_intra_mbs[b-p0] = 0;
if( !fenc->b_intra_calculated )
{
- fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
- fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
+ fenc->i_cost_est[0][0] = 0;
+ fenc->i_cost_est_aq[0][0] = 0;
}
+ fenc->i_cost_est[b-p0][p1-b] = 0;
+ fenc->i_cost_est_aq[b-p0][p1-b] = 0;
- fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
- fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
-
- if( h->param.rc.i_vbv_buffer_size )
+ int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
+ int *row_satd_intra = fenc->i_row_satds[0][0];
+ for( int i = 0; i < h->param.i_lookahead_threads; i++ )
{
- int row_count = output_inter[i][NUM_ROWS];
- memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
+ if( b == p1 )
+ fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
if( !fenc->b_intra_calculated )
- memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
- row_satd_inter += row_count;
- row_satd_intra += row_count;
+ {
+ fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
+ fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
+ }
+
+ fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
+ fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
+
+ if( h->param.rc.i_vbv_buffer_size )
+ {
+ int row_count = output_inter[i][NUM_ROWS];
+ memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
+ if( !fenc->b_intra_calculated )
+ memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
+ row_satd_inter += row_count;
+ row_satd_intra += row_count;
+ }
}
- }
- i_score = fenc->i_cost_est[b-p0][p1-b];
- if( b != p1 )
- i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
- else
- fenc->b_intra_calculated = 1;
+ i_score = fenc->i_cost_est[b-p0][p1-b];
+ if( b != p1 )
+ i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
+ else
+ fenc->b_intra_calculated = 1;
- fenc->i_cost_est[b-p0][p1-b] = i_score;
- x264_emms();
+ fenc->i_cost_est[b-p0][p1-b] = i_score;
+ x264_emms();
+ }
}
if( b_intra_penalty )
return;
}
+#if HAVE_OPENCL
+ x264_opencl_slicetype_prep( h, frames, num_frames, a.i_lambda );
+#endif
+
if( h->param.i_bframe )
{
if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
continue;
}
+#if HAVE_OPENCL
+ if( h->param.b_opencl )
+ {
+ int b_work_done = 0;
+ b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+2, i+1 );
+ b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+1, i+1 );
+ b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+1, i+2, i+2 );
+ if( b_work_done )
+ x264_opencl_flush( h );
+ }
+#endif
+
cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );
/* Restore frametypes for all frames that haven't actually been decided yet. */
for( int j = reset_start; j <= num_frames; j++ )
frames[j]->i_type = X264_TYPE_AUTO;
+
+#if HAVE_OPENCL
+ x264_opencl_slicetype_end( h );
+#endif
}
void x264_slicetype_decide( x264_t *h )
--- /dev/null
+# Perl script used for compiling OpenCL src into x264 binary
+#
+# Copyright (C) 2013 x264 project
+# Authors: Steve Borho <sborho@multicorewareinc.com>
+
+use Digest::MD5 qw(md5_hex);
+
+# xxd takes a VAR, which will be the variable name
+# and BYTES, a string of bytes to be encoded.
+# Print BYTES (passed as a scalar ref) to stdout as a NUL-terminated
+# C array:  static const char VAR[] = { 0x.., ... 0x00 };
+sub xxd
+{
+    my %args = @_;
+    my $var = $args{VAR};
+    my $bytes = $args{BYTES};
+    my @hexbytes;
+    my @bytes = split //, $$bytes;
+    # lexical loop variable: a bare $b would clobber the package-global
+    # $b reserved for sort() comparators (and the second loop below
+    # already uses a lexical $h)
+    foreach my $b (@bytes)
+    {
+        push @hexbytes, sprintf("0x%02X", ord($b));
+    }
+
+    # Format 'em nice and pretty-like: 16 bytes per output line.
+    print 'static const char ' . $var . '[] = {' . "\n";
+    my $count = 0;
+    foreach my $h (@hexbytes)
+    {
+        print "$h, ";
+        $count++;
+        if ($count == 16)
+        {
+            print "\n";
+            $count = 0;
+        }
+    }
+    # trailing NUL so the array is usable as a C string
+    print "\n0x00 };\n\n";
+
+    return;
+}
+
+# Usage: cltostr.pl VARNAME < input.cl > output.h
+if (@ARGV < 1)
+{
+    # original printed "$0\n: VARNAME " -- newline belongs at the end
+    printf "%s: VARNAME\n", $0;
+    exit(-1);
+}
+
+
+# Read the OpenCL source from stdin, stripping leading whitespace and
+# dropping '//' comment lines to shrink the embedded blob.
+my @lines;
+while(<STDIN>)
+{
+    s/^\s+//; # trim leading whitespace
+    if (/^\/\//)
+    {
+        next; # skip the line if it starts with '//'
+    }
+    push @lines, $_;
+}
+
+my $lines = join '', @lines;
+# $ARGV[0], not the one-element slice @ARGV[0]
+xxd(VAR => $ARGV[0], BYTES => \$lines);
+
+# Also embed the MD5 of the source (as its hex string) so the encoder can
+# detect a stale compiled-kernel cache.  The original also split the digest
+# into an unused @hash array; dropped as dead code.
+my $hash = md5_hex($lines);
+
+
+xxd(VAR => $ARGV[0] . "_hash", BYTES => \$hash);
" as opposed to letting them select different algorithms\n" );
H2( " --asm <integer> Override CPU detection\n" );
H2( " --no-asm Disable all CPU optimizations\n" );
+ H2( " --opencl Enable use of OpenCL\n" );
+ H2( " --opencl-clbin <string> Specify path of compiled OpenCL kernel cache\n" );
+ H2( " --opencl-device <integer> Specify OpenCL device ordinal\n" );
H2( " --visualize Show MB types overlayed on the encoded video\n" );
H2( " --dump-yuv <string> Save reconstructed frames\n" );
H2( " --sps-id <integer> Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
{ "ref", required_argument, NULL, 'r' },
{ "asm", required_argument, NULL, 0 },
{ "no-asm", no_argument, NULL, 0 },
+ { "opencl", no_argument, NULL, 1 },
+ { "opencl-clbin",required_argument, NULL, 0 },
+ { "opencl-device",required_argument, NULL, 0 },
{ "sar", required_argument, NULL, 0 },
{ "fps", required_argument, NULL, OPT_FPS },
{ "frames", required_argument, NULL, OPT_FRAMES },
int b_fake_interlaced;
+ int b_opencl; /* use OpenCL when available */
+ int i_opencl_device; /* specify count of GPU devices to skip, for CLI users */
+ void *opencl_device_id; /* pass explicit cl_device_id as void*, for API users */
+ char *psz_clbin_file; /* compiled OpenCL kernel cache file */
+
/* Slicing parameters */
int i_slice_max_size; /* Max size per slice in bytes; includes estimated NAL overhead. */
int i_slice_max_mbs; /* Max number of MBs per slice; overrides i_slice_count. */