OpenCL lookahead

author Steve Borho <steve@borho.org>

Thu, 21 Feb 2013 18:48:40 +0000 (12:48 -0600)

committer Fiona Glaser <fiona@x264.com>

Tue, 23 Apr 2013 21:36:20 +0000 (14:36 -0700)
author Steve Borho <steve@borho.org>
Thu, 21 Feb 2013 18:48:40 +0000 (12:48 -0600)
committer Fiona Glaser <fiona@x264.com>
Tue, 23 Apr 2013 21:36:20 +0000 (14:36 -0700)
diff --git a/.gitignore b/.gitignore

index 682a23c1263b9594608d642433ef06b1afa5a77c..0ff9fed77a8f005f55813481d0664f69f3b27820 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -43,3 +43,5 @@ checkasm
  .digress_x264
  dataDec.txt
  log.dec
+common/oclobj.h
+x264_lookahead.clbin
diff --git a/Makefile b/Makefile

index 7a22c42bd39786bebebd12ce75c5b1ac3fbf1a25..da1b45544515841125955ce2c1bfbfaf4a4034ca 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -8,6 +8,8 @@ vpath %.S $(SRCPATH)
  vpath %.asm $(SRCPATH)
  vpath %.rc $(SRCPATH)
  
+GENERATED =
+
  all: default
  default:
  
@@ -145,6 +147,35 @@ OBJSO  += $(if $(RC), x264res.dll.o)
  endif
  endif
  
+QUOTED_CFLAGS := $(CFLAGS)
+
+ifeq ($(HAVE_OPENCL),yes)
+empty:=
+space:=$(empty) $(empty)
+escaped:=\ $(empty)
+open:=(
+escopen:=\(
+close:=)
+escclose:=\)
+SAFE_INC_DIR := $(subst $(space),$(escaped),$(OPENCL_INC_DIR))
+SAFE_INC_DIR := $(subst $(open),$(escopen),$(SAFE_INC_DIR))
+SAFE_INC_DIR := $(subst $(close),$(escclose),$(SAFE_INC_DIR))
+SAFE_LIB_DIR := $(subst $(space),$(escaped),$(OPENCL_LIB_DIR))
+SAFE_LIB_DIR := $(subst $(open),$(escopen),$(SAFE_LIB_DIR))
+SAFE_LIB_DIR := $(subst $(close),$(escclose),$(SAFE_LIB_DIR))
+# For normal CFLAGS and LDFLAGS, we must escape spaces with a backslash to
+# make gcc happy
+CFLAGS += -I$(SAFE_INC_DIR) -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
+LDFLAGS += -l$(OPENCL_LIB) -L$(SAFE_LIB_DIR)
+# For the CFLAGS used by the .depend rule, we must add quotes because
+# the rule does an extra level of shell expansions
+QUOTED_CFLAGS += -I"$(OPENCL_INC_DIR)" -DCL_USE_DEPRECATED_OPENCL_1_1_APIS
+common/oclobj.h: common/opencl/x264-cl.h $(wildcard $(SRCPATH)/common/opencl/*.cl)
+       cat $^ | perl $(SRCPATH)/tools/cltostr.pl x264_opencl_source > $@
+GENERATED += common/oclobj.h
+SRCS += common/opencl.c encoder/slicetype-cl.c
+endif
+
  OBJS   += $(SRCS:%.c=%.o)
  OBJCLI += $(SRCCLI:%.c=%.o)
  OBJSO  += $(SRCSO:%.c=%.o)
@@ -155,12 +186,12 @@ cli: x264$(EXE)
  lib-static: $(LIBX264)
  lib-shared: $(SONAME)
  
-$(LIBX264): .depend $(OBJS) $(OBJASM)
+$(LIBX264): $(GENERATED) .depend $(OBJS) $(OBJASM)
         rm -f $(LIBX264)
         $(AR)$@ $(OBJS) $(OBJASM)
         $(if $(RANLIB), $(RANLIB) $@)
  
-$(SONAME): .depend $(OBJS) $(OBJASM) $(OBJSO)
+$(SONAME): $(GENERATED) .depend $(OBJS) $(OBJASM) $(OBJSO)
         $(LD)$@ $(OBJS) $(OBJASM) $(OBJSO) $(SOFLAGS) $(LDFLAGS)
  
  ifneq ($(EXE),)
@@ -169,10 +200,10 @@ x264: x264$(EXE)
  checkasm: checkasm$(EXE)
  endif
  
-x264$(EXE): .depend $(OBJCLI) $(CLI_LIBX264)
+x264$(EXE): $(GENERATED) .depend $(OBJCLI) $(CLI_LIBX264)
         $(LD)$@ $(OBJCLI) $(CLI_LIBX264) $(LDFLAGSCLI) $(LDFLAGS)
  
-checkasm$(EXE): .depend $(OBJCHK) $(LIBX264)
+checkasm$(EXE): $(GENERATED) .depend $(OBJCHK) $(LIBX264)
         $(LD)$@ $(OBJCHK) $(LIBX264) $(LDFLAGS)
  
  $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
@@ -193,7 +224,7 @@ $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK): .depend
  
  .depend: config.mak
         @rm -f .depend
-       @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
+       @$(foreach SRC, $(addprefix $(SRCPATH)/, $(SRCS) $(SRCCLI) $(SRCSO)), $(CC) $(QUOTED_CFLAGS) $(SRC) $(DEPMT) $(SRC:$(SRCPATH)/%.c=%.o) $(DEPMM) 1>> .depend;)
  
  config.mak:
         ./configure
@@ -231,7 +262,7 @@ endif
  
  clean:
         rm -f $(OBJS) $(OBJASM) $(OBJCLI) $(OBJSO) $(SONAME) *.a *.lib *.exp *.pdb x264 x264.exe .depend TAGS
-       rm -f checkasm checkasm.exe $(OBJCHK)
+       rm -f checkasm checkasm.exe $(OBJCHK) $(GENERATED) x264_lookahead.clbin
         rm -f $(SRC2:%.c=%.gcda) $(SRC2:%.c=%.gcno) *.dyn pgopti.dpi pgopti.dpi.lock
  
  distclean: clean
diff --git a/common/common.c b/common/common.c

index 23951de7aa6847c1392b85f8443b9a5ccd2c8012..0e8943a3d2616fbd17d9e4c87a025bb22b22eb5f 100644 (file)
--- a/common/common.c
+++ b/common/common.c
@@ -171,6 +171,10 @@ void x264_param_default( x264_param_t *param )
      param->b_pic_struct = 0;
      param->b_fake_interlaced = 0;
      param->i_frame_packing = -1;
+    param->b_opencl = 0;
+    param->i_opencl_device = 0;
+    param->opencl_device_id = NULL;
+    param->psz_clbin_file = NULL;
  }
  
  static int x264_param_apply_preset( x264_param_t *param, const char *preset )
@@ -1033,6 +1037,12 @@ int x264_param_parse( x264_param_t *p, const char *name, const char *value )
          p->b_fake_interlaced = atobool(value);
      OPT("frame-packing")
          p->i_frame_packing = atoi(value);
+    OPT("opencl")
+        p->b_opencl = atobool( value );
+    OPT("opencl-clbin")
+        p->psz_clbin_file = strdup( value );
+    OPT("opencl-device")
+        p->i_opencl_device = atoi( value );
      else
          return X264_PARAM_BAD_NAME;
  #undef OPT
@@ -1285,7 +1295,9 @@ char *x264_param2string( x264_param_t *p, int b_res )
          s += sprintf( s, "bitdepth=%d ", BIT_DEPTH );
      }
  
-    s += sprintf( s, "cabac=%d", p->b_cabac );
+    if( p->b_opencl )
+        s += sprintf( s, "opencl=%d", p->b_opencl );
+    s += sprintf( s, " cabac=%d", p->b_cabac );
      s += sprintf( s, " ref=%d", p->i_frame_reference );
      s += sprintf( s, " deblock=%d:%d:%d", p->b_deblocking_filter,
                    p->i_deblocking_filter_alphac0, p->i_deblocking_filter_beta );
diff --git a/common/common.h b/common/common.h

index f5c69baaac3888c4896d668d8c3b51c05ad0fab6..46344b0c842835887b7263fd49db4ff81e6a0857 100644 (file)
--- a/common/common.h
+++ b/common/common.h
@@ -54,6 +54,8 @@ do {\
      memset( var, 0, size );\
  } while( 0 )
  
+#define ARRAY_SIZE(array)  (sizeof(array)/sizeof(array[0]))
+
  #define X264_BFRAME_MAX 16
  #define X264_REF_MAX 16
  #define X264_THREAD_MAX 128
@@ -94,6 +96,10 @@ do {\
  #include <assert.h>
  #include <limits.h>
  
+#if HAVE_OPENCL
+#include "opencl.h"
+#endif
+
  #if HAVE_INTERLACED
  #   define MB_INTERLACED h->mb.b_interlaced
  #   define SLICE_MBAFF h->sh.b_mbaff
@@ -936,6 +942,10 @@ struct x264_t
      struct visualize_t *visualize;
  #endif
      x264_lookahead_t *lookahead;
+
+#if HAVE_OPENCL
+    x264_opencl_t opencl;
+#endif
  };
  
  // included at the end because it needs x264_t
diff --git a/common/frame.c b/common/frame.c

index c51fb9b85ca5dcb635d581ddf0728b9947037963..cc44ebdd8f7739df37c746439bcd4c6be4ef66a5 100644 (file)
--- a/common/frame.c
+++ b/common/frame.c
@@ -316,6 +316,9 @@ void x264_frame_delete( x264_frame_t *frame )
          }
          x264_pthread_mutex_destroy( &frame->mutex );
          x264_pthread_cond_destroy( &frame->cv );
+#if HAVE_OPENCL
+        x264_opencl_frame_delete( frame );
+#endif
      }
      x264_free( frame );
  }
diff --git a/common/frame.h b/common/frame.h

index 468503ae54661d310ef24d05298558ada00f4692..72c1fa3a56c7f4a758e61bf05d367e0e55b543b7 100644 (file)
--- a/common/frame.h
+++ b/common/frame.h
@@ -172,6 +172,10 @@ typedef struct x264_frame
      /* user frame properties */
      uint8_t *mb_info;
      void (*mb_info_free)( void* );
+
+#if HAVE_OPENCL
+    x264_frame_opencl_t opencl;
+#endif
  } x264_frame_t;
  
  /* synchronized frame list */
diff --git a/common/opencl.c b/common/opencl.c

new file mode 100644 (file)

index 0000000..aa2bd04
--- /dev/null
+++ b/common/opencl.c
@@ -0,0 +1,606 @@
+/*****************************************************************************
+ * opencl.c: OpenCL initialization and kernel compilation
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common.h"
+#if _WIN32
+#include <windows.h>
+#else
+#include <dlfcn.h> //dlopen, dlsym, dlclose
+#endif
+
+/* define from recent cl_ext.h, copied here in case headers are old */
+#define CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD        0x4042
+
+/* Requires full include path in case of out-of-tree builds */
+#include "common/oclobj.h"
+
+static int x264_detect_switchable_graphics();
+
+/* Try to load the cached compiled program binary, verify the device context is
+ * still valid before reuse */
+static cl_program x264_opencl_cache_load( x264_t *h, char *devname, char *devvendor, char *driverversion )
+{
+    cl_program program = NULL;
+    cl_int status;
+
+    /* try to load cached program binary */
+    FILE *fp = fopen( h->param.psz_clbin_file, "rb" );
+    if( !fp )
+        return NULL;
+
+    fseek( fp, 0L, SEEK_END );
+    size_t size = ftell( fp );
+    rewind( fp );
+    uint8_t *binary;
+    CHECKED_MALLOC( binary, size );
+
+    fread( binary, 1, size, fp );
+    const uint8_t *ptr = (const uint8_t*)binary;
+
+#define CHECK_STRING( STR )\
+    do {\
+        size_t len = strlen( STR );\
+        if( size <= len || strncmp( (char*)ptr, STR, len ) )\
+            goto fail;\
+        else {\
+            size -= (len+1); ptr += (len+1);\
+        }\
+    } while( 0 )
+
+    CHECK_STRING( devname );
+    CHECK_STRING( devvendor );
+    CHECK_STRING( driverversion );
+    CHECK_STRING( x264_opencl_source_hash );
+#undef CHECK_STRING
+
+    program = clCreateProgramWithBinary( h->opencl.context, 1, &h->opencl.device, &size, &ptr, NULL, &status );
+    if( status != CL_SUCCESS )
+        program = NULL;
+
+fail:
+    fclose( fp );
+    x264_free( binary );
+    return program;
+}
+
+/* Save the compiled program binary to a file for later reuse.  Device context
+ * is also saved in the cache file so we do not reuse stale binaries */
+static void x264_opencl_cache_save( x264_t *h, cl_program program, char *devname, char *devvendor, char *driverversion )
+{
+    FILE *fp = fopen( h->param.psz_clbin_file, "wb" );
+    if( !fp )
+    {
+        x264_log( h, X264_LOG_INFO, "OpenCL: unable to open clbin file for write");
+        return;
+    }
+
+    size_t size;
+    cl_int status = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &size, NULL );
+    if( status == CL_SUCCESS )
+    {
+        uint8_t *binary;
+        CHECKED_MALLOC( binary, size );
+        status = clGetProgramInfo( program, CL_PROGRAM_BINARIES, sizeof(uint8_t *), &binary, NULL );
+        if( status == CL_SUCCESS )
+        {
+            fputs( devname, fp );
+            fputc( '\n', fp );
+            fputs( devvendor, fp );
+            fputc( '\n', fp );
+            fputs( driverversion, fp );
+            fputc( '\n', fp );
+            fputs( x264_opencl_source_hash, fp );
+            fputc( '\n', fp );
+            fwrite( binary, 1, size, fp );
+        }
+        else
+            x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary, no cache file generated");
+        x264_free( binary );
+    }
+    else
+        x264_log( h, X264_LOG_INFO, "OpenCL: Unable to query program binary size, no cache file generated");
+    fclose( fp );
+
+fail:
+    return;
+}
+
+/* The OpenCL source under common/opencl will be merged into common/oclobj.h by
+ * the Makefile. It defines a x264_opencl_source byte array which we will pass
+ * to clCreateProgramWithSource().  We also attempt to use a cache file for the
+ * compiled binary, stored in the current working folder. */
+static cl_program x264_opencl_compile( x264_t *h )
+{
+    cl_program program;
+    cl_int status;
+
+    char devname[64];
+    char devvendor[64];
+    char driverversion[64];
+    status  = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME,    sizeof(devname), devname, NULL );
+    status |= clGetDeviceInfo( h->opencl.device, CL_DEVICE_VENDOR,  sizeof(devvendor), devvendor, NULL );
+    status |= clGetDeviceInfo( h->opencl.device, CL_DRIVER_VERSION, sizeof(driverversion), driverversion, NULL );
+    if( status != CL_SUCCESS )
+        return NULL;
+
+    // Most AMD GPUs have vector registers
+    int vectorize = !strcmp( devvendor, "Advanced Micro Devices, Inc." );
+    h->opencl.b_device_AMD_SI = 0;
+
+    if( vectorize )
+    {
+        /* Disable OpenCL on Intel/AMD switchable graphics devices */
+        if( x264_detect_switchable_graphics() )
+        {
+            x264_log( h, X264_LOG_INFO, "OpenCL acceleration disabled, switchable graphics detected\n" );
+            return NULL;
+        }
+
+        /* Detect AMD SouthernIsland or newer device (single-width registers) */
+        cl_uint simdwidth = 4;
+        status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_SIMD_INSTRUCTION_WIDTH_AMD, sizeof(cl_uint), &simdwidth, NULL );
+        if( status == CL_SUCCESS && simdwidth == 1 )
+        {
+            vectorize = 0;
+            h->opencl.b_device_AMD_SI = 1;
+        }
+    }
+
+    x264_log( h, X264_LOG_INFO, "OpenCL acceleration enabled with %s %s %s\n", devvendor, devname, h->opencl.b_device_AMD_SI ? "(SI)" : "" );
+
+    program = x264_opencl_cache_load( h, devname, devvendor, driverversion );
+    if( !program )
+    {
+        /* clCreateProgramWithSource() requires a pointer variable, you cannot just use &x264_opencl_source */
+        x264_log( h, X264_LOG_INFO, "Compiling OpenCL kernels...\n" );
+        const char *strptr = (const char*)x264_opencl_source;
+        size_t size = sizeof(x264_opencl_source);
+        program = clCreateProgramWithSource( h->opencl.context, 1, &strptr, &size, &status );
+        if( status != CL_SUCCESS || !program )
+        {
+            x264_log( h, X264_LOG_WARNING, "OpenCL: unable to create program\n" );
+            return NULL;
+        }
+    }
+
+    /* Build the program binary for the OpenCL device */
+    const char *buildopts = vectorize ? "-DVECTORIZE=1" : "";
+    status = clBuildProgram( program, 1, &h->opencl.device, buildopts, NULL, NULL );
+    if( status == CL_SUCCESS )
+    {
+        x264_opencl_cache_save( h, program, devname, devvendor, driverversion );
+        return program;
+    }
+
+    /* Compile failure, should not happen with production code. */
+
+    size_t build_log_len = 0;
+
+    status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, NULL, &build_log_len );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to query build log\n" );
+        return NULL;
+    }
+
+    char *build_log;
+    CHECKED_MALLOC( build_log, build_log_len );
+    if( !build_log )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to alloc build log\n" );
+        return NULL;
+    }
+
+    status = clGetProgramBuildInfo( program, h->opencl.device, CL_PROGRAM_BUILD_LOG, build_log_len, build_log, NULL );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Compilation failed, unable to get build log\n" );
+        x264_free( build_log );
+        return NULL;
+    }
+
+    FILE *lg = fopen( "x264_kernel_build_log.txt", "w" );
+    if( lg )
+    {
+        fwrite( build_log, 1, build_log_len, lg );
+        fclose( lg );
+        x264_log( h, X264_LOG_WARNING, "OpenCL: kernel build errors written to x264_kernel_build_log.txt\n" );
+    }
+
+    x264_free( build_log );
+fail:
+    return NULL;
+}
+
+static void x264_opencl_free_lookahead( x264_t *h )
+{
+#define RELEASE( a, f ) if( a ) f( a );
+    RELEASE( h->opencl.intra_kernel, clReleaseKernel )
+    RELEASE( h->opencl.rowsum_intra_kernel, clReleaseKernel )
+    RELEASE( h->opencl.downscale_kernel1, clReleaseKernel )
+    RELEASE( h->opencl.downscale_kernel2, clReleaseKernel )
+    RELEASE( h->opencl.downscale_hpel_kernel, clReleaseKernel )
+    RELEASE( h->opencl.weightp_hpel_kernel, clReleaseKernel )
+    RELEASE( h->opencl.weightp_scaled_images_kernel, clReleaseKernel )
+    RELEASE( h->opencl.memset_kernel, clReleaseKernel )
+    RELEASE( h->opencl.hme_kernel, clReleaseKernel )
+    RELEASE( h->opencl.subpel_refine_kernel, clReleaseKernel )
+    RELEASE( h->opencl.mode_select_kernel, clReleaseKernel )
+    RELEASE( h->opencl.rowsum_inter_kernel, clReleaseKernel )
+    RELEASE( h->opencl.lookahead_program, clReleaseProgram )
+    RELEASE( h->opencl.row_satds[0], clReleaseMemObject )
+    RELEASE( h->opencl.row_satds[1], clReleaseMemObject )
+    RELEASE( h->opencl.frame_stats[0], clReleaseMemObject )
+    RELEASE( h->opencl.frame_stats[1], clReleaseMemObject )
+    RELEASE( h->opencl.mv_buffers[0], clReleaseMemObject )
+    RELEASE( h->opencl.mv_buffers[1], clReleaseMemObject )
+    RELEASE( h->opencl.mvp_buffer, clReleaseMemObject )
+    RELEASE( h->opencl.luma_16x16_image[0], clReleaseMemObject )
+    RELEASE( h->opencl.luma_16x16_image[1], clReleaseMemObject )
+    RELEASE( h->opencl.lowres_mv_costs, clReleaseMemObject )
+    RELEASE( h->opencl.lowres_costs[0], clReleaseMemObject )
+    RELEASE( h->opencl.lowres_costs[1], clReleaseMemObject )
+    RELEASE( h->opencl.page_locked_buffer, clReleaseMemObject )
+    RELEASE( h->opencl.weighted_luma_hpel, clReleaseMemObject )
+    for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+        RELEASE( h->opencl.weighted_scaled_images[i], clReleaseMemObject )
+#undef RELEASE
+}
+
+int x264_opencl_init_lookahead( x264_t *h )
+{
+    if( !h->param.rc.i_lookahead )
+        return -1;
+
+    static const char const *kernelnames[] = {
+        "mb_intra_cost_satd_8x8",
+        "sum_intra_cost",
+        "downscale_hpel",
+        "downscale1",
+        "downscale2",
+        "memset_int16",
+        "weightp_scaled_images",
+        "weightp_hpel",
+        "hierarchical_motion",
+        "subpel_refine",
+        "mode_selection",
+        "sum_inter_cost"
+    };
+    cl_kernel *kernels[] = {
+        &h->opencl.intra_kernel,
+        &h->opencl.rowsum_intra_kernel,
+        &h->opencl.downscale_hpel_kernel,
+        &h->opencl.downscale_kernel1,
+        &h->opencl.downscale_kernel2,
+        &h->opencl.memset_kernel,
+        &h->opencl.weightp_scaled_images_kernel,
+        &h->opencl.weightp_hpel_kernel,
+        &h->opencl.hme_kernel,
+        &h->opencl.subpel_refine_kernel,
+        &h->opencl.mode_select_kernel,
+        &h->opencl.rowsum_inter_kernel
+    };
+    cl_int status;
+
+    h->opencl.lookahead_program = x264_opencl_compile( h );
+    if( !h->opencl.lookahead_program )
+    {
+        x264_opencl_free_lookahead( h );
+        return -1;
+    }
+
+    for( int i = 0; i < ARRAY_SIZE(kernelnames); i++ )
+    {
+        *kernels[i] = clCreateKernel( h->opencl.lookahead_program, kernelnames[i], &status );
+        if( status != CL_SUCCESS )
+        {
+            x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to compile kernel '%s' (%d)\n", kernelnames[i], status );
+            x264_opencl_free_lookahead( h );
+            return -1;
+        }
+    }
+
+    h->opencl.page_locked_buffer = clCreateBuffer( h->opencl.context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, PAGE_LOCKED_BUF_SIZE, NULL, &status );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to allocate page-locked buffer, error '%d'\n", status );
+        x264_opencl_free_lookahead( h );
+        return -1;
+    }
+    h->opencl.page_locked_ptr = clEnqueueMapBuffer( h->opencl.queue, h->opencl.page_locked_buffer, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE,
+                                                    0, PAGE_LOCKED_BUF_SIZE, 0, NULL, NULL, &status );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to map page-locked buffer, error '%d'\n", status );
+        x264_opencl_free_lookahead( h );
+        return -1;
+    }
+
+    return 0;
+}
+
+static void x264_opencl_error_notify( const char *errinfo, const void *private_info, size_t cb, void *user_data )
+{
+    /* Any error notification can be assumed to be fatal to the OpenCL context.
+     * We need to stop using it immediately to prevent further damage. */
+    x264_t *h = (x264_t*)user_data;
+    h->param.b_opencl = 0;
+    h->opencl.b_fatal_error = 1;
+    x264_log( h, X264_LOG_ERROR, "OpenCL: %s\n", errinfo );
+    x264_log( h, X264_LOG_ERROR, "OpenCL: fatal error, aborting encode\n" );
+}
+
+int x264_opencl_init( x264_t *h )
+{
+    cl_int status;
+    cl_uint numPlatforms;
+    int ret = -1;
+
+    status = clGetPlatformIDs( 0, NULL, &numPlatforms );
+    if( status != CL_SUCCESS || numPlatforms == 0 )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
+        return -1;
+    }
+
+    cl_platform_id *platforms = (cl_platform_id*)x264_malloc( numPlatforms * sizeof(cl_platform_id) );
+    status = clGetPlatformIDs( numPlatforms, platforms, NULL );
+    if( status != CL_SUCCESS )
+    {
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to query installed platforms\n");
+        x264_free( platforms );
+        return -1;
+    }
+
+    /* Select the first OpenCL platform with a GPU device that supports our
+     * required image (texture) formats */
+    for( cl_uint i = 0; i < numPlatforms; ++i )
+    {
+        cl_uint gpu_count = 0;
+        status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &gpu_count );
+        if( status != CL_SUCCESS || !gpu_count )
+            continue;
+
+        cl_device_id *devices = x264_malloc( sizeof(cl_device_id) * gpu_count );
+        if( !devices )
+            continue;
+
+        status = clGetDeviceIDs( platforms[i], CL_DEVICE_TYPE_GPU, gpu_count, devices, NULL );
+        if( status != CL_SUCCESS )
+        {
+            x264_free( devices );
+            continue;
+        }
+
+        /* Find a GPU device that supports our image formats */
+        for( cl_uint gpu = 0; gpu < gpu_count; gpu++ )
+        {
+            h->opencl.device = devices[gpu];
+
+            /* if the user has specified an exact device ID, skip all other
+             * GPUs.  If this device matches, allow it to continue through the
+             * checks for supported images, etc.  */
+            if( h->param.opencl_device_id && devices[gpu] != (cl_device_id) h->param.opencl_device_id )
+                continue;
+
+            cl_bool image_support;
+            clGetDeviceInfo( h->opencl.device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &image_support, NULL );
+            if( !image_support )
+                continue;
+
+            cl_context context = clCreateContext( NULL, 1, &h->opencl.device, (void*)x264_opencl_error_notify, (void*)h, &status );
+            if( status != CL_SUCCESS )
+                continue;
+
+            cl_uint imagecount = 0;
+            clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, 0, NULL, &imagecount );
+            if( !imagecount )
+            {
+                clReleaseContext( context );
+                continue;
+            }
+
+            cl_image_format *imageType = x264_malloc( sizeof(cl_image_format) * imagecount );
+            if( !imageType )
+            {
+                clReleaseContext( context );
+                continue;
+            }
+
+            clGetSupportedImageFormats( context, CL_MEM_READ_WRITE, CL_MEM_OBJECT_IMAGE2D, imagecount, imageType, NULL );
+
+            int b_has_r = 0;
+            int b_has_rgba = 0;
+            for( cl_uint j = 0; j < imagecount; j++ )
+            {
+                if( imageType[j].image_channel_order == CL_R &&
+                    imageType[j].image_channel_data_type == CL_UNSIGNED_INT32 )
+                    b_has_r = 1;
+                else if( imageType[j].image_channel_order == CL_RGBA &&
+                         imageType[j].image_channel_data_type == CL_UNSIGNED_INT8 )
+                    b_has_rgba = 1;
+            }
+            x264_free( imageType );
+            if( !b_has_r || !b_has_rgba )
+            {
+                char devname[64];
+                status = clGetDeviceInfo( h->opencl.device, CL_DEVICE_NAME, sizeof(devname), devname, NULL );
+                if( status == CL_SUCCESS )
+                {
+                    /* emit warning if we are discarding the user's explicit choice */
+                    int level = h->param.opencl_device_id ? X264_LOG_WARNING : X264_LOG_DEBUG;
+                    x264_log( h, level, "OpenCL: %s does not support required image formats\n", devname);
+                }
+                clReleaseContext( context );
+                continue;
+            }
+
+            /* user selection of GPU device, skip N first matches */
+            if( h->param.i_opencl_device )
+            {
+                h->param.i_opencl_device--;
+                clReleaseContext( context );
+                continue;
+            }
+
+            h->opencl.queue = clCreateCommandQueue( context, h->opencl.device, 0, &status );
+            if( status != CL_SUCCESS )
+            {
+                clReleaseContext( context );
+                continue;
+            }
+
+            h->opencl.context = context;
+
+            ret = 0;
+            break;
+        }
+
+        x264_free( devices );
+
+        if( !ret )
+            break;
+    }
+
+    x264_free( platforms );
+
+    if( !h->param.psz_clbin_file )
+        h->param.psz_clbin_file = "x264_lookahead.clbin";
+
+    if( ret )
+        x264_log( h, X264_LOG_WARNING, "OpenCL: Unable to find a compatible device\n");
+    else
+        ret = x264_opencl_init_lookahead( h );
+
+    return ret;
+}
+
+void x264_opencl_frame_delete( x264_frame_t *frame )
+{
+#define RELEASEBUF(mem) if( mem ) clReleaseMemObject( mem );
+    for( int j = 0; j < NUM_IMAGE_SCALES; j++ )
+        RELEASEBUF( frame->opencl.scaled_image2Ds[j] );
+    RELEASEBUF( frame->opencl.luma_hpel );
+    RELEASEBUF( frame->opencl.inv_qscale_factor );
+    RELEASEBUF( frame->opencl.intra_cost );
+    RELEASEBUF( frame->opencl.lowres_mvs0 );
+    RELEASEBUF( frame->opencl.lowres_mvs1 );
+    RELEASEBUF( frame->opencl.lowres_mv_costs0 );
+    RELEASEBUF( frame->opencl.lowres_mv_costs1 );
+#undef RELEASEBUF
+}
+
+void x264_opencl_free( x264_t *h )
+{
+    if( h->opencl.queue )
+        clFinish(h->opencl.queue );
+
+    x264_opencl_free_lookahead( h );
+
+    if( h->opencl.queue )
+        clReleaseCommandQueue( h->opencl.queue );
+    if( h->opencl.context )
+        clReleaseContext( h->opencl.context );
+}
+
+/* OpenCL misbehaves on hybrid laptops with Intel iGPU and AMD dGPU, so
+ * we consult AMD's ADL interface to detect this situation and disable
+ * OpenCL on these machines (Linux and Windows) */
+#ifndef _WIN32
+#define __stdcall
+#define HINSTANCE void *
+#endif
+typedef void* ( __stdcall *ADL_MAIN_MALLOC_CALLBACK )( int );
+typedef int ( *ADL_MAIN_CONTROL_CREATE )(ADL_MAIN_MALLOC_CALLBACK, int );
+typedef int ( *ADL_ADAPTER_NUMBEROFADAPTERS_GET ) ( int* );
+typedef int ( *ADL_POWERXPRESS_SCHEME_GET ) ( int, int *, int *, int * );
+typedef int ( *ADL_MAIN_CONTROL_DESTROY )();
+#define ADL_OK 0
+#define ADL_PX_SCHEME_DYNAMIC 2
+
+void* __stdcall adl_malloc_wrapper( int iSize ) { return x264_malloc( iSize ); }
+
+static int x264_detect_switchable_graphics()
+{
+    ADL_MAIN_CONTROL_CREATE          ADL_Main_Control_Create;
+    ADL_ADAPTER_NUMBEROFADAPTERS_GET ADL_Adapter_NumberOfAdapters_Get;
+    ADL_POWERXPRESS_SCHEME_GET       ADL_PowerXpress_Scheme_Get;
+    ADL_MAIN_CONTROL_DESTROY         ADL_Main_Control_Destroy;
+    HINSTANCE hDLL;
+    int ret = 0;
+
+#if _WIN32
+    hDLL = LoadLibrary( "atiadlxx.dll" );
+    if( !hDLL )
+        hDLL = LoadLibrary( "atiadlxy.dll" );
+#else
+    hDLL = dlopen( "libatiadlxx.so", RTLD_LAZY|RTLD_GLOBAL );
+#define GetProcAddress dlsym
+#endif
+    if( !hDLL )
+        return ret;
+
+    ADL_Main_Control_Create = (ADL_MAIN_CONTROL_CREATE) GetProcAddress(hDLL, "ADL_Main_Control_Create");
+    ADL_Main_Control_Destroy = (ADL_MAIN_CONTROL_DESTROY) GetProcAddress(hDLL, "ADL_Main_Control_Destroy");
+    ADL_Adapter_NumberOfAdapters_Get = (ADL_ADAPTER_NUMBEROFADAPTERS_GET) GetProcAddress(hDLL, "ADL_Adapter_NumberOfAdapters_Get");
+    ADL_PowerXpress_Scheme_Get = (ADL_POWERXPRESS_SCHEME_GET) GetProcAddress(hDLL, "ADL_PowerXpress_Scheme_Get");
+    if( !ADL_Main_Control_Destroy || !ADL_Main_Control_Destroy || !ADL_Adapter_NumberOfAdapters_Get ||
+        !ADL_PowerXpress_Scheme_Get )
+        goto bail;
+
+    if( ADL_OK != ADL_Main_Control_Create( adl_malloc_wrapper, 1) )
+        goto bail;
+
+    int numAdapters = 0;
+    if( ADL_OK != ADL_Adapter_NumberOfAdapters_Get( &numAdapters ) )
+    {
+        ADL_Main_Control_Destroy();
+        goto bail;
+    }
+
+    for( int i = 0; i < numAdapters; i++ )
+    {
+        int PXSchemeRange, PXSchemeCurrentState, PXSchemeDefaultState;
+        if( ADL_OK != ADL_PowerXpress_Scheme_Get( i, &PXSchemeRange, &PXSchemeCurrentState, &PXSchemeDefaultState) )
+            break;
+
+        if( PXSchemeRange >= ADL_PX_SCHEME_DYNAMIC )
+        {
+            ret = 1;
+            break;
+        }
+    }
+
+    ADL_Main_Control_Destroy();
+
+bail:
+#if _WIN32
+    FreeLibrary( hDLL );
+#else
+    dlclose( hDLL );
+#endif
+
+    return ret;
+}
diff --git a/common/opencl.h b/common/opencl.h

new file mode 100644 (file)

index 0000000..86a3059
--- /dev/null
+++ b/common/opencl.h
@@ -0,0 +1,120 @@
+/*****************************************************************************
+ * opencl.h: OpenCL structures and defines
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#ifndef X264_OPENCL_H
+#define X264_OPENCL_H
+
+#include "x264.h"
+#include "common/common.h"
+
+#include <CL/cl.h>
+
+/* Number of downscale resolutions to use for motion search */
+#define NUM_IMAGE_SCALES 4
+
+/* Number of PCIe copies that can be queued before requiring a flush */
+#define MAX_FINISH_COPIES 1024
+
+/* Size (in bytes) of the page-locked buffer used for PCIe xfers */
+#define PAGE_LOCKED_BUF_SIZE 32 * 1024 * 1024
+
+typedef struct
+{
+    cl_context       context;
+    cl_device_id     device;
+    cl_command_queue queue;
+
+    cl_program  lookahead_program;
+    cl_int      last_buf;
+
+    cl_mem      page_locked_buffer;
+    char       *page_locked_ptr;
+    int         pl_occupancy;
+
+    struct
+    {
+        void *src;
+        void *dest;
+        int   bytes;
+    } copies[MAX_FINISH_COPIES];
+    int         num_copies;
+
+    int         b_device_AMD_SI;
+    int         b_fatal_error;
+    int         lookahead_thread_pri;
+    int         opencl_thread_pri;
+
+    /* downscale lowres luma */
+    cl_kernel   downscale_hpel_kernel;
+    cl_kernel   downscale_kernel1;
+    cl_kernel   downscale_kernel2;
+    cl_mem      luma_16x16_image[2];
+
+    /* weightp filtering */
+    cl_kernel   weightp_hpel_kernel;
+    cl_kernel   weightp_scaled_images_kernel;
+    cl_mem      weighted_scaled_images[NUM_IMAGE_SCALES];
+    cl_mem      weighted_luma_hpel;
+
+    /* intra */
+    cl_kernel   memset_kernel;
+    cl_kernel   intra_kernel;
+    cl_kernel   rowsum_intra_kernel;
+    cl_mem      row_satds[2];
+
+    /* hierarchical motion estimation */
+    cl_kernel   hme_kernel;
+    cl_kernel   subpel_refine_kernel;
+    cl_mem      mv_buffers[2];
+    cl_mem      lowres_mv_costs;
+    cl_mem      mvp_buffer;
+
+    /* bidir */
+    cl_kernel   mode_select_kernel;
+    cl_kernel   rowsum_inter_kernel;
+    cl_mem      lowres_costs[2];
+    cl_mem      frame_stats[2]; /* cost_est, cost_est_aq, intra_mbs */
+} x264_opencl_t;
+
+typedef struct
+{
+    cl_mem scaled_image2Ds[NUM_IMAGE_SCALES];
+    cl_mem luma_hpel;
+    cl_mem inv_qscale_factor;
+    cl_mem intra_cost;
+    cl_mem lowres_mvs0;
+    cl_mem lowres_mvs1;
+    cl_mem lowres_mv_costs0;
+    cl_mem lowres_mv_costs1;
+} x264_frame_opencl_t;
+
+typedef struct x264_frame x264_frame;
+
+int x264_opencl_init( x264_t *h );
+int x264_opencl_init_lookahead( x264_t *h );
+void x264_opencl_free( x264_t *h );
+void x264_opencl_frame_delete( x264_frame *frame );
+
+#endif
diff --git a/common/opencl/bidir.cl b/common/opencl/bidir.cl

new file mode 100644 (file)

index 0000000..9c21626
--- /dev/null
+++ b/common/opencl/bidir.cl
@@ -0,0 +1,265 @@
+/* Mode selection routines, select the least SATD cost mode for each lowres
+ * macroblock.  When measuring B slices, this includes measuring the cost of
+ * three bidir modes.  */
+
+/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
+int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres,
+                             int2 fencpos,
+                             read_only image2d_t fref0_planes,
+                             int2 qpos0,
+                             read_only image2d_t fref1_planes,
+                             int2 qpos1,
+                             int weight,
+                             local sum2_t *tmpp,
+                             int idx )
+{
+    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
+    sum2_t b0, b1, b2, b3;
+    sum2_t sum = 0;
+
+    // fencpos is full-pel position of original MB
+    // qpos0 is qpel position within reference frame 0
+    // qpos1 is qpel position within reference frame 1
+
+    int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
+    int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);
+
+    int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
+    int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
+    int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);
+
+    int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
+    int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);
+
+    int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
+    int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
+    int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
+
+    uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B;
+    uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B;
+
+    uint vA, vB;
+    uint enc, ref0, ref1;
+    uint a0, a1;
+    const int weight2 = 64 - weight;
+
+#define READ_BIDIR_DIFF( OUT, X )\
+    enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
+    vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\
+    vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\
+    ref0 = rhadd( vA, vB );\
+    vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\
+    vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\
+    ref1 = rhadd( vA, vB );\
+    OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);
+
+#define READ_DIFF_EX( OUT, a, b )\
+    READ_BIDIR_DIFF( a0, a );\
+    READ_BIDIR_DIFF( a1, b );\
+    OUT = a0 + (a1<<BITS_PER_SUM);
+
+#define ROW_8x4_SATD( a, b, c )\
+    fencpos.y += a;\
+    fref0Apos.y += b;\
+    fref0Bpos.y += b;\
+    fref1Apos.y += c;\
+    fref1Bpos.y += c;\
+    READ_DIFF_EX( b0, 0, 4 );\
+    READ_DIFF_EX( b1, 1, 5 );\
+    READ_DIFF_EX( b2, 2, 6 );\
+    READ_DIFF_EX( b3, 3, 7 );\
+    HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
+    HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
+    sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
+
+    ROW_8x4_SATD( 0, 0, 0 );
+    ROW_8x4_SATD( 4, 4, 4 );
+
+#undef READ_BIDIR_DIFF
+#undef READ_DIFF_EX
+#undef ROW_8x4_SATD
+
+    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
+}
+
+/*
+ * mode selection - pick the least cost partition type for each 8x8 macroblock.
+ * Intra, list0 or list1.  When measuring a B slice, also test three bidir
+ * possibilities.
+ *
+ * fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that
+ * hold many frames worth of motion vectors.  We must offset into the correct
+ * location for this frame's vectors:
+ *
+ *   CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
+ *   GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count]
+ *
+ * global launch dimensions for P slice estimate:  [mb_width, mb_height]
+ * global launch dimensions for B slice estimate:  [mb_width * 4, mb_height]
+ */
+kernel void mode_selection( read_only image2d_t   fenc_lowres,
+                            read_only image2d_t   fref0_planes,
+                            read_only image2d_t   fref1_planes,
+                            const global short2  *fenc_lowres_mvs0,
+                            const global short2  *fenc_lowres_mvs1,
+                            const global short2  *fref1_lowres_mvs0,
+                            const global int16_t *fenc_lowres_mv_costs0,
+                            const global int16_t *fenc_lowres_mv_costs1,
+                            const global uint16_t *fenc_intra_cost,
+                            global uint16_t      *lowres_costs,
+                            global int           *frame_stats,
+                            local int16_t        *cost_local,
+                            local sum2_t         *satd_local,
+                            int                   mb_width,
+                            int                   bipred_weight,
+                            int                   dist_scale_factor,
+                            int                   b,
+                            int                   p0,
+                            int                   p1,
+                            int                   lambda )
+{
+    int mb_x = get_global_id( 0 );
+    int b_bidir = b < p1;
+    if( b_bidir )
+    {
+        /* when mode_selection is run for B frames, it must perform BIDIR SATD
+         * measurements, so it is launched with four times as many threads in
+         * order to spread the work around more of the GPU.  And it can add
+         * padding threads in the X direction. */
+        mb_x >>= 2;
+        if( mb_x >= mb_width )
+            return;
+    }
+    int mb_y = get_global_id( 1 );
+    int mb_height = get_global_size( 1 );
+    int mb_count = mb_width * mb_height;
+    int mb_xy = mb_x + mb_y * mb_width;
+
+    /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */
+    if( mb_x < 4 && mb_y == 0 )
+        frame_stats[mb_x] = 0;
+
+    int bcost = COST_MAX;
+    int list_used = 0;
+
+    if( !b_bidir )
+    {
+        int icost = fenc_intra_cost[mb_xy];
+        COPY2_IF_LT( bcost, icost, list_used, 0 );
+    }
+    if( b != p0 )
+    {
+        int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
+        COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
+    }
+    if( b != p1 )
+    {
+        int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
+        COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
+    }
+
+    if( b_bidir )
+    {
+        int2 coord = (int2)(mb_x, mb_y) << 3;
+        int mb_i = get_global_id( 0 ) & 3;
+        int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
+        cost_local += mb_in_group * 4;
+        satd_local += mb_in_group * 16;
+
+#define TRY_BIDIR( mv0, mv1, penalty )\
+{\
+    int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
+    int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
+    cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
+    int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
+    COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
+}
+
+        /* temporal prediction */
+        short2 dmv0, dmv1;
+        short2 mvr = fref1_lowres_mvs0[mb_xy];
+        dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8;
+        dmv1 = dmv0 - mvr;
+        TRY_BIDIR( dmv0, dmv1, 0 )
+
+        if( as_uint( dmv0 ) || as_uint( dmv1 ) )
+        {
+            /* B-direct prediction */
+            dmv0 = 0; dmv1 = 0;
+            TRY_BIDIR( dmv0, dmv1, 0 );
+        }
+
+        /* L0+L1 prediction */
+        dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
+        dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
+        TRY_BIDIR( dmv0, dmv1, 5 );
+#undef TRY_BIDIR
+    }
+
+    lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
+}
+
+/*
+ * parallel sum inter costs
+ *
+ * global launch dimensions: [256, mb_height]
+ */
+kernel void sum_inter_cost( const global uint16_t *fenc_lowres_costs,
+                            const global uint16_t *inv_qscale_factor,
+                            global int           *fenc_row_satds,
+                            global int           *frame_stats,
+                            int                   mb_width,
+                            int                   bframe_bias,
+                            int                   b,
+                            int                   p0,
+                            int                   p1 )
+{
+    int y = get_global_id( 1 );
+    int mb_height = get_global_size( 1 );
+
+    int row_satds = 0;
+    int cost_est = 0;
+    int cost_est_aq = 0;
+    int intra_mbs = 0;
+
+    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
+    {
+        int mb_xy = x + y * mb_width;
+        int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
+        int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
+        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
+
+        if( list == 0 && b_frame_score_mb )
+            intra_mbs++;
+
+        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
+
+        row_satds += cost_aq;
+
+        if( b_frame_score_mb )
+        {
+            cost_est += cost;
+            cost_est_aq += cost_aq;
+        }
+    }
+
+    local int buffer[256];
+    int x = get_global_id( 0 );
+
+    row_satds   = parallel_sum( row_satds, x, buffer );
+    cost_est    = parallel_sum( cost_est, x, buffer );
+    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
+    intra_mbs   = parallel_sum( intra_mbs, x, buffer );
+
+    if( b != p1 )
+        // Use floating point math to avoid 32bit integer overflow conditions
+        cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias));
+
+    if( get_global_id( 0 ) == 0 )
+    {
+        fenc_row_satds[y] = row_satds;
+        atomic_add( frame_stats + COST_EST, cost_est );
+        atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
+        atomic_add( frame_stats + INTRA_MBS, intra_mbs );
+    }
+}
diff --git a/common/opencl/downscale.cl b/common/opencl/downscale.cl

new file mode 100644 (file)

index 0000000..f7ceeb8
--- /dev/null
+++ b/common/opencl/downscale.cl
@@ -0,0 +1,135 @@
+/*
+ * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
+ *
+ * --
+ *
+ * fenc_img is an output image (area of memory referenced through a texture
+ * cache). A read of any pixel location (x,y) returns four pixel values:
+ *
+ * val.s0 = P(x,y)
+ * val.s1 = P(x+1,y)
+ * val.s2 = P(x+2,y)
+ * val.s3 = P(x+3,y)
+ *
+ * This is a 4x replication of the lowres pixels, a trade-off between memory
+ * size and read latency.
+ *
+ * --
+ *
+ * hpel_planes is an output image that contains the four HPEL planes used for
+ * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with
+ * the four planar values C | V | H | F
+ *
+ * launch dimensions:  [lowres-width, lowres-height]
+ */
+kernel void downscale_hpel( const global pixel *fenc,
+                            write_only image2d_t fenc_img,
+                            write_only image2d_t hpel_planes,
+                            int stride )
+{
+    int x = get_global_id( 0 );
+    int y = get_global_id( 1 );
+    uint4 values;
+
+    fenc += y * stride * 2;
+    const global pixel *src1 = fenc + stride;
+    const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
+    int2 pos = (int2)(x, y);
+    pixel right, left;
+
+    right = rhadd( fenc[x*2], src1[x*2] );
+    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
+    values.s0 = rhadd( right, left );           // F
+
+    right = rhadd( fenc[2*x+1], src1[2*x+1] );
+    left  = rhadd( fenc[2*x+2], src1[2*x+2] );
+    values.s1 = rhadd( right, left );           // H
+
+    right = rhadd( src1[2*x], src2[2*x] );
+    left  = rhadd( src1[2*x+1], src2[2*x+1] );
+    values.s2 = rhadd( right, left );           // V
+
+    right = rhadd( src1[2*x+1], src2[2*x+1] );
+    left  = rhadd( src1[2*x+2], src2[2*x+2] );
+    values.s3 = rhadd( right, left );           // C
+
+    uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff);
+    write_imageui( hpel_planes, pos, val );
+
+    x = select( x, x+1, x+1 < get_global_size( 0 ) );
+    right = rhadd( fenc[x*2], src1[x*2] );
+    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
+    values.s1 = rhadd( right, left );
+
+    x = select( x, x+1, x+1 < get_global_size( 0 ) );
+    right = rhadd( fenc[x*2], src1[x*2] );
+    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
+    values.s2 = rhadd( right, left );
+
+    x = select( x, x+1, x+1 < get_global_size( 0 ) );
+    right = rhadd( fenc[x*2], src1[x*2] );
+    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
+    values.s3 = rhadd( right, left );
+
+    write_imageui( fenc_img, pos, values );
+}
+
+/*
+ * downscale lowres hierarchical motion search image, copy from one image to
+ * another decimated image.  This kernel is called iteratively to generate all
+ * of the downscales.
+ *
+ * launch dimensions:  [lower_res width, lower_res height]
+ */
+kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
+{
+    int x = get_global_id( 0 );
+    int y = get_global_id( 1 );
+    int2 pos = (int2)(x, y);
+    int gs = get_global_size( 0 );
+    uint4 top, bot, values;
+    top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
+    bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
+    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
+
+    /* these select statements appear redundant, and they should be, but tests break when
+     * they are not here.  I believe this was caused by a driver bug
+     */
+    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
+    top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
+    bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
+    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
+    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
+    write_imageui( lower_res, pos, (uint4)(values) );
+}
+
+/*
+ * Second copy of downscale kernel, no differences. This is a (no perf loss)
+ * workaround for a scheduling bug in current Tahiti drivers.  This bug has
+ * theoretically been fixed in the July 2012 driver release from AMD.
+ */
+kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
+{
+    int x = get_global_id( 0 );
+    int y = get_global_id( 1 );
+    int2 pos = (int2)(x, y);
+    int gs = get_global_size( 0 );
+    uint4 top, bot, values;
+    top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
+    bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
+    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
+
+    // see comment in above function copy
+    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
+    top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
+    bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
+    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
+    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
+    write_imageui( lower_res, pos, (uint4)(values) );
+}
+
+/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
+kernel void memset_int16( global int16_t *buf, int16_t value )
+{
+    buf[get_global_id( 0 )] = value;
+}
diff --git a/common/opencl/intra.cl b/common/opencl/intra.cl

new file mode 100644 (file)

index 0000000..d55978b
--- /dev/null
+++ b/common/opencl/intra.cl
@@ -0,0 +1,1072 @@
+/* Lookahead lowres intra analysis
+ *
+ * Each intra analysis function has been implemented twice, once for scalar GPUs
+ * (NV) and once for vectorized GPUs (AMD pre-Southern Islands).  x264 detects
+ * the GPU type and sets the -DVECTORIZE compile flag accordingly.
+ *
+ * All the intra analysis functions were based on their C versions in pixel.c
+ * and produce the exact same results.
+ */
+
+/* force all clamp arguments and return value to int, prevent ambiguous types */
+#define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) )
+
+#if VECTORIZE
+int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
+{
+    int8 a_v, d_v;
+    int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
+    int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
+
+    d_v = convert_int8( vload8( 0, data ) );
+    a_v.s01234567 = (d_v - pr0).s04152637;
+    HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
+
+    data += data_stride;
+    d_v = convert_int8( vload8( 0, data ) );
+    a_v.s01234567 = (d_v - pr1).s04152637;
+    HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
+
+    data += data_stride;
+    d_v = convert_int8( vload8( 0, data ) );
+    a_v.s01234567 = (d_v - pr2).s04152637;
+    HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
+
+    data += data_stride;
+    d_v = convert_int8( vload8( 0, data ) );
+    a_v.s01234567 = (d_v - pr3).s04152637;
+    HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
+
+    uint8 sum_v;
+
+    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
+    sum_v = abs( a_v );
+
+    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
+    sum_v += abs( a_v );
+
+    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
+    sum_v += abs( a_v );
+
+    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
+    sum_v += abs( a_v );
+
+    uint4 sum2 = sum_v.hi + sum_v.lo;
+    uint2 sum3 = sum2.hi + sum2.lo;
+    return ( sum3.hi + sum3.lo ) >> 1;
+}
+#else
+SATD_C_8x4_Q( satd_8x4_lp, const local, private )
+#endif
+
+/****************************************************************************
+ * 8x8 prediction for intra luma block
+ ****************************************************************************/
+
+#define F1            rhadd
+#define F2( a, b, c ) ( a+2*b+c+2 )>>2
+
+#if VECTORIZE
+int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
+{
+    int8 pr0, pr1, pr2, pr3;
+
+    // Upper half of pred[]
+    pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
+    pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
+    pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
+    pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
+    pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+    pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+    pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+    pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+
+    pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
+    pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
+    pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
+    pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+    pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+    pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+    pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+    pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+
+    pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
+    pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
+    pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+    pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+    pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+    pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+    pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+    pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+
+    pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
+    pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+    pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+    pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+    pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+    pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+    pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+    pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+    // Lower half of pred[]
+    pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
+    pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+    pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+    pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+    pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+    pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+    pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+    pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
+
+    pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
+    pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+    pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+    pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+    pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+    pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+    pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
+    pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
+
+    pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
+    pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+    pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+    pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+    pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+    pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
+    pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
+    pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
+
+    pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
+    pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
+    pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
+    pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
+    pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
+    pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
+    pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
+    pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
+
+    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+    int8 pr0, pr1, pr2, pr3;
+
+    // Upper half of pred[]
+    pr3.s0 = F2( left[1], left[2], left[3] );
+    pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
+    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
+    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
+    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
+    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
+    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
+    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
+    pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
+    pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
+    pr0.s7 = F2( top[5], top[6], top[7] );
+    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+    // Lower half of pred[]
+    pr3.s0 = F2( left[5], left[6], left[7] );
+    pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
+    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
+    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
+    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
+    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
+    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
+    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
+    pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
+    pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
+    pr0.s7 = F2( top[1], top[2], top[3] );
+    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+    int8 pr0, pr1, pr2, pr3;
+
+    // Upper half of pred[]
+    pr2.s0 = F2( left[1], left[0], left_top );
+    pr3.s0 = F2( left[2], left[1], left[0] );
+    pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
+    pr0.s0 = pr2.s1 = F1( left_top, top[0] );
+    pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
+    pr0.s1 = pr2.s2 = F1( top[0], top[1] );
+    pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
+    pr0.s2 = pr2.s3 = F1( top[1], top[2] );
+    pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
+    pr0.s3 = pr2.s4 = F1( top[2], top[3] );
+    pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
+    pr0.s4 = pr2.s5 = F1( top[3], top[4] );
+    pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
+    pr0.s5 = pr2.s6 = F1( top[4], top[5] );
+    pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
+    pr0.s6 = pr2.s7 = F1( top[5], top[6] );
+    pr1.s7 = F2( top[5], top[6], top[7] );
+    pr0.s7 = F1( top[6], top[7] );
+    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+    // Lower half of pred[]
+    pr2.s0 = F2( left[5], left[4], left[3] );
+    pr3.s0 = F2( left[6], left[5], left[4] );
+    pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
+    pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
+    pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
+    pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
+    pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
+    pr0.s2 = pr2.s3 = F1( left_top, top[0] );
+    pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
+    pr0.s3 = pr2.s4 = F1( top[0], top[1] );
+    pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
+    pr0.s4 = pr2.s5 = F1( top[1], top[2] );
+    pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
+    pr0.s5 = pr2.s6 = F1( top[2], top[3] );
+    pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
+    pr0.s6 = pr2.s7 = F1( top[3], top[4] );
+    pr1.s7 = F2( top[3], top[4], top[5] );
+    pr0.s7 = F1( top[4], top[5] );
+    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+#undef PRED
+}
+
+int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+    int8 pr0, pr1, pr2, pr3;
+
+    // Upper half of pred[]
+    pr0.s0 = F1( left_top, left[0] ); pr0.s1 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
+    pr0.s2 = F2( top[1], top[0], left_top ); pr0.s3 = F2( top[2], top[1], top[0] );
+    pr0.s4 = F2( top[3], top[2], top[1] ); pr0.s5 = F2( top[4], top[3], top[2] );
+    pr0.s6 = F2( top[5], top[4], top[3] ); pr0.s7 = F2( top[6], top[5], top[4] );
+
+    pr1.s0 = F1( left[0], left[1] ); pr1.s1 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
+    pr1.s2 = F1( left_top, left[0] ); pr1.s3 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
+    pr1.s4 = F2( top[1], top[0], left_top ); pr1.s5 = F2( top[2], top[1], top[0] );
+    pr1.s6 = F2( top[3], top[2], top[1] ); pr1.s7 = F2( top[4], top[3], top[2] );
+
+    pr2.s0 = F1( left[1], left[2] ); pr2.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+    pr2.s2 = F1( left[0], left[1] ); pr2.s3 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
+    pr2.s4 = F1( left_top, left[0] ); pr2.s5 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
+    pr2.s6 = F2( top[1], top[0], left_top ); pr2.s7 = F2( top[2], top[1], top[0] );
+
+    pr3.s0 = F1( left[2], left[3] ); pr3.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+    pr3.s2 = F1( left[1], left[2] ); pr3.s3 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+    pr3.s4 = F1( left[0], left[1] ); pr3.s5 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
+    pr3.s6 = F1( left_top, left[0] ); pr3.s7 = (left[0] + 2 * left_top + top[0] + 2) >> 2;
+    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+    // Lower half of pred[]
+    pr0.s0 = F1( left[3], left[4] ); pr0.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+    pr0.s2 = F1( left[2], left[3] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+    pr0.s4 = F1( left[1], left[2] ); pr0.s5 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+    pr0.s6 = F1( left[0], left[1] ); pr0.s7 = (left_top + 2 * left[0] + left[1] + 2) >> 2;
+
+    pr1.s0 = F1( left[4], left[5] ); pr1.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+    pr1.s2 = F1( left[3], left[4] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+    pr1.s4 = F1( left[2], left[3] ); pr1.s5 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+    pr1.s6 = F1( left[1], left[2] ); pr1.s7 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+
+    pr2.s0 = F1( left[5], left[6] ); pr2.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+    pr2.s2 = F1( left[4], left[5] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+    pr2.s4 = F1( left[3], left[4] ); pr2.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+    pr2.s6 = F1( left[2], left[3] ); pr2.s7 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+
+    pr3.s0 = F1( left[6], left[7] ); pr3.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+    pr3.s2 = F1( left[5], left[6] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+    pr3.s4 = F1( left[4], left[5] ); pr3.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+    pr3.s6 = F1( left[3], left[4] ); pr3.s7 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
+{
+    int8 pr0, pr1, pr2, pr3;
+
+    // Upper half of pred[]
+    pr0.s0 = F1( top[0], top[1] );
+    pr1.s0 = F2( top[0], top[1], top[2] );
+    pr2.s0 = pr0.s1 = F1( top[1], top[2] );
+    pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
+    pr2.s1 = pr0.s2 = F1( top[2], top[3] );
+    pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
+    pr2.s2 = pr0.s3 = F1( top[3], top[4] );
+    pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
+    pr2.s3 = pr0.s4 = F1( top[4], top[5] );
+    pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
+    pr2.s4 = pr0.s5 = F1( top[5], top[6] );
+    pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
+    pr2.s5 = pr0.s6 = F1( top[6], top[7] );
+    pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
+    pr2.s6 = pr0.s7 = F1( top[7], top[8] );
+    pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
+    pr2.s7 = F1( top[8], top[9] );
+    pr3.s7 = F2( top[8], top[9], top[10] );
+    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+    // Lower half of pred[]
+    pr0.s0 = F1( top[2], top[3] );
+    pr1.s0 = F2( top[2], top[3], top[4] );
+    pr2.s0 = pr0.s1 = F1( top[3], top[4] );
+    pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
+    pr2.s1 = pr0.s2 = F1( top[4], top[5] );
+    pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
+    pr2.s2 = pr0.s3 = F1( top[5], top[6] );
+    pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
+    pr2.s3 = pr0.s4 = F1( top[6], top[7] );
+    pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
+    pr2.s4 = pr0.s5 = F1( top[7], top[8] );
+    pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
+    pr2.s5 = pr0.s6 = F1( top[8], top[9] );
+    pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
+    pr2.s6 = pr0.s7 = F1( top[9], top[10] );
+    pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
+    pr2.s7 = F1( top[10], top[11] );
+    pr3.s7 = F2( top[10], top[11], top[12] );
+    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
+{
+    int8 pr0, pr1, pr2, pr3;
+
+    // Upper half of pred[]
+    pr0.s0 = F1( left[0], left[1] ); pr0.s1 = (left[0] + 2 * left[1] + left[2] + 2) >> 2;
+    pr0.s2 = F1( left[1], left[2] ); pr0.s3 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+    pr0.s4 = F1( left[2], left[3] ); pr0.s5 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+    pr0.s6 = F1( left[3], left[4] ); pr0.s7 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+
+    pr1.s0 = F1( left[1], left[2] ); pr1.s1 = (left[1] + 2 * left[2] + left[3] + 2) >> 2;
+    pr1.s2 = F1( left[2], left[3] ); pr1.s3 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+    pr1.s4 = F1( left[3], left[4] ); pr1.s5 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+    pr1.s6 = F1( left[4], left[5] ); pr1.s7 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+
+    pr2.s0 = F1( left[2], left[3] ); pr2.s1 = (left[2] + 2 * left[3] + left[4] + 2) >> 2;
+    pr2.s2 = F1( left[3], left[4] ); pr2.s3 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+    pr2.s4 = F1( left[4], left[5] ); pr2.s5 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+    pr2.s6 = F1( left[5], left[6] ); pr2.s7 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+
+    pr3.s0 = F1( left[3], left[4] ); pr3.s1 = (left[3] + 2 * left[4] + left[5] + 2) >> 2;
+    pr3.s2 = F1( left[4], left[5] ); pr3.s3 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+    pr3.s4 = F1( left[5], left[6] ); pr3.s5 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+    pr3.s6 = F1( left[6], left[7] ); pr3.s7 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+    // Lower half of pred[]
+    pr0.s0 = F1( left[4], left[5] ); pr0.s1 = (left[4] + 2 * left[5] + left[6] + 2) >> 2;
+    pr0.s2 = F1( left[5], left[6] ); pr0.s3 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+    pr0.s4 = F1( left[6], left[7] ); pr0.s5 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+    pr0.s6 = left[7]; pr0.s7 = left[7];
+
+    pr1.s0 = F1( left[5], left[6] ); pr1.s1 = (left[5] + 2 * left[6] + left[7] + 2) >> 2;
+    pr1.s2 = F1( left[6], left[7] ); pr1.s3 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+    pr1.s4 = left[7]; pr1.s5 = left[7];
+    pr1.s6 = left[7]; pr1.s7 = left[7];
+
+    pr2.s0 = F1( left[6], left[7] ); pr2.s1 = (left[6] + 2 * left[7] + left[7] + 2) >> 2;
+    pr2.s2 = left[7]; pr2.s3 = left[7];
+    pr2.s4 = left[7]; pr2.s5 = left[7];
+    pr2.s6 = left[7]; pr2.s7 = left[7];
+
+    pr3 = (int8)left[7];
+
+    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+int x264_predict_8x8c_h( const local pixel *src, int src_stride )
+{
+    const local pixel *src_l = src;
+    int8 pr0, pr1, pr2, pr3;
+
+    // Upper half of pred[]
+    pr0 = (int8)src[-1]; src += src_stride;
+    pr1 = (int8)src[-1]; src += src_stride;
+    pr2 = (int8)src[-1]; src += src_stride;
+    pr3 = (int8)src[-1]; src += src_stride;
+    int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
+
+    //Lower half of pred[]
+    pr0 = (int8)src[-1]; src += src_stride;
+    pr1 = (int8)src[-1]; src += src_stride;
+    pr2 = (int8)src[-1]; src += src_stride;
+    pr3 = (int8)src[-1];
+    return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+int x264_predict_8x8c_v( const local pixel *src, int src_stride )
+{
+    int8 pred = convert_int8( vload8( 0, &src[-src_stride] ));
+    return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
+           satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
+}
+
+int x264_predict_8x8c_p( const local pixel *src, int src_stride )
+{
+    int H = 0, V = 0;
+    for( int i = 0; i < 4; i++ )
+    {
+        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
+        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
+    }
+
+    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
+    int b = (17 * H + 16) >> 5;
+    int c = (17 * V + 16) >> 5;
+    int i00 = a - 3 * b - 3 * c + 16;
+
+    // Upper half of pred[]
+    int pix = i00;
+    int8 pr0, pr1, pr2, pr3;
+    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+    pix = i00;
+    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+    pix = i00;
+    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+    pix = i00;
+    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
+
+    //Lower half of pred[]
+    pix = i00;
+    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+    pix = i00;
+    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+    pix = i00;
+    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+
+    pix = i00;
+    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
+    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
+    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
+}
+
+int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
+{
+    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+    for( int i = 0; i < 4; i++ )
+    {
+        s0 += src[i - src_stride];
+        s1 += src[i + 4 - src_stride];
+        s2 += src[-1 + i * src_stride];
+        s3 += src[-1 + (i+4)*src_stride];
+    }
+
+    // Upper half of pred[]
+    int8 dc0;
+    dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
+    dc0.hi = (int4)( (s1 + 2) >> 2 );
+    int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
+
+    // Lower half of pred[]
+    dc0.lo = (int4)( (s3 + 2) >> 2 );
+    dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
+    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
+}
+
+#else  /* not vectorized: private is cheap registers are scarce */
+
+int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
+{
+    private pixel pred[32];
+
+    // Upper half of pred[]
+    for( int y = 0; y < 4; y++ )
+    {
+        for( int x = 0; x < 8; x++ )
+        {
+            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
+            pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
+        }
+    }
+    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    //Lower half of pred[]
+    for( int y = 4; y < 8; y++ )
+    {
+        for( int x = 0; x < 8; x++ )
+        {
+            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
+            pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
+        }
+    }
+    pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+}
+
+int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+    private pixel pred[32];
+#define PRED( x, y ) pred[(x) + (y)*8]
+    // Upper half of pred[]
+    PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
+    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
+    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
+    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
+    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
+    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
+    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
+    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
+    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
+    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
+    PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
+    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
+
+    // Lower half of pred[]
+    PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
+    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
+    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
+    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
+    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
+    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
+    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
+    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
+    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
+    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
+    PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+#undef PRED
+}
+
+int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+    private pixel pred[32];
+#define PRED( x, y ) pred[(x) + (y)*8]
+    // Upper half of pred[]
+    PRED( 0, 2 ) = F2( left[1], left[0], left_top );
+    PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
+    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
+    PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
+    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
+    PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
+    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
+    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
+    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
+    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
+    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
+    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
+    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
+    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
+    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
+    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
+    PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
+    PRED( 7, 0 ) = F1( top[6], top[7] );
+    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
+
+    //Lower half of pred[]
+    PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
+    PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
+    PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
+    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
+    PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
+    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
+    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
+    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
+    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
+    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
+    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
+    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
+    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
+    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
+    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
+    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
+    PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
+    PRED( 7, 0 ) = F1( top[4], top[5] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+#undef PRED
+}
+
+inline uint32_t pack16to32( uint32_t a, uint32_t b )
+{
+    return a + (b << 16);
+}
+
+inline uint32_t pack8to16( uint32_t a, uint32_t b )
+{
+    return a + (b << 8);
+}
+
+int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
+{
+    private pixel pred[32];
+    int satd;
+    int p1 =  pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
+    int p2 =  pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
+    int p3 =  pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
+    int p4 =  pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
+    int p5 =  pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
+    int p6 =  pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
+    int p7 =  pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
+    int p8 =  pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
+    int p9 =  pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
+    int p10 =  pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
+    int p11 =  pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
+    // Upper half of pred[]
+    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
+    vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
+    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
+    vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
+    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
+    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
+    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
+    satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    // Lower half of pred[]
+    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
+    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
+    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
+    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
+    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
+    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
+    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
+    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+}
+
+int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
+{
+    private pixel pred[32];
+    int satd;
+#define PRED( x, y ) pred[(x) + (y)*8]
+    // Upper half of pred[]
+    PRED( 0, 0 ) = F1( top[0], top[1] );
+    PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
+    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
+    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
+    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
+    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
+    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
+    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
+    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
+    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
+    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
+    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
+    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
+    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
+    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
+    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
+    PRED( 7, 2 ) = F1( top[8], top[9] );
+    PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
+    satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    // Lower half of pred[]
+    PRED( 0, 0 ) = F1( top[2], top[3] );
+    PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
+    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
+    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
+    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
+    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
+    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
+    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
+    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
+    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
+    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
+    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
+    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
+    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
+    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
+    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
+    PRED( 7, 2 ) = F1( top[10], top[11] );
+    PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+#undef PRED
+}
+
+int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
+{
+    private pixel pred[32];
+    int satd;
+    int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
+    int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
+    int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
+    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
+    int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
+    int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
+    int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
+    int p8 = pack8to16( left[7], left[7] );
+    // Upper half of pred[]
+    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
+    satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    // Lower half of pred[]
+    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
+    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+}
+
+int x264_predict_8x8c_h( const local pixel *src, int src_stride )
+{
+    private pixel pred[32];
+    const local pixel *src_l = src;
+
+    // Upper half of pred[]
+    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
+    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
+    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
+    vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
+    int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
+
+    // Lower half of pred[]
+    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
+    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
+    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
+    vstore8( (uchar8)(src[-1]), 3, pred );
+    return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
+}
+
+int x264_predict_8x8c_v( const local pixel *src, int src_stride )
+{
+    private pixel pred[32];
+    uchar16 v16;
+    v16.lo = vload8( 0, &src[-src_stride] );
+    v16.hi = vload8( 0, &src[-src_stride] );
+
+    vstore16( v16, 0, pred );
+    vstore16( v16, 1, pred );
+
+    return satd_8x4_lp( src, src_stride, pred, 8 ) +
+           satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
+}
+
+int x264_predict_8x8c_p( const local pixel *src, int src_stride )
+{
+    int H = 0, V = 0;
+    private pixel pred[32];
+    int satd;
+
+    for( int i = 0; i < 4; i++ )
+    {
+        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
+        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
+    }
+
+    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
+    int b = (17 * H + 16) >> 5;
+    int c = (17 * V + 16) >> 5;
+    int i00 = a - 3 * b - 3 * c + 16;
+
+    // Upper half of pred[]
+    for( int y = 0; y < 4; y++ )
+    {
+        int pix = i00;
+        for( int x = 0; x < 8; x++ )
+        {
+            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
+            pix += b;
+        }
+        i00 += c;
+    }
+    satd = satd_8x4_lp( src, src_stride, pred, 8 );
+    // Lower half of pred[]
+    for( int y = 0; y < 4; y++ )
+    {
+        int pix = i00;
+        for( int x = 0; x < 8; x++ )
+        {
+            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
+            pix += b;
+        }
+        i00 += c;
+    }
+    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+    return satd;
+}
+
+int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
+{
+    private pixel pred[32];
+    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
+    for( int i = 0; i < 4; i++ )
+    {
+        s0 += src[i - src_stride];
+        s1 += src[i + 4 - src_stride];
+        s2 += src[-1 + i * src_stride];
+        s3 += src[-1 + (i+4)*src_stride];
+    }
+
+    // Upper half of pred[]
+    uchar8 dc0;
+    dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
+    dc0.hi = (uchar4)( (s1 + 2) >> 2 );
+    vstore8( dc0, 0, pred );
+    vstore8( dc0, 1, pred );
+    vstore8( dc0, 2, pred );
+    vstore8( dc0, 3, pred );
+    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
+
+    // Lower half of pred[]
+    dc0.lo = (uchar4)( (s3 + 2) >> 2 );
+    dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
+    vstore8( dc0, 0, pred );
+    vstore8( dc0, 1, pred );
+    vstore8( dc0, 2, pred );
+    vstore8( dc0, 3, pred );
+    return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
+}
+#endif
+
+/* Find the least cost intra mode for 32 8x8 macroblocks per workgroup
+ *
+ * Loads 33 macroblocks plus the pixels directly above them into local memory,
+ * padding where necessary with edge pixels.  It then cooperatively calculates
+ * smoothed top and left pixels for use in some of the analysis.
+ *
+ * Then groups of 32 threads each calculate a single intra mode for each 8x8
+ * block.  Since consecutive threads are calculating the same intra mode there
+ * is no code-path divergence.  8 intra costs are calculated simultaneously.  If
+ * the "slow" argument is not zero, the final two (least likely) intra modes are
+ * tested in a second pass.  The slow mode is only enabled for presets slow,
+ * slower, and placebo.
+ *
+ * This allows all of the pixels functions to read pixels from local memory, and
+ * avoids re-fetching edge pixels from global memory.  And it allows us to
+ * calculate all of the intra mode costs simultaneously without branch divergence.
+ *
+ * Local dimension:    [ 32, 8 ]
+ * Global dimensions:  [ paddedWidth, height ] */
+kernel void mb_intra_cost_satd_8x8( read_only image2d_t  fenc,
+                                    global uint16_t     *fenc_intra_cost,
+                                    global int          *frame_stats,
+                                    int                  lambda,
+                                    int                  mb_width,
+                                    int                  slow )
+{
+#define CACHE_STRIDE 265
+#define BLOCK_OFFSET 266
+    local pixel cache[2385];
+    local int cost_buf[32];
+    local pixel top[32 * 16];
+    local pixel left[32 * 8];
+    local pixel left_top[32];
+
+    int lx = get_local_id( 0 );
+    int ly = get_local_id( 1 );
+    int gx = get_global_id( 0 );
+    int gy = get_global_id( 1 );
+    int gidx = get_group_id( 0 );
+    int gidy = get_group_id( 1 );
+    int linear_id = ly * get_local_size( 0 ) + lx;
+    int satd = COST_MAX;
+    int basex = gidx << 8;
+    int basey = (gidy << 3) - 1;
+
+    /* Load 33 8x8 macroblocks and the pixels above them into local cache */
+    for( int y = 0; y < 9 && linear_id < (33<<3)>>2; y++ )
+    {
+        int x = linear_id << 2;
+        uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
+        cache[y * CACHE_STRIDE + 1 + x] = data.s0;
+        cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
+        cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
+        cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
+    }
+    /* load pixels on left edge */
+    if( linear_id < 9 )
+        cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;
+
+    barrier( CLK_LOCAL_MEM_FENCE );
+
+    // Cooperatively build the top edge for the macroblock using lowpass filter
+    int j = ly;
+    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
+                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
+                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
+    j += 8;
+    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
+                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
+                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
+    // Cooperatively build the left edge for the macroblock using lowpass filter
+    left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
+                        2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
+                        cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2;
+    // One left_top per macroblock
+    if( 0 == ly )
+    {
+        left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
+                         cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
+        cost_buf[lx] = COST_MAX;
+    }
+    barrier( CLK_LOCAL_MEM_FENCE );
+
+    // each warp/wavefront generates a different prediction type; no divergence
+    switch( ly )
+    {
+        case 0:
+            satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
+            break;
+        case 1:
+            satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
+            break;
+        case 2:
+            satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
+            break;
+        case 3:
+            satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
+            break;
+        case 4:
+            satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
+            break;
+        case 5:
+            satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
+            break;
+        case 6:
+            satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
+            break;
+        case 7:
+            satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
+            break;
+        default:
+            break;
+    }
+    atom_min( &cost_buf[lx], satd );
+    if( slow )
+    {
+        // Do the remaining two (least likely) prediction modes
+        switch( ly )
+        {
+            case 0: // DDL
+                satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
+                atom_min( &cost_buf[lx], satd );
+                break;
+            case 1: // VL
+                satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
+                atom_min( &cost_buf[lx], satd );
+                break;
+            default:
+                break;
+        }
+    }
+    barrier( CLK_LOCAL_MEM_FENCE );
+
+    if( (0 == ly) && (gx < mb_width) )
+        fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx]+ 5*lambda;
+
+    // initialize the frame_stats[2] buffer for kernel sum_intra_cost().
+    if( gx < 2 && gy == 0 )
+        frame_stats[gx] = 0;
+#undef CACHE_STRIDE
+#undef BLOCK_OFFSET
+}
+
+/*
+ * parallel sum intra costs
+ *
+ * global launch dimensions: [256, mb_height]
+ */
+kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost,
+                            const global uint16_t *inv_qscale_factor,
+                            global int           *fenc_row_satds,
+                            global int           *frame_stats,
+                            int                   mb_width )
+{
+    int y = get_global_id( 1 );
+    int mb_height = get_global_size( 1 );
+
+    int row_satds = 0;
+    int cost_est = 0;
+    int cost_est_aq = 0;
+
+    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ))
+    {
+        int mb_xy = x + y * mb_width;
+        int cost = fenc_intra_cost[mb_xy];
+        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
+        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
+
+        row_satds += cost_aq;
+        if( b_frame_score_mb )
+        {
+            cost_est += cost;
+            cost_est_aq += cost_aq;
+        }
+    }
+
+    local int buffer[256];
+    int x = get_global_id( 0 );
+
+    row_satds   = parallel_sum( row_satds, x, buffer );
+    cost_est    = parallel_sum( cost_est, x, buffer );
+    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
+
+    if( get_global_id( 0 ) == 0 )
+    {
+        fenc_row_satds[y] = row_satds;
+        atomic_add( frame_stats + COST_EST,    cost_est );
+        atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
+    }
+}
diff --git a/common/opencl/motionsearch.cl b/common/opencl/motionsearch.cl

new file mode 100644 (file)

index 0000000..77a07ce
--- /dev/null
+++ b/common/opencl/motionsearch.cl
@@ -0,0 +1,249 @@
+/* Hierarchical (iterative) OpenCL lowres motion search */
+
+inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
+{
+    /* edge macroblocks might not have a direct descendant, use nearest */
+    x = select( x >> 1, (x - (mb_width&1)) >> 1, x == mb_width-1 );
+    y = select( y >> 1, (y - (mb_height&1)) >> 1, y == mb_height-1 );
+    return (mb_width>>1) * y + x;
+}
+
+/* Four threads calculate an 8x8 SAD.  Each does two rows */
+int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
+{
+    frefpos.y += idx << 1;
+    fencpos.y += idx << 1;
+    int cost = 0;
+    if( frefpos.x < 0 )
+    {
+        /* slow path when MV goes past left edge.  The GPU clamps reads from
+         * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really
+         * want are [0, 0, 1, 2]
+         */
+        for( int y = 0; y < 2; y++ )
+        {
+            for( int x = 0; x < 8; x++ )
+            {
+                pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
+                pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
+                cost += abs_diff( enc, ref );
+            }
+        }
+    }
+    else
+    {
+        uint4 enc, ref, costs = 0;
+        enc = read_imageui( fenc, sampler, fencpos );
+        ref = read_imageui( fref, sampler, frefpos );
+        costs += abs_diff( enc, ref );
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
+        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
+        costs += abs_diff( enc, ref );
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
+        ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
+        costs += abs_diff( enc, ref );
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
+        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
+        costs += abs_diff( enc, ref );
+        cost = costs.s0 + costs.s1 + costs.s2 + costs.s3;
+    }
+    costs[idx] = cost;
+    return costs[0] + costs[1] + costs[2] + costs[3];
+}
+
+/* One thread performs 8x8 SAD */
+int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
+{
+    if( frefpos.x < 0 )
+    {
+        /* slow path when MV goes past left edge */
+        int cost = 0;
+        for( int y = 0; y < 8; y++ )
+        {
+            for( int x = 0; x < 8; x++ )
+            {
+                uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
+                uint ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
+                cost += abs_diff( enc, ref );
+            }
+        }
+        return cost;
+    }
+    else
+    {
+        uint4 enc, ref, cost = 0;
+        for( int y = 0; y < 8; y++ )
+        {
+            for( int x = 0; x < 8; x += 4 )
+            {
+                enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
+                ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) );
+                cost += abs_diff( enc, ref );
+            }
+        }
+        return cost.s0 + cost.s1 + cost.s2 + cost.s3;
+    }
+}
+/*
+ * hierarchical motion estimation
+ *
+ * Each kernel launch is a single iteration
+ *
+ * MB per work group is determined by lclx / 4 * lcly
+ *
+ * global launch dimensions:  [mb_width * 4, mb_height]
+ */
+kernel void hierarchical_motion( read_only image2d_t  fenc,
+                                 read_only image2d_t  fref,
+                                 const global short2 *in_mvs,
+                                 global short2       *out_mvs,
+                                 global int16_t      *out_mv_costs,
+                                 global short2       *mvp_buffer,
+                                 local int16_t       *cost_local,
+                                 local short2        *mvc_local,
+                                 int                  mb_width,
+                                 int                  lambda,
+                                 int                  me_range,
+                                 int                  scale,
+                                 int                  b_shift_index,
+                                 int                  b_first_iteration,
+                                 int                  b_reverse_references )
+{
+    int mb_x = get_global_id( 0 ) >> 2;
+    if( mb_x >= mb_width )
+        return;
+    int mb_height = get_global_size( 1 );
+    int mb_i = get_global_id( 0 ) & 3;
+    int mb_y = get_global_id( 1 );
+    int mb_xy = mb_y * mb_width + mb_x;
+    const int mb_size = 8;
+    int2 coord = (int2)(mb_x, mb_y) * mb_size;
+
+    const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
+    cost_local += 4 * mb_in_group;
+
+    int i_mvc = 0;
+    mvc_local += 4 * mb_in_group;
+    mvc_local[mb_i] = 0;
+    int2 mvp =0;
+
+    if( !b_first_iteration )
+    {
+#define MVC( DX, DY )\
+    {\
+        int px = mb_x + DX;\
+        int py = mb_y + DY;\
+        mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \
+                                           in_mvs[mb_width * py + px];\
+        mvc_local[i_mvc] >>= (short) scale;\
+        i_mvc++;\
+    }
+        /* Find MVP from median of MVCs */
+        if( b_reverse_references )
+        {
+            /* odd iterations: derive MVP from down and right */
+            if( mb_x < mb_width - 1 )
+                MVC( 1, 0 );
+            if( mb_y < mb_height - 1 )
+            {
+                MVC( 0, 1 );
+                if( mb_x > b_shift_index )
+                    MVC( -1, 1 );
+                if( mb_x < mb_width - 1 )
+                    MVC( 1, 1 );
+            }
+        }
+        else
+        {
+            /* even iterations: derive MVP from up and left */
+            if( mb_x > 0 )
+                MVC( -1, 0 );
+            if( mb_y > 0 )
+            {
+                MVC( 0, -1 );
+                if( mb_x < mb_width - 1 )
+                    MVC( 1, -1 );
+                if( mb_x > b_shift_index )
+                    MVC( -1, -1 );
+            }
+        }
+#undef MVC
+        mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
+    }
+    /* current mvp matches the previous mvp and we have not changed scale.  We know
+     * we're going to arrive at the same MV again, so just copy the previous
+     * result to our output. */
+    if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y )
+    {
+        out_mvs[mb_xy] = in_mvs[mb_xy];
+        return;
+    }
+    mvp_buffer[mb_xy] = convert_short2_sat(mvp);
+    int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4;
+    int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4;
+
+    int2 bestmv = clamp(mvp, mv_min, mv_max);
+    int2 refcrd = coord + bestmv;
+
+    /* measure cost at bestmv */
+    int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
+                lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) );
+
+    do
+    {
+        /* measure costs at offsets from bestmv */
+        refcrd = coord + bestmv + dia_offs[mb_i];
+        int2 trymv = bestmv + dia_offs[mb_i];
+        int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
+                   lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );
+
+        cost_local[mb_i] = (cost<<2) | mb_i;
+        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );
+
+        if( (cost >> 2) >= bcost )
+            break;
+
+        bestmv += dia_offs[cost&3];
+        bcost = cost>>2;
+
+        if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
+            break;
+    }
+    while( --me_range > 0 );
+
+    int2 trymv = 0, diff = 0;
+
+#define COST_MV_NO_PAD( L )\
+    trymv = clamp( trymv, mv_min, mv_max );\
+    diff = convert_int2_sat(abs_diff( mvp, trymv ));\
+    if( diff.x > 1 || diff.y > 1 ) {\
+        int2 refcrd = coord + trymv;\
+        int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
+                   L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\
+        if( cost < bcost ) { bcost = cost; bestmv = trymv; } }
+
+    COST_MV_NO_PAD( 0 );
+
+    if( !b_first_iteration )
+    {
+        /* try cost at previous iteration's MV, if MVP was too far away */
+        int2 prevmv = b_shift_index ? convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]);
+        prevmv >>= scale;
+        trymv = prevmv;
+        COST_MV_NO_PAD( lambda );
+    }
+
+    for( int i = 0; i < i_mvc; i++ )
+    {
+        /* try cost at each candidate MV, if MVP was too far away */
+        trymv = convert_int2_sat( mvc_local[i] );
+        COST_MV_NO_PAD( lambda );
+    }
+
+    if( mb_i == 0 )
+    {
+        bestmv <<= scale;
+        out_mvs[mb_xy] = convert_short2_sat(bestmv);
+        out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
+    }
+}
diff --git a/common/opencl/subpel.cl b/common/opencl/subpel.cl

new file mode 100644 (file)

index 0000000..8c7216c
--- /dev/null
+++ b/common/opencl/subpel.cl
@@ -0,0 +1,242 @@
+/* OpenCL lowres subpel Refine */
+
+/* Each thread performs 8x8 SAD.  4 threads per MB, so the 4 DIA HPEL offsets are
+ * calculated simultaneously */
+int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
+{
+    int2 frefpos = qpos >> 2;
+    int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
+    uint mask_shift = 8 * hpel_idx;
+
+    uint4 cost4 = 0;
+
+    for( int y = 0; y < 8; y++ )
+    {
+        uint4 enc, val4;
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, y));
+        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(0, y)).s0 >> mask_shift) & 0xFF;
+        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(1, y)).s0 >> mask_shift) & 0xFF;
+        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(2, y)).s0 >> mask_shift) & 0xFF;
+        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(3, y)).s0 >> mask_shift) & 0xFF;
+        cost4 += abs_diff( enc, val4 );
+
+        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, y));
+        val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(4, y)).s0 >> mask_shift) & 0xFF;
+        val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(5, y)).s0 >> mask_shift) & 0xFF;
+        val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(6, y)).s0 >> mask_shift) & 0xFF;
+        val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(7, y)).s0 >> mask_shift) & 0xFF;
+        cost4 += abs_diff( enc, val4 );
+    }
+
+    return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
+}
+
+/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane */
+int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
+{
+    int2 frefApos = qpos >> 2;
+    int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);
+
+    int2 qposB = qpos + ((qpos & 1) << 1);
+    int2 frefBpos = qposB >> 2;
+    int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);
+
+    uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
+
+    int cost = 0;
+
+    for( int y = 0; y < 8; y++ )
+    {
+        for( int x = 0; x < 8; x++ )
+        {
+            uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
+            uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF;
+            uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF;
+            cost += abs_diff( enc, rhadd( vA, vB ) );
+        }
+    }
+
+    return cost;
+}
+
+/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
+ *
+ * Each thread collects 1/4 of the rows of diffs and processes one quarter of
+ * the transforms
+ */
+int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc,
+                            int2 fencpos,
+                            read_only image2d_t fref_planes,
+                            int2 qpos,
+                            local sum2_t *tmpp,
+                            int idx )
+{
+    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
+    sum2_t b0, b1, b2, b3;
+
+    // fencpos is full-pel position of original MB
+    // qpos is qpel position within reference frame
+    int2 frefApos = qpos >> 2;
+    int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);
+
+    int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
+    int2 frefBpos = qposB >> 2;
+    int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);
+
+    uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;
+
+    uint vA, vB;
+    uint a0, a1;
+    uint enc;
+    sum2_t sum = 0;
+
+#define READ_DIFF( OUT, X )\
+    enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
+    vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\
+    vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\
+    OUT = enc - rhadd( vA, vB );
+
+#define READ_DIFF_EX( OUT, a, b )\
+    {\
+        READ_DIFF( a0, a );\
+        READ_DIFF( a1, b );\
+        OUT = a0 + (a1<<BITS_PER_SUM);\
+    }
+#define ROW_8x4_SATD( a, b )\
+    {\
+        fencpos.y += a;\
+        frefApos.y += b;\
+        frefBpos.y += b;\
+        READ_DIFF_EX( b0, 0, 4 );\
+        READ_DIFF_EX( b1, 1, 5 );\
+        READ_DIFF_EX( b2, 2, 6 );\
+        READ_DIFF_EX( b3, 3, 7 );\
+        HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
+        HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
+        sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
+    }
+    ROW_8x4_SATD( 0, 0 );
+    ROW_8x4_SATD( 4, 4 );
+
+#undef READ_DIFF
+#undef READ_DIFF_EX
+#undef ROW_8x4_SATD
+    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
+}
+
+constant int2 hpoffs[4] =
+{
+    {0, -2}, {-2, 0}, {2, 0}, {0, 2}
+};
+
+/* sub pixel refinement of motion vectors, output MVs and costs are moved from
+ * temporary buffers into final per-frame buffer
+ *
+ * global launch dimensions:  [mb_width * 4, mb_height]
+ *
+ * With X being the source 16x16 pixels, F is the lowres pixel used by the
+ * motion search.  We will now utilize the H V and C pixels (stored in separate
+ * planes) to search at half-pel increments.
+ *
+ * X X X X X X
+ *  F H F H F
+ * X X X X X X
+ *  V C V C V
+ * X X X X X X
+ *  F H F H F
+ * X X X X X X
+ *
+ * The YX HPEL bits of the motion vector selects the plane we search in.  The
+ * four planes are packed in the fref_planes 2D image buffer.  Each sample
+ * returns:  s0 = F, s1 = H, s2 = V, s3 = C */
+kernel void subpel_refine( read_only image2d_t   fenc,
+                           read_only image2d_t   fref_planes,
+                           const global short2  *in_mvs,
+                           const global int16_t *in_sad_mv_costs,
+                           local int16_t        *cost_local,
+                           local sum2_t         *satd_local,
+                           local short2         *mvc_local,
+                           global short2        *fenc_lowres_mv,
+                           global int16_t       *fenc_lowres_mv_costs,
+                           int                   mb_width,
+                           int                   lambda,
+                           int                   b,
+                           int                   ref,
+                           int                   b_islist1 )
+{
+    int mb_x = get_global_id( 0 ) >> 2;
+    if( mb_x >= mb_width )
+        return;
+    int mb_height = get_global_size( 1 );
+
+    int mb_i = get_global_id( 0 ) & 3;
+    int mb_y = get_global_id( 1 );
+    int mb_xy = mb_y * mb_width + mb_x;
+
+    /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that
+     * hold many frames worth of motion vectors.  We must offset into the correct
+     * location for this frame's vectors.  The kernel will be passed the correct
+     * directional buffer for the direction of the search: list1 or list0
+     *
+     *   CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
+     *   GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */
+    fenc_lowres_mv +=       (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
+    fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
+
+    /* Adjust pointers into local memory buffers for this thread's data */
+    int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
+    cost_local += mb_in_group * 4;
+    satd_local += mb_in_group * 16;
+    mvc_local += mb_in_group * 4;
+
+    int i_mvc = 0;
+
+    mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0;
+
+#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
+    if( mb_x > 0 )
+        MVC( -1, 0 );
+    if( mb_y > 0 )
+    {
+        MVC( 0, -1 );
+        if( mb_x < mb_width - 1 )
+            MVC( 1, -1 );
+        if( mb_x > 0 )
+            MVC( -1, -1 );
+    }
+#undef MVC
+    int2 mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
+
+    int bcost =  in_sad_mv_costs[mb_xy];
+    int2 coord = (int2)(mb_x, mb_y) << 3;
+    int2 bmv = convert_int2_sat( in_mvs[mb_xy] );
+
+    /* Make mvp and bmv QPEL MV */
+    mvp <<= 2; bmv <<= 2;
+
+#define HPEL_QPEL( ARR, FUNC )\
+    {\
+        int2 trymv = bmv + ARR[mb_i];\
+        int2 qpos = (coord << 2) + trymv;\
+        int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\
+        cost_local[mb_i] = (cost<<2) + mb_i;\
+        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
+        if( (cost>>2) < bcost )\
+        {\
+            bmv += ARR[cost&3];\
+            bcost = cost>>2;\
+        }\
+    }
+
+    HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
+    HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
+    fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv );
+
+    /* remeasure cost of bmv using SATD */
+    int2 qpos = (coord << 2) + bmv;
+    cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
+    bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
+    bcost += lambda * mv_cost( abs_diff( bmv, mvp ) );
+
+    fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
+}
diff --git a/common/opencl/weightp.cl b/common/opencl/weightp.cl

new file mode 100644 (file)

index 0000000..1524cce
--- /dev/null
+++ b/common/opencl/weightp.cl
@@ -0,0 +1,48 @@
+/* Weightp filter a downscaled image into a temporary output buffer.
+ * This kernel is launched once for each scale.
+ *
+ * Launch dimensions: width x height (in pixels)
+ */
+kernel void weightp_scaled_images( read_only image2d_t in_plane,
+                                   write_only image2d_t out_plane,
+                                   uint offset,
+                                   uint scale,
+                                   uint denom )
+{
+    int gx = get_global_id( 0 );
+    int gy = get_global_id( 1 );
+    uint4 input_val;
+    uint4 output_val;
+
+    input_val = read_imageui( in_plane, sampler, (int2)(gx, gy));
+    output_val = (uint4)(offset) + ( ( ((uint4)(scale)) * input_val ) >> ((uint4)(denom)) );
+    write_imageui( out_plane, (int2)(gx, gy), output_val );
+}
+
+/* Weightp filter for the half-pel interpolated image
+ *
+ * Launch dimensions: width x height (in pixels)
+ */
+kernel void weightp_hpel( read_only image2d_t in_plane,
+                          write_only image2d_t out_plane,
+                          uint offset,
+                          uint scale,
+                          uint denom )
+{
+    int gx = get_global_id( 0 );
+    int gy = get_global_id( 1 );
+    uint input_val;
+    uint output_val;
+
+    input_val = read_imageui( in_plane, sampler, (int2)(gx, gy)).s0;
+    //Unpack
+    uint4 temp;
+    temp.s0 = input_val & 0x00ff; temp.s1 = (input_val >> 8) & 0x00ff;
+    temp.s2 = (input_val >> 16) & 0x00ff; temp.s3 = (input_val >> 24) & 0x00ff;
+
+    temp = (uint4)(offset) + ( ( ((uint4)(scale)) * temp ) >> ((uint4)(denom)) );
+
+    //Pack
+    output_val = temp.s0 | (temp.s1 << 8) | (temp.s2 << 16) | (temp.s3 << 24);
+    write_imageui( out_plane, (int2)(gx, gy), output_val );
+}
diff --git a/common/opencl/x264-cl.h b/common/opencl/x264-cl.h

new file mode 100644 (file)

index 0000000..892904d
--- /dev/null
+++ b/common/opencl/x264-cl.h
@@ -0,0 +1,132 @@
+#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
+
+constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
+
+/* 7.18.1.1  Exact-width integer types */
+typedef signed char int8_t;
+typedef unsigned char   uint8_t;
+typedef short  int16_t;
+typedef unsigned short  uint16_t;
+typedef int  int32_t;
+typedef unsigned   uint32_t;
+
+typedef uint8_t  pixel;
+typedef uint16_t sum_t;
+typedef uint32_t sum2_t;
+
+#define LOWRES_COST_MASK ((1<<14)-1)
+#define LOWRES_COST_SHIFT 14
+#define COST_MAX (1<<28)
+
+#define PIXEL_MAX 255
+#define BITS_PER_SUM (8 * sizeof(sum_t))
+
+/* Constants for offsets into frame statistics buffer */
+#define COST_EST    0
+#define COST_EST_AQ 1
+#define INTRA_MBS   2
+
+#define COPY2_IF_LT( x, y, a, b )\
+    if((y)<(x))\
+    {\
+        (x) = (y);\
+        (a) = (b);\
+    }
+
+constant int2 dia_offs[4] =
+{
+    {0, -1}, {-1, 0}, {1, 0}, {0, 1},
+};
+
+inline pixel x264_clip_pixel( int x )
+{
+    return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX );
+}
+
+inline int2 x264_median_mv( short2 a, short2 b, short2 c )
+{
+    short2 t1 = min(a, b);
+    short2 t2 = min(max(a, b), c);
+    return convert_int2(max(t1, t2));
+}
+
+inline sum2_t abs2( sum2_t a )
+{
+    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
+    return (a + s) ^ s;
+}
+
+#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
+    sum2_t t0 = s0 + s1;\
+    sum2_t t1 = s0 - s1;\
+    sum2_t t2 = s2 + s3;\
+    sum2_t t3 = s2 - s3;\
+    d0 = t0 + t2;\
+    d2 = t0 - t2;\
+    d1 = t1 + t3;\
+    d3 = t1 - t3;\
+}
+
+#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
+    int2 t0 = s0 + s1;\
+    int2 t1 = s0 - s1;\
+    int2 t2 = s2 + s3;\
+    int2 t3 = s2 - s3;\
+    d0 = t0 + t2;\
+    d2 = t0 - t2;\
+    d1 = t1 + t3;\
+    d3 = t1 - t3;\
+}
+
+#define SATD_C_8x4_Q( name, q1, q2 )\
+    int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
+    {\
+        sum2_t tmp[4][4];\
+        sum2_t a0, a1, a2, a3;\
+        sum2_t sum = 0;\
+        for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
+        {\
+            a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
+            a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
+            a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
+            a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
+            HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
+        }\
+        for( int i = 0; i < 4; i++ )\
+        {\
+            HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
+            sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
+        }\
+        return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
+    }
+
+/*
+ * Utility function to perform a parallel sum reduction of an array of integers
+ */
+int parallel_sum( int value, int x, volatile local int *array )
+{
+    array[x] = value;
+    barrier( CLK_LOCAL_MEM_FENCE );
+
+    int dim = get_local_size( 0 );
+
+    while( dim > 1 )
+    {
+        dim >>= 1;
+
+        if( x < dim )
+            array[x] += array[x + dim];
+
+        if( dim > 32 )
+            barrier( CLK_LOCAL_MEM_FENCE );
+    }
+
+    return array[0];
+}
+
+int mv_cost( uint2 mvd )
+{
+    float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f;
+    float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) );
+    return (int) (cost.x + cost.y);
+}
diff --git a/configure b/configure

index f9072b2391077f4b72fd02c34068abbe31ea9f3b..7fea42154e905d4157109ae52c7695a14bbefc5d 100755 (executable)
--- a/configure
+++ b/configure
@@ -25,6 +25,7 @@ Configuration options:
    --system-libx264         use system libx264 instead of internal
    --enable-shared          build shared library
    --enable-static          build static library
+  --disable-opencl         disable OpenCL features
    --disable-gpl            disable GPL-only features
    --disable-thread         disable multithreaded encoding
    --enable-win32thread     use win32threads (windows only)
@@ -273,6 +274,7 @@ vis="no"
  bit_depth="8"
  chroma_format="all"
  compiler="GNU"
+opencl="yes"
  
  CFLAGS="$CFLAGS -Wall -I. -I\$(SRCPATH)"
  LDFLAGS="$LDFLAGS"
@@ -381,6 +383,9 @@ for opt do
          --host=*)
              host="$optarg"
              ;;
+        --disable-opencl)
+            opencl="no"
+            ;;
          --cross-prefix=*)
              cross_prefix="$optarg"
              ;;
@@ -998,6 +1003,7 @@ fi
  if [ "$bit_depth" -gt "8" ]; then
      define HIGH_BIT_DEPTH
      ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=1"
+    opencl="no"
  else
      ASFLAGS="$ASFLAGS -DHIGH_BIT_DEPTH=0"
  fi
@@ -1105,6 +1111,60 @@ PROF_USE_CC=$PROF_USE_CC
  PROF_USE_LD=$PROF_USE_LD
  EOF
  
+if [[ $host_os != mingw* ]]; then
+    # OpenCL support is only well tested on Windows/MinGW.  If you
+    # wish to try it on an unsupported platform, swap the lines
+    # below.  If OpenCL breaks, you get to keep both halves
+    #opencl="yes"
+    opencl="no"
+fi
+if [ "$opencl" = "yes" ]; then
+    log_check "looking for perl"
+    output=$(perl -v)
+    if [ "$output" = "" ]; then
+        echo 'OpenCL support requires perl to compile.'
+        echo 'use --disable-opencl to compile without OpenCL.'
+        exit 1
+    elif [[ $cross_prefix != ""  &&  $host_os == mingw* ]] ; then
+        if cc_check "CL/cl.h" "-lOpenCL"; then
+            echo 'HAVE_OPENCL=yes' >> config.mak
+            echo 'OPENCL_LIB=OpenCL' >> config.mak
+            echo "OPENCL_INC_DIR=." >> config.mak
+            echo "OPENCL_LIB_DIR=." >> config.mak
+            define HAVE_OPENCL
+        else
+            opencl="no"
+        fi
+    elif [ "$CUDA_PATH" != "" ]; then
+        echo 'HAVE_OPENCL=yes' >> config.mak
+        echo 'OPENCL_LIB=OpenCL' >> config.mak
+        echo 'OPENCL_INC_DIR=$(CUDA_PATH)include' >> config.mak
+        if [ "$ARCH" = "X86" ]; then
+            echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/Win32' >> config.mak
+        else
+            echo 'OPENCL_LIB_DIR=$(CUDA_PATH)lib/x64' >> config.mak
+        fi
+        define HAVE_OPENCL
+    elif [ -e "$AMDAPPSDKROOT/include/CL/cl.h" ]; then
+        if [[ $host_os = mingw* ]]; then
+            app_path=`echo "/$AMDAPPSDKROOT" | sed 's/\\\/\//g' | sed 's/://'`
+        else
+            app_path='$(AMDAPPSDKROOT)'
+       fi
+        echo 'HAVE_OPENCL=yes' >> config.mak
+        echo 'OPENCL_LIB=OpenCL' >> config.mak
+        echo OPENCL_INC_DIR=$app_path/include >> config.mak
+        if [ "$ARCH" = "X86" ]; then
+            echo OPENCL_LIB_DIR=$app_path/lib/x86 >> config.mak
+        else
+            echo OPENCL_LIB_DIR=$app_path/lib/x86_64 >> config.mak
+        fi
+        define HAVE_OPENCL
+    else
+        opencl="no"
+    fi
+fi
+
  if [ $compiler = ICL ]; then
      echo '%.o: %.c' >> config.mak
      echo '     $(CC) $(CFLAGS) -c -Fo$@ $<' >> config.mak
@@ -1214,6 +1274,7 @@ PIC:           $pic
  visualize:     $vis
  bit depth:     $bit_depth
  chroma format: $chroma_format
+opencl:        $opencl
  EOF
  
  echo >> config.log
diff --git a/encoder/encoder.c b/encoder/encoder.c

index 5a94209ce8bc3b75ec9e18596d43e6179b534a0f..f9f411bb212ec0c2d7903884e604e5ee4c8eace4 100644 (file)
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -37,6 +37,10 @@
  #include "common/visualize.h"
  #endif
  
+#if HAVE_OPENCL
+#include "common/opencl.h"
+#endif
+
  //#define DEBUG_MB_TYPE
  
  #define bs_write_ue bs_write_ue_big
@@ -541,6 +545,28 @@ static int x264_validate_parameters( x264_t *h, int b_open )
      if( h->i_thread_frames > 1 )
          h->param.nalu_process = NULL;
  
+    if( h->param.b_opencl )
+    {
+#if !HAVE_OPENCL
+        x264_log( h, X264_LOG_WARNING, "OpenCL: not compiled with OpenCL support, disabling\n" );
+        h->param.b_opencl = 0;
+#elif BIT_DEPTH > 8
+        x264_log( h, X264_LOG_WARNING, "OpenCL lookahead does not support high bit depth, disabling opencl\n" );
+        h->param.b_opencl = 0;
+#else
+        if( h->param.i_width < 32 || h->param.i_height < 32 )
+        {
+            x264_log( h, X264_LOG_WARNING, "OpenCL: frame size is too small, disabling opencl\n" );
+            h->param.b_opencl = 0;
+        }
+#endif
+        if( h->param.opencl_device_id && h->param.i_opencl_device )
+        {
+            x264_log( h, X264_LOG_WARNING, "OpenCL: device id and device skip count configured; dropping skip\n" );
+            h->param.i_opencl_device = 0;
+        }
+    }
+
      h->param.i_keyint_max = x264_clip3( h->param.i_keyint_max, 1, X264_KEYINT_MAX_INFINITE );
      if( h->param.i_keyint_max == 1 )
      {
@@ -1042,6 +1068,7 @@ static int x264_validate_parameters( x264_t *h, int b_open )
      BOOLIFY( b_open_gop );
      BOOLIFY( b_bluray_compat );
      BOOLIFY( b_full_recon );
+    BOOLIFY( b_opencl );
      BOOLIFY( analyse.b_transform_8x8 );
      BOOLIFY( analyse.b_weighted_bipred );
      BOOLIFY( analyse.b_chroma_me );
@@ -1400,6 +1427,11 @@ x264_t *x264_encoder_open( x264_param_t *param )
              goto fail;
      }
  
+#if HAVE_OPENCL
+    if( h->param.b_opencl && x264_opencl_init( h ) < 0 )
+        h->param.b_opencl = 0;
+#endif
+
      if( x264_lookahead_init( h, i_slicetype_length ) )
          goto fail;
  
@@ -2862,6 +2894,11 @@ int     x264_encoder_encode( x264_t *h,
      int i_nal_type, i_nal_ref_idc, i_global_qp;
      int overhead = NALU_OVERHEAD;
  
+#if HAVE_OPENCL
+    if( h->opencl.b_fatal_error )
+        return -1;
+#endif
+
      if( h->i_thread_frames > 1 )
      {
          thread_prev    = h->thread[ h->i_thread_phase ];
@@ -3608,6 +3645,10 @@ void    x264_encoder_close  ( x264_t *h )
                     || h->stat.i_mb_count[SLICE_TYPE_P][I_PCM]
                     || h->stat.i_mb_count[SLICE_TYPE_B][I_PCM];
  
+#if HAVE_OPENCL
+    x264_opencl_free( h );
+#endif
+
      x264_lookahead_delete( h );
  
      if( h->param.b_sliced_threads )
diff --git a/encoder/slicetype-cl.c b/encoder/slicetype-cl.c

new file mode 100644 (file)

index 0000000..1991060
--- /dev/null
+++ b/encoder/slicetype-cl.c
@@ -0,0 +1,766 @@
+/*****************************************************************************
+ * slicetype-cl.c: OpenCL slicetype decision code (lowres lookahead)
+ *****************************************************************************
+ * Copyright (C) 2012-2013 x264 project
+ *
+ * Authors: Steve Borho <sborho@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing@x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "macroblock.h"
+#include "me.h"
+
+#if HAVE_OPENCL
+#if _WIN32
+#include <windows.h>
+#endif
+
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
+
+/* We define CL_QUEUE_THREAD_HANDLE_AMD here because it is not defined
+ * in the OpenCL headers shipped with NVIDIA drivers.  We need to be
+ * able to compile on an NVIDIA machine and run optimally on an AMD GPU. */
+#define CL_QUEUE_THREAD_HANDLE_AMD 0x403E
+
+#define OCLCHECK( method, ... )\
+    status = method( __VA_ARGS__ );\
+    if( status != CL_SUCCESS ) {\
+        h->param.b_opencl = 0;\
+        h->opencl.b_fatal_error = 1;\
+        x264_log( h, X264_LOG_ERROR, # method " error '%d'\n", status );\
+        return status;\
+    }
+
+void x264_opencl_flush( x264_t *h )
+{
+    clFinish( h->opencl.queue );
+
+    /* Finish copies from the GPU by copying from the page-locked buffer to
+     * their final destination */
+    for( int i = 0; i < h->opencl.num_copies; i++ )
+        memcpy( h->opencl.copies[i].dest, h->opencl.copies[i].src, h->opencl.copies[i].bytes );
+    h->opencl.num_copies = 0;
+    h->opencl.pl_occupancy = 0;
+}
+
+static void *x264_opencl_alloc_locked( x264_t *h, int bytes )
+{
+    if( h->opencl.pl_occupancy + bytes >= PAGE_LOCKED_BUF_SIZE )
+        x264_opencl_flush( h );
+    assert( bytes < PAGE_LOCKED_BUF_SIZE );
+    char *ptr = h->opencl.page_locked_ptr + h->opencl.pl_occupancy;
+    h->opencl.pl_occupancy += bytes;
+    return ptr;
+}
+
+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda )
+{
+    if( fenc->b_intra_calculated )
+        return 0;
+    fenc->b_intra_calculated = 1;
+
+    int luma_length = fenc->i_stride[0] * fenc->i_lines[0];
+
+#define CREATEBUF( out, flags, size )\
+    out = clCreateBuffer( h->opencl.context, (flags), (size), NULL, &status );\
+    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateBuffer error '%d'\n", status ); return -1; }
+#define CREATEIMAGE( out, flags, pf, width, height )\
+    out = clCreateImage2D( h->opencl.context, (flags), &pf, width, height, 0, NULL, &status );\
+    if( status != CL_SUCCESS ) { h->param.b_opencl = 0; x264_log( h, X264_LOG_ERROR, "clCreateImage2D error '%d'\n", status ); return -1; }
+
+    int mb_count = h->mb.i_mb_count;
+    cl_int status;
+
+    if( !h->opencl.lowres_mv_costs )
+    {
+        /* Allocate shared memory buffers */
+        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
+        int height = h->mb.i_mb_height * 8 * sizeof(pixel);
+
+        cl_image_format pixel_format;
+        pixel_format.image_channel_order = CL_R;
+        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
+        CREATEIMAGE( h->opencl.weighted_luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
+
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+        {
+            pixel_format.image_channel_order = CL_RGBA;
+            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
+            CREATEIMAGE( h->opencl.weighted_scaled_images[i], CL_MEM_READ_WRITE, pixel_format, width, height );
+            width >>= 1;
+            height >>= 1;
+        }
+
+        CREATEBUF( h->opencl.lowres_mv_costs,     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
+        CREATEBUF( h->opencl.lowres_costs[0],     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
+        CREATEBUF( h->opencl.lowres_costs[1],     CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) );
+        CREATEBUF( h->opencl.mv_buffers[0],       CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
+        CREATEBUF( h->opencl.mv_buffers[1],       CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
+        CREATEBUF( h->opencl.mvp_buffer,          CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * 2 );
+        CREATEBUF( h->opencl.frame_stats[0],      CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
+        CREATEBUF( h->opencl.frame_stats[1],      CL_MEM_WRITE_ONLY, 4 * sizeof(int) );
+        CREATEBUF( h->opencl.row_satds[0],        CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
+        CREATEBUF( h->opencl.row_satds[1],        CL_MEM_WRITE_ONLY, h->mb.i_mb_height * sizeof(int) );
+        CREATEBUF( h->opencl.luma_16x16_image[0], CL_MEM_READ_ONLY,  luma_length );
+        CREATEBUF( h->opencl.luma_16x16_image[1], CL_MEM_READ_ONLY,  luma_length );
+    }
+
+    if( !fenc->opencl.intra_cost )
+    {
+        /* Allocate per-frame buffers */
+        int width = h->mb.i_mb_width * 8 * sizeof(pixel);
+        int height = h->mb.i_mb_height * 8 * sizeof(pixel);
+
+        cl_image_format pixel_format;
+        pixel_format.image_channel_order = CL_R;
+        pixel_format.image_channel_data_type = CL_UNSIGNED_INT32;
+        CREATEIMAGE( fenc->opencl.luma_hpel, CL_MEM_READ_WRITE, pixel_format, width, height );
+
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+        {
+            pixel_format.image_channel_order = CL_RGBA;
+            pixel_format.image_channel_data_type = CL_UNSIGNED_INT8;
+            CREATEIMAGE( fenc->opencl.scaled_image2Ds[i], CL_MEM_READ_WRITE, pixel_format, width, height );
+            width >>= 1;
+            height >>= 1;
+        }
+        CREATEBUF( fenc->opencl.inv_qscale_factor, CL_MEM_READ_ONLY,  mb_count * sizeof(int16_t) );
+        CREATEBUF( fenc->opencl.intra_cost,        CL_MEM_WRITE_ONLY, mb_count * sizeof(int16_t) );
+        CREATEBUF( fenc->opencl.lowres_mvs0,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
+        CREATEBUF( fenc->opencl.lowres_mvs1,       CL_MEM_READ_WRITE, mb_count * 2 * sizeof(int16_t) * (h->param.i_bframe + 1) );
+        CREATEBUF( fenc->opencl.lowres_mv_costs0,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
+        CREATEBUF( fenc->opencl.lowres_mv_costs1,  CL_MEM_READ_WRITE, mb_count * sizeof(int16_t) * (h->param.i_bframe + 1) );
+    }
+#undef CREATEBUF
+
+    /* Copy image to the GPU, downscale to unpadded 8x8, then continue for all scales */
+
+    char *locked = x264_opencl_alloc_locked( h, luma_length );
+    memcpy( locked, fenc->plane[0], luma_length );
+    OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue,  h->opencl.luma_16x16_image[h->opencl.last_buf], CL_FALSE, 0, luma_length, locked, 0, NULL, NULL );
+
+    size_t gdim[2];
+    if( h->param.rc.i_aq_mode && fenc->i_inv_qscale_factor )
+    {
+        int size = h->mb.i_mb_count * sizeof(int16_t);
+        locked = x264_opencl_alloc_locked( h, size );
+        memcpy( locked, fenc->i_inv_qscale_factor, size );
+        OCLCHECK( clEnqueueWriteBuffer, h->opencl.queue, fenc->opencl.inv_qscale_factor, CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    }
+    else
+    {
+        /* Fill fenc->opencl.inv_qscale_factor with NOP (256) */
+        cl_uint arg = 0;
+        int16_t value = 256;
+        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
+        OCLCHECK( clSetKernelArg, h->opencl.memset_kernel, arg++, sizeof(int16_t), &value );
+        gdim[0] = h->mb.i_mb_count;
+        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.memset_kernel, 1, NULL, gdim, NULL, 0, NULL, NULL );
+    }
+
+    int stride = fenc->i_stride[0];
+    cl_uint arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.luma_16x16_image[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(cl_mem), &fenc->opencl.luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.downscale_hpel_kernel, arg++, sizeof(int), &stride );
+    gdim[0] = 8 * h->mb.i_mb_width;
+    gdim[1] = 8 * h->mb.i_mb_height;
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.downscale_hpel_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
+
+    for( int i = 0; i < NUM_IMAGE_SCALES - 1; i++ )
+    {
+        /* Workaround for AMD Southern Island:
+         *
+         * Alternate kernel instances.  No perf impact to this, so we do it for
+         * all GPUs.  It prevents the same kernel from being enqueued
+         * back-to-back, avoiding a dependency calculation bug in the driver.
+         */
+        cl_kernel kern = i & 1 ? h->opencl.downscale_kernel1 : h->opencl.downscale_kernel2;
+
+        arg = 0;
+        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i] );
+        OCLCHECK( clSetKernelArg, kern, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[i+1] );
+        gdim[0] >>= 1;
+        gdim[1] >>= 1;
+        if( gdim[0] < 16 || gdim[1] < 16 )
+            break;
+        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, kern, 2, NULL, gdim, NULL, 0, NULL, NULL );
+    }
+
+    size_t ldim[2];
+    gdim[0] = ((h->mb.i_mb_width + 31)>>5)<<5;
+    gdim[1] = 8*h->mb.i_mb_height;
+    ldim[0] = 32;
+    ldim[1] = 8;
+    arg = 0;
+
+    /* For presets slow, slower, and placebo, check all 10 intra modes that the
+     * C lookahead supports.  For faster presets, only check the most frequent 8
+     * modes
+     */
+    int slow = h->param.analyse.i_subpel_refine > 7;
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &lambda );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.intra_kernel, arg++, sizeof(int), &slow );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
+
+    gdim[0] = 256;
+    gdim[1] = h->mb.i_mb_height;
+    ldim[0] = 256;
+    ldim[1] = 1;
+    arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_intra_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_intra_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
+
+    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
+        x264_opencl_flush( h );
+
+    int size = h->mb.i_mb_count * sizeof(int16_t);
+    locked = x264_opencl_alloc_locked( h, size );
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.intra_cost, CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[0][0];
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    h->opencl.num_copies++;
+
+    size = h->mb.i_mb_height * sizeof(int);
+    locked = x264_opencl_alloc_locked( h, size );
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[0][0];
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    h->opencl.num_copies++;
+
+    size = sizeof(int) * 4;
+    locked = x264_opencl_alloc_locked( h, size );
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[0][0];
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[0][0];
+    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+
+    h->opencl.last_buf = !h->opencl.last_buf;
+    return 0;
+}
+
+/* This function was tested emprically on a number of AMD and NV GPUs.  Making a
+ * function which returns perfect launch dimensions is impossible; some
+ * applications will have self-tuning code to try many possible variables and
+ * measure the runtime.  Here we simply make an educated guess based on what we
+ * know GPUs typically prefer.  */
+static void x264_optimal_launch_dims( size_t *gdims, size_t *ldims, const cl_kernel kernel, const cl_device_id device )
+{
+    size_t max_work_group = 256;    /* reasonable defaults for OpenCL 1.0 devices, below APIs may fail */
+    size_t preferred_multiple = 64;
+    cl_uint num_cus = 6;
+
+    clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_work_group, NULL );
+    clGetKernelWorkGroupInfo( kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &preferred_multiple, NULL );
+    clGetDeviceInfo( device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &num_cus, NULL );
+
+    ldims[0] = preferred_multiple;
+    ldims[1] = 8;
+
+    /* make ldims[1] an even divisor of gdims[1] */
+    while( gdims[1] & (ldims[1] - 1) )
+    {
+        ldims[0] <<= 1;
+        ldims[1] >>= 1;
+    }
+    /* make total ldims fit under the max work-group dimensions for the device */
+    while( ldims[0] * ldims[1] > max_work_group )
+    {
+        if( (ldims[0] <= preferred_multiple) && (ldims[1] > 1) )
+            ldims[1] >>= 1;
+        else
+            ldims[0] >>= 1;
+    }
+
+    if( ldims[0] > gdims[0] )
+    {
+        /* remove preferred multiples until we're close to gdims[0] */
+        while( gdims[0] + preferred_multiple < ldims[0] )
+            ldims[0] -= preferred_multiple;
+        gdims[0] = ldims[0];
+    }
+    else
+    {
+        /* make gdims an even multiple of ldims */
+        gdims[0] = (gdims[0]+ldims[0]-1)/ldims[0];
+        gdims[0] *= ldims[0];
+    }
+
+    /* make ldims smaller to spread work across compute units */
+    while( (gdims[0]/ldims[0]) * (gdims[1]/ldims[1]) * 2 <= num_cus )
+    {
+        if( ldims[0] > preferred_multiple )
+            ldims[0] >>= 1;
+        else if( ldims[1] > 1 )
+            ldims[1] >>= 1;
+        else
+            break;
+    }
+    /* for smaller GPUs, try not to abuse their texture cache */
+    if( num_cus == 6 && ldims[0] == 64 && ldims[1] == 4 )
+        ldims[0] = 32;
+}
+
+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w )
+{
+    x264_frame_t *fenc = frames[b];
+    x264_frame_t *fref = frames[ref];
+
+    cl_mem ref_scaled_images[NUM_IMAGE_SCALES];
+    cl_mem ref_luma_hpel;
+    cl_int status;
+
+    if( w && w->weightfn )
+    {
+        size_t gdims[2];
+
+        gdims[0] = 8 * h->mb.i_mb_width;
+        gdims[1] = 8 * h->mb.i_mb_height;
+
+        /* WeightP: Perform a filter on fref->opencl.scaled_image2Ds[] and fref->opencl.luma_hpel */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+        {
+            cl_uint arg = 0;
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &fref->opencl.scaled_image2Ds[i] );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_scaled_images[i] );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_offset );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_scale );
+            OCLCHECK( clSetKernelArg, h->opencl.weightp_scaled_images_kernel, arg++, sizeof(int32_t), &w->i_denom );
+            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_scaled_images_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
+
+            gdims[0] >>= 1;
+            gdims[1] >>= 1;
+            if( gdims[0] < 16 || gdims[1] < 16 )
+                break;
+        }
+
+        cl_uint arg = 0;
+        gdims[0] = 8 * h->mb.i_mb_width;
+        gdims[1] = 8 * h->mb.i_mb_height;
+
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &fref->opencl.luma_hpel );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(cl_mem), &h->opencl.weighted_luma_hpel );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_offset );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_scale );
+        OCLCHECK( clSetKernelArg, h->opencl.weightp_hpel_kernel, arg++, sizeof(int32_t), &w->i_denom );
+        OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.weightp_hpel_kernel, 2, NULL, gdims, NULL, 0, NULL, NULL );
+
+        /* Use weighted reference planes for motion search */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+            ref_scaled_images[i] = h->opencl.weighted_scaled_images[i];
+        ref_luma_hpel = h->opencl.weighted_luma_hpel;
+    }
+    else
+    {
+        /* Use unweighted reference planes for motion search */
+        for( int i = 0; i < NUM_IMAGE_SCALES; i++ )
+            ref_scaled_images[i] = fref->opencl.scaled_image2Ds[i];
+        ref_luma_hpel = fref->opencl.luma_hpel;
+    }
+
+    const int num_iterations[NUM_IMAGE_SCALES] = { 1, 1, 2, 3 };
+    int b_first_iteration = 1;
+    int b_reverse_references = 1;
+    int A = 1;
+
+
+    int mb_per_group = 0;
+    int cost_local_size = 0;
+    int mvc_local_size = 0;
+    int mb_width;
+
+    size_t gdims[2];
+    size_t ldims[2];
+
+    /* scale 0 is 8x8 */
+    for( int scale = NUM_IMAGE_SCALES-1; scale >= 0; scale-- )
+    {
+        mb_width = h->mb.i_mb_width >> scale;
+        gdims[0] = mb_width;
+        gdims[1] = h->mb.i_mb_height >> scale;
+        if( gdims[0] < 2 || gdims[1] < 2 )
+            continue;
+        gdims[0] <<= 2;
+        x264_optimal_launch_dims( gdims, ldims, h->opencl.hme_kernel, h->opencl.device );
+
+        mb_per_group = (ldims[0] >> 2) * ldims[1];
+        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
+        mvc_local_size = 4 * mb_per_group * sizeof(int16_t) * 2;
+        int scaled_me_range = h->param.analyse.i_me_range >> scale;
+        int b_shift_index = 1;
+
+        cl_uint arg = 0;
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[scale] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &ref_scaled_images[scale] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(cl_mem), (void*)&h->opencl.mvp_buffer );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, cost_local_size, NULL );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, mvc_local_size, NULL );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &mb_width );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &lambda );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scaled_me_range );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &scale );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_shift_index );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_first_iteration );
+        OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg++, sizeof(int), &b_reverse_references );
+
+        for( int iter = 0; iter < num_iterations[scale]; iter++ )
+        {
+            OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.hme_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
+
+            b_shift_index = 0;
+            b_first_iteration = 0;
+
+            /* alternate top-left vs bot-right MB references at lower scales, so
+             * motion field smooths more quickly.  */
+            if( scale > 2 )
+                b_reverse_references ^= 1;
+            else
+                b_reverse_references = 0;
+            A = !A;
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 2, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, 3, sizeof(cl_mem), &h->opencl.mv_buffers[!A] );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 3, sizeof(int), &b_shift_index );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 2, sizeof(int), &b_first_iteration );
+            OCLCHECK( clSetKernelArg, h->opencl.hme_kernel, arg - 1, sizeof(int), &b_reverse_references );
+        }
+    }
+
+    int satd_local_size = mb_per_group * sizeof(uint32_t) * 16;
+    cl_uint arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &ref_luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.mv_buffers[A] );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_mv_costs );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, cost_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, satd_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, mvc_local_size, NULL );
+
+    if( b_islist1 )
+    {
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
+    }
+    else
+    {
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
+        OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
+    }
+
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &lambda );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &ref );
+    OCLCHECK( clSetKernelArg, h->opencl.subpel_refine_kernel, arg++, sizeof(int), &b_islist1 );
+
+    if( h->opencl.b_device_AMD_SI )
+    {
+        /* workaround for AMD Southern Island driver scheduling bug (fixed in
+         * July 2012), perform meaningless small copy to add a data dependency */
+        OCLCHECK( clEnqueueCopyBuffer, h->opencl.queue, h->opencl.mv_buffers[A], h->opencl.mv_buffers[!A], 0, 0, 20, 0, NULL, NULL );
+    }
+
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.subpel_refine_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
+
+    int mvlen = 2 * sizeof(int16_t) * h->mb.i_mb_count;
+
+    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 1 )
+        x264_opencl_flush( h );
+
+    char *locked = x264_opencl_alloc_locked( h, mvlen );
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].bytes = mvlen;
+
+    if( b_islist1 )
+    {
+        int mvs_offset = mvlen * (ref - b - 1);
+        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs1, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
+        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[1][ref - b - 1];
+    }
+    else
+    {
+        int mvs_offset = mvlen * (b - ref - 1);
+        OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, fenc->opencl.lowres_mvs0, CL_FALSE, mvs_offset, mvlen, locked, 0, NULL, NULL );
+        h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_mvs[0][b - ref - 1];
+    }
+
+    h->opencl.num_copies++;
+
+    return 0;
+}
+
+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor )
+{
+    cl_int status;
+    x264_frame_t *fenc = frames[b];
+    x264_frame_t *fref0 = frames[p0];
+    x264_frame_t *fref1 = frames[p1];
+
+    int bipred_weight = h->param.analyse.b_weighted_bipred ? 64 - (dist_scale_factor >> 2) : 32;
+
+    /* Tasks for this kernel:
+     * 1. Select least cost mode (intra, ref0, ref1)
+     *    list_used 0, 1, 2, or 3.  if B frame, do not allow intra
+     * 2. if B frame, try bidir predictions.
+     * 3. lowres_costs[i_mb_xy] = X264_MIN( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT); */
+    size_t gdims[2] = { h->mb.i_mb_width, h->mb.i_mb_height };
+    size_t ldim_bidir[2];
+    size_t *ldims = NULL;
+    int cost_local_size = 4;
+    int satd_local_size = 4;
+    if( b < p1 )
+    {
+        /* For B frames, use 4 threads per MB for BIDIR checks */
+        ldims = ldim_bidir;
+        gdims[0] <<= 2;
+        x264_optimal_launch_dims( gdims, ldims, h->opencl.mode_select_kernel, h->opencl.device );
+        int mb_per_group = (ldims[0] >> 2) * ldims[1];
+        cost_local_size = 4 * mb_per_group * sizeof(int16_t);
+        satd_local_size = 16 * mb_per_group * sizeof(uint32_t);
+    }
+
+    cl_uint arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.scaled_image2Ds[0] );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref0->opencl.luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.luma_hpel );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mvs1 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fref1->opencl.lowres_mvs0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.lowres_mv_costs1 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &fenc->opencl.intra_cost );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, cost_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, satd_local_size, NULL );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &bipred_weight );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &dist_scale_factor );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &b );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p0 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &p1 );
+    OCLCHECK( clSetKernelArg, h->opencl.mode_select_kernel, arg++, sizeof(int), &lambda );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.mode_select_kernel, 2, NULL, gdims, ldims, 0, NULL, NULL );
+
+    /* Sum costs across rows, atomicAdd down frame */
+    size_t gdim[2] = { 256, h->mb.i_mb_height };
+    size_t ldim[2] = { 256, 1 };
+
+    arg = 0;
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.lowres_costs[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &fenc->opencl.inv_qscale_factor );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.row_satds[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(cl_mem), &h->opencl.frame_stats[h->opencl.last_buf] );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->mb.i_mb_width );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &h->param.i_bframe_bias );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &b );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p0 );
+    OCLCHECK( clSetKernelArg, h->opencl.rowsum_inter_kernel, arg++, sizeof(int), &p1 );
+    OCLCHECK( clEnqueueNDRangeKernel, h->opencl.queue, h->opencl.rowsum_inter_kernel, 2, NULL, gdim, ldim, 0, NULL, NULL );
+
+    if( h->opencl.num_copies >= MAX_FINISH_COPIES - 4 )
+        x264_opencl_flush( h );
+
+    int size =  h->mb.i_mb_count * sizeof(int16_t);
+    char *locked = x264_opencl_alloc_locked( h, size );
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->lowres_costs[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.lowres_costs[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.num_copies++;
+
+    size =  h->mb.i_mb_height * sizeof(int);
+    locked = x264_opencl_alloc_locked( h, size );
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].dest = fenc->i_row_satds[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = size;
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.row_satds[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.num_copies++;
+
+    size =  4 * sizeof(int);
+    locked = x264_opencl_alloc_locked( h, size );
+    OCLCHECK( clEnqueueReadBuffer, h->opencl.queue, h->opencl.frame_stats[h->opencl.last_buf], CL_FALSE, 0, size, locked, 0, NULL, NULL );
+    h->opencl.last_buf = !h->opencl.last_buf;
+
+    h->opencl.copies[h->opencl.num_copies].src = locked;
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+    h->opencl.copies[h->opencl.num_copies].src = locked + sizeof(int);
+    h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_cost_est_aq[b - p0][p1 - b];
+    h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+    h->opencl.num_copies++;
+
+    if( b == p1 ) // P frames only
+    {
+        h->opencl.copies[h->opencl.num_copies].src = locked + 2 * sizeof(int);
+        h->opencl.copies[h->opencl.num_copies].dest = &fenc->i_intra_mbs[b - p0];
+        h->opencl.copies[h->opencl.num_copies].bytes = sizeof(int);
+        h->opencl.num_copies++;
+    }
+    return 0;
+}
+
+void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda )
+{
+    if( h->param.b_opencl )
+    {
+#if _WIN32
+        /* Temporarily boost priority of this lookahead thread and the OpenCL
+         * driver's thread until the end of this function.  On AMD GPUs this
+         * greatly reduces the latency of enqueuing kernels and getting results
+         * on Windows. */
+        HANDLE id = GetCurrentThread();
+        h->opencl.lookahead_thread_pri = GetThreadPriority( id );
+        SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
+        cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
+        if( status == CL_SUCCESS )
+        {
+            h->opencl.opencl_thread_pri = GetThreadPriority( id );
+            SetThreadPriority( id, THREAD_PRIORITY_ABOVE_NORMAL );
+        }
+#endif
+
+        /* precalculate intra and I frames */
+        for( int i = 0; i <= num_frames; i++ )
+            x264_opencl_lowres_init( h, frames[i], lambda );
+        x264_opencl_flush( h );
+
+        if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS && h->param.i_bframe )
+        {
+            /* For trellis B-Adapt, precompute exhaustive motion searches */
+            for( int b = 0; b <= num_frames; b++ )
+            {
+                for( int j = 1; j < h->param.i_bframe; j++ )
+                {
+                    int p0 = b - j;
+                    if( p0 >= 0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF )
+                    {
+                        const x264_weight_t *w = x264_weight_none;
+
+                        if( h->param.analyse.i_weighted_pred )
+                        {
+                            x264_emms();
+                            x264_weights_analyse( h, frames[b], frames[p0], 1 );
+                            w = frames[b]->weight[0];
+                        }
+                        frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+                        x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
+                    }
+                    int p1 = b + j;
+                    if( p1 <= num_frames && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF )
+                    {
+                        frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
+                        x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
+                    }
+                }
+            }
+
+            x264_opencl_flush( h );
+        }
+    }
+}
+
+
+void x264_opencl_slicetype_end( x264_t *h )
+{
+#if _WIN32
+    if( h->param.b_opencl )
+    {
+        HANDLE id = GetCurrentThread();
+        SetThreadPriority( id, h->opencl.lookahead_thread_pri );
+        cl_int status = clGetCommandQueueInfo( h->opencl.queue, CL_QUEUE_THREAD_HANDLE_AMD, sizeof(HANDLE), &id, NULL );
+        if( status == CL_SUCCESS )
+            SetThreadPriority( id, h->opencl.opencl_thread_pri );
+    }
+#endif
+}
+
+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b )
+{
+    if( (frames[b]->i_cost_est[b-p0][p1-b] >= 0) || (b == p0 && b == p1) )
+        return 0;
+    else
+    {
+        int do_search[2];
+        int dist_scale_factor = 128;
+        const x264_weight_t *w = x264_weight_none;
+
+        // avoid duplicating work
+        frames[b]->i_cost_est[b-p0][p1-b] = 0;
+
+        do_search[0] = b != p0 && frames[b]->lowres_mvs[0][b-p0-1][0][0] == 0x7FFF;
+        do_search[1] = b != p1 && frames[b]->lowres_mvs[1][p1-b-1][0][0] == 0x7FFF;
+        if( do_search[0] )
+        {
+            if( h->param.analyse.i_weighted_pred && b == p1 )
+            {
+                x264_emms();
+                x264_weights_analyse( h, frames[b], frames[p0], 1 );
+                w = frames[b]->weight[0];
+            }
+            frames[b]->lowres_mvs[0][b-p0-1][0][0] = 0;
+        }
+        if( do_search[1] )
+            frames[b]->lowres_mvs[1][p1-b-1][0][0] = 0;
+        if( b == p1 )
+            frames[b]->i_intra_mbs[b-p0] = 0;
+        if( p1 != p0 )
+            dist_scale_factor = ( ((b-p0) << 8) + ((p1-p0) >> 1) ) / (p1-p0);
+
+        frames[b]->i_cost_est[b-p0][p1-b] = 0;
+        frames[b]->i_cost_est_aq[b-p0][p1-b] = 0;
+
+        x264_opencl_lowres_init( h, frames[b], lambda );
+
+        if( do_search[0] )
+        {
+            x264_opencl_lowres_init( h, frames[p0], lambda );
+            x264_opencl_motionsearch( h, frames, b, p0, 0, lambda, w );
+        }
+        if( do_search[1] )
+        {
+            x264_opencl_lowres_init( h, frames[p1], lambda );
+            x264_opencl_motionsearch( h, frames, b, p1, 1, lambda, NULL );
+        }
+        x264_opencl_finalize_cost( h, lambda, frames, p0, p1, b, dist_scale_factor );
+        return 1;
+    }
+}
+
+#endif
diff --git a/encoder/slicetype.c b/encoder/slicetype.c

index a0643dc646de52d56f6bde570cdb1d98af9b0c2c..e300bbf353bfd477998cc68aa15384fc94956a51 100644 (file)
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -36,6 +36,18 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
                                        x264_frame_t **frames, int p0, int p1, int b,
                                        int b_intra_penalty );
  
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead );
+
+#if HAVE_OPENCL
+int x264_opencl_lowres_init( x264_t *h, x264_frame_t *fenc, int lambda );
+int x264_opencl_motionsearch( x264_t *h, x264_frame_t **frames, int b, int ref, int b_islist1, int lambda, const x264_weight_t *w );
+int x264_opencl_finalize_cost( x264_t *h, int lambda, x264_frame_t **frames, int p0, int p1, int b, int dist_scale_factor );
+int x264_opencl_precalculate_frame_cost( x264_t *h, x264_frame_t **frames, int lambda, int p0, int p1, int b );
+void x264_opencl_flush( x264_t *h );
+void x264_opencl_slicetype_prep( x264_t *h, x264_frame_t **frames, int num_frames, int lambda );
+void x264_opencl_slicetype_end( x264_t *h );
+#endif
+
  static void x264_lowres_context_init( x264_t *h, x264_mb_analysis_t *a )
  {
      a->i_qp = X264_LOOKAHEAD_QP;
@@ -276,7 +288,7 @@ static NOINLINE unsigned int x264_weight_cost_chroma444( x264_t *h, x264_frame_t
      return cost;
  }
  
-static void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
+void x264_weights_analyse( x264_t *h, x264_frame_t *fenc, x264_frame_t *ref, int b_lookahead )
  {
      int i_delta_index = fenc->i_frame - ref->i_frame - 1;
      /* epsilon is chosen to require at least a numerator of 127 (with denominator = 128) */
@@ -856,96 +868,120 @@ static int x264_slicetype_frame_cost( x264_t *h, x264_mb_analysis_t *a,
          output_inter[0] = h->scratch_buffer2;
          output_intra[0] = output_inter[0] + output_buf_size;
  
-        if( h->param.i_lookahead_threads > 1 )
+#if HAVE_OPENCL
+        if( h->param.b_opencl )
          {
-            x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
+            x264_opencl_lowres_init(h, fenc, a->i_lambda );
+            if( do_search[0] )
+            {
+                x264_opencl_lowres_init( h, frames[p0], a->i_lambda );
+                x264_opencl_motionsearch( h, frames, b, p0, 0, a->i_lambda, w );
+            }
+            if( do_search[1] )
+            {
+                x264_opencl_lowres_init( h, frames[p1], a->i_lambda );
+                x264_opencl_motionsearch( h, frames, b, p1, 1, a->i_lambda, NULL );
+            }
+            if( b != p0 )
+                x264_opencl_finalize_cost( h, a->i_lambda, frames, p0, p1, b, dist_scale_factor );
+            x264_opencl_flush( h );
  
-            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+            i_score = fenc->i_cost_est[b-p0][p1-b];
+        }
+        else
+#endif
+        {
+            if( h->param.i_lookahead_threads > 1 )
              {
-                x264_t *t = h->lookahead_thread[i];
+                x264_slicetype_slice_t s[X264_LOOKAHEAD_THREAD_MAX];
  
-                /* FIXME move this somewhere else */
-                t->mb.i_me_method = h->mb.i_me_method;
-                t->mb.i_subpel_refine = h->mb.i_subpel_refine;
-                t->mb.b_chroma_me = h->mb.b_chroma_me;
+                for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+                {
+                    x264_t *t = h->lookahead_thread[i];
  
-                s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
-                                                 output_inter[i], output_intra[i] };
+                    /* FIXME move this somewhere else */
+                    t->mb.i_me_method = h->mb.i_me_method;
+                    t->mb.i_subpel_refine = h->mb.i_subpel_refine;
+                    t->mb.b_chroma_me = h->mb.b_chroma_me;
  
-                t->i_threadslice_start = ((h->mb.i_mb_height *  i    + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
-                t->i_threadslice_end   = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+                    s[i] = (x264_slicetype_slice_t){ t, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+                        output_inter[i], output_intra[i] };
  
-                int thread_height = t->i_threadslice_end - t->i_threadslice_start;
-                int thread_output_size = thread_height + NUM_INTS;
-                memset( output_inter[i], 0, thread_output_size * sizeof(int) );
-                memset( output_intra[i], 0, thread_output_size * sizeof(int) );
-                output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
+                    t->i_threadslice_start = ((h->mb.i_mb_height *  i    + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
+                    t->i_threadslice_end   = ((h->mb.i_mb_height * (i+1) + h->param.i_lookahead_threads/2) / h->param.i_lookahead_threads);
  
-                output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
-                output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
+                    int thread_height = t->i_threadslice_end - t->i_threadslice_start;
+                    int thread_output_size = thread_height + NUM_INTS;
+                    memset( output_inter[i], 0, thread_output_size * sizeof(int) );
+                    memset( output_intra[i], 0, thread_output_size * sizeof(int) );
+                    output_inter[i][NUM_ROWS] = output_intra[i][NUM_ROWS] = thread_height;
  
-                x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
-            }
-            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
-                x264_threadpool_wait( h->lookaheadpool, &s[i] );
-        }
-        else
-        {
-            h->i_threadslice_start = 0;
-            h->i_threadslice_end = h->mb.i_mb_height;
-            memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
-            memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
-            output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
-            x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
-                                                                 output_inter[0], output_intra[0] };
-            x264_slicetype_slice_cost( &s );
-        }
+                    output_inter[i+1] = output_inter[i] + thread_output_size + PAD_SIZE;
+                    output_intra[i+1] = output_intra[i] + thread_output_size + PAD_SIZE;
  
-        /* Sum up accumulators */
-        if( b == p1 )
-            fenc->i_intra_mbs[b-p0] = 0;
-        if( !fenc->b_intra_calculated )
-        {
-            fenc->i_cost_est[0][0] = 0;
-            fenc->i_cost_est_aq[0][0] = 0;
-        }
-        fenc->i_cost_est[b-p0][p1-b] = 0;
-        fenc->i_cost_est_aq[b-p0][p1-b] = 0;
+                    x264_threadpool_run( h->lookaheadpool, (void*)x264_slicetype_slice_cost, &s[i] );
+                }
+                for( int i = 0; i < h->param.i_lookahead_threads; i++ )
+                    x264_threadpool_wait( h->lookaheadpool, &s[i] );
+            }
+            else
+            {
+                h->i_threadslice_start = 0;
+                h->i_threadslice_end = h->mb.i_mb_height;
+                memset( output_inter[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+                memset( output_intra[0], 0, (output_buf_size - PAD_SIZE) * sizeof(int) );
+                output_inter[0][NUM_ROWS] = output_intra[0][NUM_ROWS] = h->mb.i_mb_height;
+                x264_slicetype_slice_t s = (x264_slicetype_slice_t){ h, a, frames, p0, p1, b, dist_scale_factor, do_search, w,
+                    output_inter[0], output_intra[0] };
+                x264_slicetype_slice_cost( &s );
+            }
  
-        int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
-        int *row_satd_intra = fenc->i_row_satds[0][0];
-        for( int i = 0; i < h->param.i_lookahead_threads; i++ )
-        {
+            /* Sum up accumulators */
              if( b == p1 )
-                fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
+                fenc->i_intra_mbs[b-p0] = 0;
              if( !fenc->b_intra_calculated )
              {
-                fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
-                fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
+                fenc->i_cost_est[0][0] = 0;
+                fenc->i_cost_est_aq[0][0] = 0;
              }
+            fenc->i_cost_est[b-p0][p1-b] = 0;
+            fenc->i_cost_est_aq[b-p0][p1-b] = 0;
  
-            fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
-            fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
-
-            if( h->param.rc.i_vbv_buffer_size )
+            int *row_satd_inter = fenc->i_row_satds[b-p0][p1-b];
+            int *row_satd_intra = fenc->i_row_satds[0][0];
+            for( int i = 0; i < h->param.i_lookahead_threads; i++ )
              {
-                int row_count = output_inter[i][NUM_ROWS];
-                memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
+                if( b == p1 )
+                    fenc->i_intra_mbs[b-p0] += output_inter[i][INTRA_MBS];
                  if( !fenc->b_intra_calculated )
-                    memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
-                row_satd_inter += row_count;
-                row_satd_intra += row_count;
+                {
+                    fenc->i_cost_est[0][0] += output_intra[i][COST_EST];
+                    fenc->i_cost_est_aq[0][0] += output_intra[i][COST_EST_AQ];
+                }
+
+                fenc->i_cost_est[b-p0][p1-b] += output_inter[i][COST_EST];
+                fenc->i_cost_est_aq[b-p0][p1-b] += output_inter[i][COST_EST_AQ];
+
+                if( h->param.rc.i_vbv_buffer_size )
+                {
+                    int row_count = output_inter[i][NUM_ROWS];
+                    memcpy( row_satd_inter, output_inter[i] + NUM_INTS, row_count * sizeof(int) );
+                    if( !fenc->b_intra_calculated )
+                        memcpy( row_satd_intra, output_intra[i] + NUM_INTS, row_count * sizeof(int) );
+                    row_satd_inter += row_count;
+                    row_satd_intra += row_count;
+                }
              }
-        }
  
-        i_score = fenc->i_cost_est[b-p0][p1-b];
-        if( b != p1 )
-            i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
-        else
-            fenc->b_intra_calculated = 1;
+            i_score = fenc->i_cost_est[b-p0][p1-b];
+            if( b != p1 )
+                i_score = (uint64_t)i_score * 100 / (120 + h->param.i_bframe_bias);
+            else
+                fenc->b_intra_calculated = 1;
  
-        fenc->i_cost_est[b-p0][p1-b] = i_score;
-        x264_emms();
+            fenc->i_cost_est[b-p0][p1-b] = i_score;
+            x264_emms();
+        }
      }
  
      if( b_intra_penalty )
@@ -1516,6 +1552,10 @@ void x264_slicetype_analyse( x264_t *h, int intra_minigop )
          return;
      }
  
+#if HAVE_OPENCL
+    x264_opencl_slicetype_prep( h, frames, num_frames, a.i_lambda );
+#endif
+
      if( h->param.i_bframe )
      {
          if( h->param.i_bframe_adaptive == X264_B_ADAPT_TRELLIS )
@@ -1549,6 +1589,18 @@ void x264_slicetype_analyse( x264_t *h, int intra_minigop )
                      continue;
                  }
  
+#if HAVE_OPENCL
+                if( h->param.b_opencl )
+                {
+                    int b_work_done = 0;
+                    b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+2, i+1 );
+                    b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+0, i+1, i+1 );
+                    b_work_done |= x264_opencl_precalculate_frame_cost(h, frames, a.i_lambda, i+1, i+2, i+2 );
+                    if( b_work_done )
+                        x264_opencl_flush( h );
+                }
+#endif
+
                  cost1b1 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+2, i+1, 0 );
                  cost1p0 = x264_slicetype_frame_cost( h, &a, frames, i+0, i+1, i+1, 0 );
                  cost2p0 = x264_slicetype_frame_cost( h, &a, frames, i+1, i+2, i+2, 0 );
@@ -1631,6 +1683,10 @@ void x264_slicetype_analyse( x264_t *h, int intra_minigop )
      /* Restore frametypes for all frames that haven't actually been decided yet. */
      for( int j = reset_start; j <= num_frames; j++ )
          frames[j]->i_type = X264_TYPE_AUTO;
+
+#if HAVE_OPENCL
+    x264_opencl_slicetype_end( h );
+#endif
  }
  
  void x264_slicetype_decide( x264_t *h )
diff --git a/tools/cltostr.pl b/tools/cltostr.pl

new file mode 100644 (file)

index 0000000..371a1bd
--- /dev/null
+++ b/tools/cltostr.pl
@@ -0,0 +1,65 @@
+# Perl script used for compiling OpenCL src into x264 binary
+#
+# Copyright (C) 2013 x264 project
+# Authors: Steve Borho <sborho@multicorewareinc.com>
+
+use Digest::MD5 qw(md5_hex);
+
+# xxd takes a VAR, which will be the variable name
+# and BYTES, a string of bytes to beencoded.
+sub xxd
+{
+  my %args = @_;
+  my $var = $args{VAR};
+  my $bytes = $args{BYTES};
+  my @hexbytes;
+  my @bytes = split //, $$bytes;
+  foreach $b (@bytes)
+  {
+    push @hexbytes, sprintf("0x%02X", ord($b));
+  }
+
+  # Format 'em nice and pretty-like.
+  print 'static const char ' . $var . '[] = {' . "\n";
+  my $count = 0;
+  foreach my $h (@hexbytes)
+  {
+    print "$h, ";
+    $count++;
+    if ($count == 16)
+    {
+      print "\n";
+      $count = 0;
+    }
+  }
+  print "\n0x00 };\n\n";
+
+  return;
+}
+
+if (@ARGV < 1)
+{
+  printf "%s: VARNAME ", $0 . "\n";
+  exit(-1);
+}
+
+
+my @lines;
+while(<STDIN>)
+{
+  s/^\s+//;                # trim leading whitespace
+  if (/^\/\//)
+  {
+    next;   # skip the line if it starts with '//'
+  }
+  push @lines, $_;
+}
+
+my $lines = join '', @lines;
+xxd(VAR => @ARGV[0], BYTES => \$lines);
+
+my $hash = md5_hex($lines);
+@hash = ( $hash =~ m/../g );
+
+
+xxd(VAR => @ARGV[0] . "_hash", BYTES => \$hash);
diff --git a/x264.c b/x264.c

index 3791d8b24c763d662f7ef5f4fc107794d49fc42d..04e1e974fd359e7a79ea2d5bfc1d4bf375818dd3 100644 (file)
--- a/x264.c
+++ b/x264.c
@@ -809,6 +809,9 @@ static void help( x264_param_t *defaults, int longhelp )
          "                                  as opposed to letting them select different algorithms\n" );
      H2( "      --asm <integer>         Override CPU detection\n" );
      H2( "      --no-asm                Disable all CPU optimizations\n" );
+    H2( "      --opencl                Enable use of OpenCL\n" );
+    H2( "      --opencl-clbin <string> Specify path of compiled OpenCL kernel cache\n" );
+    H2( "      --opencl-device <integer>  Specify OpenCL device ordinal\n" );
      H2( "      --visualize             Show MB types overlayed on the encoded video\n" );
      H2( "      --dump-yuv <string>     Save reconstructed frames\n" );
      H2( "      --sps-id <integer>      Set SPS and PPS id numbers [%d]\n", defaults->i_sps_id );
@@ -913,6 +916,9 @@ static struct option long_options[] =
      { "ref",         required_argument, NULL, 'r' },
      { "asm",         required_argument, NULL, 0 },
      { "no-asm",            no_argument, NULL, 0 },
+    { "opencl",            no_argument, NULL, 1 },
+    { "opencl-clbin",required_argument, NULL, 0 },
+    { "opencl-device",required_argument, NULL, 0 },
      { "sar",         required_argument, NULL, 0 },
      { "fps",         required_argument, NULL, OPT_FPS },
      { "frames",      required_argument, NULL, OPT_FRAMES },
diff --git a/x264.h b/x264.h

index 98d38a0e639e69db8f7e2661dac771e904737aab..b2b1f9a36cff5a503ba60a9a02feac3b764163e6 100644 (file)
--- a/x264.h
+++ b/x264.h
@@ -474,6 +474,11 @@ typedef struct x264_param_t
  
      int b_fake_interlaced;
  
+    int b_opencl;            /* use OpenCL when available */
+    int i_opencl_device;     /* specify count of GPU devices to skip, for CLI users */
+    void *opencl_device_id;  /* pass explicit cl_device_id as void*, for API users */
+    char *psz_clbin_file;    /* compiled OpenCL kernel cache file */
+
      /* Slicing parameters */
      int i_slice_max_size;    /* Max size per slice in bytes; includes estimated NAL overhead. */
      int i_slice_max_mbs;     /* Max number of MBs per slice; overrides i_slice_count. */
author	Steve Borho <steve@borho.org>
	Thu, 21 Feb 2013 18:48:40 +0000 (12:48 -0600)
committer	Fiona Glaser <fiona@x264.com>
	Tue, 23 Apr 2013 21:36:20 +0000 (14:36 -0700)
.gitignore		patch \| blob \| history
Makefile		patch \| blob \| history
common/common.c		patch \| blob \| history
common/common.h		patch \| blob \| history
common/frame.c		patch \| blob \| history
common/frame.h		patch \| blob \| history
common/opencl.c	[new file with mode: 0644]	patch \| blob
common/opencl.h	[new file with mode: 0644]	patch \| blob
common/opencl/bidir.cl	[new file with mode: 0644]	patch \| blob
common/opencl/downscale.cl	[new file with mode: 0644]	patch \| blob
common/opencl/intra.cl	[new file with mode: 0644]	patch \| blob
common/opencl/motionsearch.cl	[new file with mode: 0644]	patch \| blob
common/opencl/subpel.cl	[new file with mode: 0644]	patch \| blob
common/opencl/weightp.cl	[new file with mode: 0644]	patch \| blob
common/opencl/x264-cl.h	[new file with mode: 0644]	patch \| blob
configure		patch \| blob \| history
encoder/encoder.c		patch \| blob \| history
encoder/slicetype-cl.c	[new file with mode: 0644]	patch \| blob
encoder/slicetype.c		patch \| blob \| history
tools/cltostr.pl	[new file with mode: 0644]	patch \| blob
x264.c		patch \| blob \| history
x264.h		patch \| blob \| history