build: add support for building CUDA files with clang

author Rodger Combs <rodger.combs@gmail.com>

Tue, 30 Jul 2019 07:51:42 +0000 (02:51 -0500)

committer Timo Rothenpieler <timo@rothenpieler.org>

Sun, 4 Aug 2019 17:08:08 +0000 (19:08 +0200)
author Rodger Combs <rodger.combs@gmail.com>
Tue, 30 Jul 2019 07:51:42 +0000 (02:51 -0500)
committer Timo Rothenpieler <timo@rothenpieler.org>
Sun, 4 Aug 2019 17:08:08 +0000 (19:08 +0200)
diff --git a/Changelog b/Changelog

index beb2d2615d1a3975c3b5f7f364290811959822ed..389ca6c4db8a72a3d9fe8fc10d108b412dd8c818 100644 (file)
--- a/Changelog
+++ b/Changelog
@@ -36,6 +36,7 @@ version 4.2:
  - derain filter
  - deesser filter
  - mov muxer writes tracks with unspecified language instead of English by default
+- add support for using clang to compile CUDA kernels
  
  
  version 4.1:
diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h

new file mode 100644 (file)

index 0000000..dbe50f8
--- /dev/null
+++ b/compat/cuda/cuda_runtime.h
@@ -0,0 +1,131 @@
+/*
+ * Minimum CUDA compatibility definitions header
+ *
+ * Copyright (c) 2019 Rodger Combs
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AV_COMPAT_CUDA_CUDA_RUNTIME_H
+#define AV_COMPAT_CUDA_CUDA_RUNTIME_H
+
+// Common macros
+#define __global__ __attribute__((global))
+#define __device__ __attribute__((device))
+#define __device_builtin__ __attribute__((device_builtin))
+#define __align__(N) __attribute__((aligned(N)))
+#define __inline__ __inline__ __attribute__((always_inline))
+
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#define abs(x) ((x) < 0 ? -(x) : (x))
+
+#define atomicAdd(a, b) (__atomic_fetch_add(a, b, __ATOMIC_SEQ_CST))
+
+// Basic typedefs
+typedef __device_builtin__ unsigned long long cudaTextureObject_t;
+
+typedef struct __device_builtin__ __align__(2) uchar2
+{
+    unsigned char x, y;
+} uchar2;
+
+typedef struct __device_builtin__ __align__(4) ushort2
+{
+    unsigned short x, y;
+} ushort2;
+
+typedef struct __device_builtin__ uint3
+{
+    unsigned int x, y, z;
+} uint3;
+
+typedef struct uint3 dim3;
+
+typedef struct __device_builtin__ __align__(8) int2
+{
+    int x, y;
+} int2;
+
+typedef struct __device_builtin__ __align__(4) uchar4
+{
+    unsigned char x, y, z, w;
+} uchar4;
+
+typedef struct __device_builtin__ __align__(8) ushort4
+{
+    unsigned char x, y, z, w;
+} ushort4;
+
+typedef struct __device_builtin__ __align__(16) int4
+{
+    int x, y, z, w;
+} int4;
+
+// Accessors for special registers
+#define GETCOMP(reg, comp) \
+    asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
+    ret.comp = tmp;
+
+#define GET(name, reg) static inline __device__ uint3 name() {\
+    uint3 ret; \
+    unsigned tmp; \
+    GETCOMP(reg, x) \
+    GETCOMP(reg, y) \
+    GETCOMP(reg, z) \
+    return ret; \
+}
+
+GET(getBlockIdx, ctaid)
+GET(getBlockDim, ntid)
+GET(getThreadIdx, tid)
+
+// Instead of externs for these registers, we turn access to them into calls into trivial ASM
+#define blockIdx (getBlockIdx())
+#define blockDim (getBlockDim())
+#define threadIdx (getThreadIdx())
+
+// Basic initializers (simple macros rather than inline functions)
+#define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
+#define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
+#define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
+#define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
+
+// Conversions from the tex instruction's 4-register output to various types
+#define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
+
+TEX2D(unsigned char, a & 0xFF)
+TEX2D(unsigned short, a & 0xFFFF)
+TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
+TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
+TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
+TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
+
+// Template calling tex instruction and converting the output to the selected type
+template <class T>
+static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
+{
+  T ret;
+  unsigned ret1, ret2, ret3, ret4;
+  asm("tex.2d.v4.u32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
+      "=r"(ret1), "=r"(ret2), "=r"(ret3), "=r"(ret4) :
+      "l"(texObject), "f"(x), "f"(y));
+  conv(&ret, ret1, ret2, ret3, ret4);
+  return ret;
+}
+
+#endif
diff --git a/configure b/configure

index 5a4f507246b8881fa349ff96c34f566b84eab0ae..bddc05b8501968940ba989380169906e9532ef6b 100755 (executable)
--- a/configure
+++ b/configure
@@ -322,6 +322,7 @@ External library support:
    --disable-amf            disable AMF video encoding code [autodetect]
    --disable-audiotoolbox   disable Apple AudioToolbox code [autodetect]
    --enable-cuda-nvcc       enable Nvidia CUDA compiler [no]
+  --disable-cuda-llvm      disable CUDA compilation using clang [autodetect]
    --disable-cuvid          disable Nvidia CUVID support [autodetect]
    --disable-d3d11va        disable Microsoft Direct3D 11 video acceleration code [autodetect]
    --disable-dxva2          disable Microsoft DirectX 9 video acceleration code [autodetect]
@@ -370,7 +371,7 @@ Toolchain options:
    --cxx=CXX                use C compiler CXX [$cxx_default]
    --objcc=OCC              use ObjC compiler OCC [$cc_default]
    --dep-cc=DEPCC           use dependency generator DEPCC [$cc_default]
-  --nvcc=NVCC              use Nvidia CUDA compiler NVCC [$nvcc_default]
+  --nvcc=NVCC              use Nvidia CUDA compiler NVCC or clang [$nvcc_default]
    --ld=LD                  use linker LD [$ld_default]
    --pkg-config=PKGCONFIG   use pkg-config tool PKGCONFIG [$pkg_config_default]
    --pkg-config-flags=FLAGS pass additional flags to pkgconf []
@@ -1038,12 +1039,16 @@ test_nvcc(){
      tmpcu_=$TMPCU
      tmpo_=$TMPO
      [ -x "$(command -v cygpath)" ] && tmpcu_=$(cygpath -m $tmpcu_) && tmpo_=$(cygpath -m $tmpo_)
-    test_cmd $nvcc -ptx $NVCCFLAGS "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_
+    test_cmd $nvcc $nvccflags "$@" $NVCC_C $(nvcc_o $tmpo_) $tmpcu_
  }
  
  check_nvcc() {
      log check_nvcc "$@"
-    test_nvcc <<EOF
+    name=$1
+    shift 1
+    disabled $name && return
+    disable $name
+    test_nvcc "$@" <<EOF && enable $name
  extern "C" {
      __global__ void hello(unsigned char *data) {}
  }
@@ -1814,6 +1819,7 @@ HWACCEL_AUTODETECT_LIBRARY_LIST="
      audiotoolbox
      crystalhd
      cuda
+    cuda_llvm
      cuvid
      d3d11va
      dxva2
@@ -2987,8 +2993,10 @@ v4l2_m2m_deps="linux_videodev2_h sem_timedwait"
  
  hwupload_cuda_filter_deps="ffnvcodec"
  scale_npp_filter_deps="ffnvcodec libnpp"
-scale_cuda_filter_deps="ffnvcodec cuda_nvcc"
-thumbnail_cuda_filter_deps="ffnvcodec cuda_nvcc"
+scale_cuda_filter_deps="ffnvcodec"
+scale_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
+thumbnail_cuda_filter_deps="ffnvcodec"
+thumbnail_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
  transpose_npp_filter_deps="ffnvcodec libnpp"
  
  amf_deps_any="libdl LoadLibrary"
@@ -3547,7 +3555,8 @@ zscale_filter_deps="libzimg const_nan"
  scale_vaapi_filter_deps="vaapi"
  vpp_qsv_filter_deps="libmfx"
  vpp_qsv_filter_select="qsvvpp"
-yadif_cuda_filter_deps="ffnvcodec cuda_nvcc"
+yadif_cuda_filter_deps="ffnvcodec"
+yadif_cuda_filter_deps_any="cuda_nvcc cuda_llvm"
  
  # examples
  avio_dir_cmd_deps="avformat avutil"
@@ -3651,8 +3660,6 @@ version_script='--version-script'
  objformat="elf32"
  x86asmexe_default="nasm"
  windres_default="windres"
-nvcc_default="nvcc"
-nvccflags_default="-gencode arch=compute_30,code=sm_30 -O2"
  striptype="direct"
  
  # OS
@@ -4220,6 +4227,20 @@ windres_default="${cross_prefix}${windres_default}"
  
  sysinclude_default="${sysroot}/usr/include"
  
+if enabled cuda_sdk; then
+    warn "Option --enable-cuda-sdk is deprecated. Use --enable-cuda-nvcc instead."
+    enable cuda_nvcc
+fi
+
+if enabled cuda_nvcc; then
+    nvcc_default="nvcc"
+    nvccflags_default="-gencode arch=compute_30,code=sm_30 -O2"
+else
+    nvcc_default="clang"
+    nvccflags_default="--cuda-gpu-arch=sm_30 -O2"
+    NVCC_C=""
+fi
+
  set_default arch cc cxx doxygen pkg_config ranlib strip sysinclude \
      target_exec x86asmexe nvcc
  enabled cross_compile || host_cc_default=$cc
@@ -6060,9 +6081,21 @@ check_type "d3d9.h dxva2api.h" DXVA2_ConfigPictureDecode -D_WIN32_WINNT=0x0602
  
  check_type "vdpau/vdpau.h" "VdpPictureInfoHEVC"
  
-if enabled cuda_sdk; then
-    warn "Option --enable-cuda-sdk is deprecated. Use --enable-cuda-nvcc instead."
-    enable cuda_nvcc
+if [ -z "$nvccflags" ]; then
+    nvccflags=$nvccflags_default
+fi
+
+if enabled x86_64 || enabled ppc64 || enabled aarch64; then
+    nvccflags="$nvccflags -m64"
+else
+    nvccflags="$nvccflags -m32"
+fi
+
+if enabled cuda_nvcc; then
+    nvccflags="$nvccflags -ptx"
+else
+    nvccflags="$nvccflags -S -nocudalib -nocudainc --cuda-device-only -include${source_link}/compat/cuda/cuda_runtime.h"
+    check_nvcc cuda_llvm
  fi
  
  if ! disabled ffnvcodec; then
@@ -6140,7 +6173,7 @@ for func in $COMPLEX_FUNCS; do
  done
  
  # these are off by default, so fail if requested and not available
-enabled cuda_nvcc         && { check_nvcc || die "ERROR: failed checking for nvcc."; }
+enabled cuda_nvcc         && { check_nvcc cuda_nvcc || die "ERROR: failed checking for nvcc."; }
  enabled chromaprint       && require chromaprint chromaprint.h chromaprint_get_version -lchromaprint
  enabled decklink          && { require_headers DeckLinkAPI.h &&
                                 { test_cpp_condition DeckLinkAPIVersion.h "BLACKMAGIC_DECKLINK_API_VERSION >= 0x0a090500" || die "ERROR: Decklink API version must be >= 10.9.5."; } }
@@ -6701,16 +6734,6 @@ if [ -z "$optflags" ]; then
      fi
  fi
  
-if [ -z "$nvccflags" ]; then
-    nvccflags=$nvccflags_default
-fi
-
-if enabled x86_64 || enabled ppc64 || enabled aarch64; then
-    nvccflags="$nvccflags -m64"
-else
-    nvccflags="$nvccflags -m32"
-fi
-
  check_optflags(){
      check_cflags "$@"
      enabled lto && check_ldflags "$@"
diff --git a/ffbuild/common.mak b/ffbuild/common.mak

index d2b33320c03a90a2c318abf298eed5d096e7da56..7355508ea02f9d5ffef223f2720c925f0cd9f46b 100644 (file)
--- a/ffbuild/common.mak
+++ b/ffbuild/common.mak
@@ -38,7 +38,6 @@ OBJCCFLAGS  = $(CPPFLAGS) $(CFLAGS) $(OBJCFLAGS)
  ASFLAGS    := $(CPPFLAGS) $(ASFLAGS)
  CXXFLAGS   := $(CPPFLAGS) $(CFLAGS) $(CXXFLAGS)
  X86ASMFLAGS += $(IFLAGS:%=%/) -I$(<D)/ -Pconfig.asm
-NVCCFLAGS  += -ptx
  
  HOSTCCFLAGS = $(IFLAGS) $(HOSTCPPFLAGS) $(HOSTCFLAGS)
  LDFLAGS    := $(ALLFFLIBS:%=$(LD_PATH)lib%) $(LDFLAGS)
@@ -91,7 +90,7 @@ COMPILE_NVCC = $(call COMPILE,NVCC)
  %.h.c:
         $(Q)echo '#include "$*.h"' >$@
  
-%.ptx: %.cu
+%.ptx: %.cu $(SRC_PATH)/compat/cuda/cuda_runtime.h
         $(COMPILE_NVCC)
  
  %.ptx.c: %.ptx
author	Rodger Combs <rodger.combs@gmail.com>
	Tue, 30 Jul 2019 07:51:42 +0000 (02:51 -0500)
committer	Timo Rothenpieler <timo@rothenpieler.org>
	Sun, 4 Aug 2019 17:08:08 +0000 (19:08 +0200)
Changelog		patch \| blob \| history
compat/cuda/cuda_runtime.h	[new file with mode: 0644]	patch \| blob
configure		patch \| blob \| history
ffbuild/common.mak		patch \| blob \| history