avfilter/scale_cuda: add bicubic interpolation

[ffmpeg] / compat / cuda / cuda_runtime.h
diff --git a/compat/cuda/cuda_runtime.h b/compat/cuda/cuda_runtime.h

index 92c55ad8591d5c99a3694518bce3592492f4687a..353efcf5f9c038074e343b00f871e18b6f415249 100644 (file)
--- a/compat/cuda/cuda_runtime.h
+++ b/compat/cuda/cuda_runtime.h
@@ -49,18 +49,23 @@ typedef struct __device_builtin__ __align__(4) ushort2
      unsigned short x, y;
  } ushort2;
  
-typedef struct __device_builtin__ uint3
+typedef struct __device_builtin__ __align__(8) float2
  {
-    unsigned int x, y, z;
-} uint3;
-
-typedef struct uint3 dim3;
+    float x, y;
+} float2;
  
  typedef struct __device_builtin__ __align__(8) int2
  {
      int x, y;
  } int2;
  
+typedef struct __device_builtin__ uint3
+{
+    unsigned int x, y, z;
+} uint3;
+
+typedef struct uint3 dim3;
+
  typedef struct __device_builtin__ __align__(4) uchar4
  {
      unsigned char x, y, z, w;
@@ -76,6 +81,11 @@ typedef struct __device_builtin__ __align__(16) int4
      int x, y, z, w;
  } int4;
  
+typedef struct __device_builtin__ __align__(16) float4
+{
+    float x, y, z, w;
+} float4;
+
  // Accessors for special registers
  #define GETCOMP(reg, comp) \
      asm("mov.u32 %0, %%" #reg "." #comp ";" : "=r"(tmp)); \
@@ -100,24 +110,31 @@ GET(getThreadIdx, tid)
  #define threadIdx (getThreadIdx())
  
  // Basic initializers (simple macros rather than inline functions)
+#define make_int2(a, b) ((int2){.x = a, .y = b})
  #define make_uchar2(a, b) ((uchar2){.x = a, .y = b})
  #define make_ushort2(a, b) ((ushort2){.x = a, .y = b})
+#define make_float2(a, b) ((float2){.x = a, .y = b})
+#define make_int4(a, b, c, d) ((int4){.x = a, .y = b, .z = c, .w = d})
  #define make_uchar4(a, b, c, d) ((uchar4){.x = a, .y = b, .z = c, .w = d})
  #define make_ushort4(a, b, c, d) ((ushort4){.x = a, .y = b, .z = c, .w = d})
+#define make_float4(a, b, c, d) ((float4){.x = a, .y = b, .z = c, .w = d})
  
  // Conversions from the tex instruction's 4-register output to various types
  #define TEX2D(type, ret) static inline __device__ void conv(type* out, unsigned a, unsigned b, unsigned c, unsigned d) {*out = (ret);}
  
  TEX2D(unsigned char, a & 0xFF)
  TEX2D(unsigned short, a & 0xFFFF)
+TEX2D(float, a)
  TEX2D(uchar2, make_uchar2(a & 0xFF, b & 0xFF))
  TEX2D(ushort2, make_ushort2(a & 0xFFFF, b & 0xFFFF))
+TEX2D(float2, make_float2(a, b))
  TEX2D(uchar4, make_uchar4(a & 0xFF, b & 0xFF, c & 0xFF, d & 0xFF))
  TEX2D(ushort4, make_ushort4(a & 0xFFFF, b & 0xFFFF, c & 0xFFFF, d & 0xFFFF))
+TEX2D(float4, make_float4(a, b, c, d))
  
  // Template calling tex instruction and converting the output to the selected type
-template <class T>
-static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
+template<typename T>
+inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y)
  {
    T ret;
    unsigned ret1, ret2, ret3, ret4;
@@ -128,4 +145,41 @@ static inline __device__ T tex2D(cudaTextureObject_t texObject, float x, float y
    return ret;
  }
  
+template<>
+inline __device__ float4 tex2D<float4>(cudaTextureObject_t texObject, float x, float y)
+{
+    float4 ret;
+    asm("tex.2d.v4.f32.f32 {%0, %1, %2, %3}, [%4, {%5, %6}];" :
+        "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) :
+        "l"(texObject), "f"(x), "f"(y));
+    return ret;
+}
+
+template<>
+inline __device__ float tex2D<float>(cudaTextureObject_t texObject, float x, float y)
+{
+    return tex2D<float4>(texObject, x, y).x;
+}
+
+template<>
+inline __device__ float2 tex2D<float2>(cudaTextureObject_t texObject, float x, float y)
+{
+    float4 ret = tex2D<float4>(texObject, x, y);
+    return make_float2(ret.x, ret.y);
+}
+
+// Math helper functions
+static inline __device__ float floorf(float a) { return __builtin_floorf(a); }
+static inline __device__ float floor(float a) { return __builtin_floorf(a); }
+static inline __device__ double floor(double a) { return __builtin_floor(a); }
+static inline __device__ float ceilf(float a) { return __builtin_ceilf(a); }
+static inline __device__ float ceil(float a) { return __builtin_ceilf(a); }
+static inline __device__ double ceil(double a) { return __builtin_ceil(a); }
+static inline __device__ float truncf(float a) { return __builtin_truncf(a); }
+static inline __device__ float trunc(float a) { return __builtin_truncf(a); }
+static inline __device__ double trunc(double a) { return __builtin_trunc(a); }
+static inline __device__ float fabsf(float a) { return __builtin_fabsf(a); }
+static inline __device__ float fabs(float a) { return __builtin_fabsf(a); }
+static inline __device__ double fabs(double a) { return __builtin_fabs(a); }
+
  #endif /* COMPAT_CUDA_CUDA_RUNTIME_H */