/*
								+--------------------------------+
								|                                |
								|    ***  CUDA utilities  ***    |
								|                                |
								|  Copyright  -tHE SWINe- 2010  |
								|                                |
								|           CuUtils.h            |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __CUDA_UTILS_INCLUDED
#define __CUDA_UTILS_INCLUDED

/**
 *	@file gpgpu/CuUtils.h
 *	@author -tHE SWINe-
 *	@brief CUDA utilities
 *	@date 2010
 *
 *	@date 2012-06-19
 *
 *	Moved multiple inclusion guard before file documentation comment.
 *
 */

#include <stdio.h> // FILE, stdout
#include <string> // std::string

#include <cuda.h>

/**
 *	@brief device parameters wrapper
 *
 *	This class is able to read-in device parameters,
 *		and hold them in memory in user-friendly form.
 */
class CCuDeviceParams {
protected:
	int m_n_index; /**< @brief zero-based device index (b_Status() treats -1 as a failed construction) */
	CUdevice m_h_device; /**< @brief device handle */
	std::string m_s_name; /**< @brief device name */
	int m_p_device_caps[2]; /**< @brief device revision number; element 0 is major, element 1 is minor */
	int m_n_multiprocessor_num; /**< @brief number of device multiprocessors */
	unsigned int m_n_memory_size; /**< @brief device memory size (NOTE(review): unsigned int is presumably too narrow for devices with 4 GB or more - confirm against the constructor) */
	bool m_b_kernel_exec_timeout; /**< @brief set if the device has kernel execution timeout */
	CUdevprop m_t_devprop; /**< @brief device properties structure */

public:
	/**
	 *	@brief default constructor; reads parameters of the selected device
	 *
	 *	@param[in] n_device_index is zero-based index of the device
	 *		(must be in range 0 to cuDeviceGetCount() - 1)
	 *
	 *	@note cuInit() must be called before calling this function.
	 *	@note The constructor does not report failure directly, it is recommended
	 *		to call b_Status() afterwards to see if it succeeded.
	 */
	CCuDeviceParams(int n_device_index);

	/**
	 *	@brief prints some basic information about the device
	 *
	 *	@param[in] p_fw is the output stream (stdout is used if not specified)
	 */
	void Dump(FILE *p_fw = stdout);

	/**
	 *	@brief finds the device with maximum (theoretical) computing
	 *		power, or the device with more memory
	 *
	 *	@return Returns id of the most powerful device.
	 */
	static int n_Get_MaxGFlops_DeviceId();

	/**
	 *	@brief determines whether the constructor succeeded
	 *
	 *	If this function returns false, this object doesn't hold
	 *		valid device parameters, and may not be used any further.
	 *
	 *	@return Returns true on success (device index is not -1), false on failure.
	 */
	bool b_Status() const
	{
		return n_DeviceIndex() != -1;
	}

	/**
	 *	@brief gets the (zero-based) index of the device
	 *	@return Returns zero-based device index.
	 */
	int n_DeviceIndex() const
	{
		return m_n_index;
	}

	/**
	 *	@brief gets the handle of the device
	 *	@return Returns device handle.
	 */
	CUdevice h_Device() const
	{
		return m_h_device;
	}

	/**
	 *	@brief gets the major revision number of the device
	 *	@return Returns device major revision number (stored as int, converted to unsigned).
	 */
	unsigned int n_Revision_Major() const
	{
		return static_cast<unsigned int>(m_p_device_caps[0]);
	}

	/**
	 *	@brief gets the minor revision number of the device
	 *	@return Returns device minor revision number (stored as int, converted to unsigned).
	 */
	unsigned int n_Revision_Minor() const
	{
		return static_cast<unsigned int>(m_p_device_caps[1]);
	}

	/**
	 *	@brief gets the number of multiprocessors of the device
	 *	@return Returns device multiprocessor count (stored as int, converted to size_t).
	 */
	size_t n_Multiprocessor_Num() const
	{
		return static_cast<size_t>(m_n_multiprocessor_num);
	}

	/**
	 *	@brief gets the size of the device memory
	 *	@return Returns device memory size (stored as unsigned int, widened to size_t).
	 */
	size_t n_Memory_Size() const
	{
		return static_cast<size_t>(m_n_memory_size);
	}

	/**
	 *	@brief determines whether the device has kernel execution timeout
	 *	@return Returns true if the device has kernel execution timeout, otherwise returns false.
	 */
	bool b_Has_KernelExecTimeout() const
	{
		return m_b_kernel_exec_timeout;
	}

	/**
	 *	@brief gets the structure with device properties
	 *	@return Returns const reference to the device properties structure
	 *		(it refers to a member of this object).
	 */
	const CUdevprop &t_Properties() const
	{
		return m_t_devprop;
	}

	/**
	 *	@brief gets the name of the device
	 *	@return Returns const reference to the device name
	 *		(it refers to a member of this object).
	 */
	const std::string &s_Name() const
	{
		return m_s_name;
	}

	/**
	 *	@brief gets the name of the device as a null-terminated string
	 *	@return Returns null-terminated string containing device name
	 *		(the pointer is obtained by c_str() of the stored name).
	 */
	const char *p_s_Name() const
	{
		return s_Name().c_str();
	}

	/**
	 *	@brief determines whether a problem of given size can be executed in a single kernel call
	 *	@param[in] n_width is problem width
	 *	@param[in] n_height is problem height
	 *	@param[in] n_depth is problem depth
	 *	@return Returns true if the problem fits, otherwise returns false
	 *		(the problem needs to be subdivided first).
	 */
	bool b_ProblemFitsAtOnce(int n_width, int n_height, int n_depth) const;

	/**
	 *	@brief determines whether a problem of given size can be executed in a single kernel call,
	 *		and if it can, calculates thread block and grid sizes
	 *	@param[out] p_block_size is pointer to storage where thread block width, height and depth
	 *		is written upon successful return (must be allocated by the caller)
	 *	@param[out] p_grid_size is pointer to storage where grid width, height and depth
	 *		is written upon successful return (must be allocated by the caller)
	 *	@param[in] n_width is problem width
	 *	@param[in] n_height is problem height
	 *	@param[in] n_depth is problem depth
	 *	@return Returns true if the problem fits, otherwise returns false
	 *		(the problem needs to be subdivided first).
	 */
	bool CalculateGridParams(int *p_block_size, int *p_grid_size,
		int n_width, int n_height, int n_depth) const;
};

/**
 *	@brief gets alignment of a given data type
 *
 *	No dedicated alignment query is performed, the macro expands to a plain
 *		sizeof(), i.e. the alignment is taken to be equal to the size of the type.
 *
 *	@param[in] _Ty is data type
 *	@return Returns alignment of a given data type (should be power of two).
 *	@note NOTE(review): size presumably matches the required alignment only for simple
 *		scalar types (CCuArgLoader applies this to int, float and void*) - confirm
 *		before using it with any other type.
 */
#define n_AlignOf(_Ty) sizeof(_Ty)

/**
 *	@brief rounds an offset up to the nearest multiple of the given alignment
 *
 *	Adds (n_alignment - 1) to the offset and masks off the low bits,
 *		e.g. n_AlignedOffset(5, 4) yields 8 and n_AlignedOffset(8, 4) yields 8.
 *
 *	@param[in] n_offset is current offset in bytes
 *	@param[in] n_alignment is alignment for the next item (expanded twice in the macro body)
 *	@return Returns the nearest n_offset or higher value, which is integer multiple of n_alignment.
 *	@note n_alignment must be power of two!
 */
#define n_AlignedOffset(n_offset,n_alignment) \
	(((n_offset) + ((n_alignment) - 1)) & ~((n_alignment) - 1))

/**
 *	@brief provides the same functionality as cutilDrvSafeCall()
 */
class CCuArgLoaderHelper {
public:
	/**
	 *	@brief checks result of a CUDA call (provides the same functionality as __cuSafeCall())
	 *
	 *	Only the declaration is visible in this file. According to the documentation
	 *		of the __SafeParamSet*() macros (which wrap this function), an error is printed
	 *		and exit(-1) is called in case n_error_code is not CUDA_SUCCESS.
	 *
	 *	@param[in] n_error_code is cuda error code
	 *	@param[in] p_s_file is filename of source file (preprocessor __FILE__)
	 *	@param[in] n_line is source file line (preprocessor __LINE__)
	 *	@return Returns value of n_error_code.
	 *	@note The __SafeParamSet() macro calls this and fills in p_s_file and n_line.
	 */
	static CUresult __SafeCall(CUresult n_error_code, const char *p_s_file, int n_line);
};

/**
 *	@brief checks errors in cuParamSet*() functions
 *
 *	Passes the result to CCuArgLoaderHelper::__SafeCall(), along with
 *		__FILE__ and __LINE__ of the place where this macro is expanded.
 *
 *	@param[in] x is (return value of) call to cuParamSet*()
 */
#define __SafeParamSet(x) CCuArgLoaderHelper::__SafeCall((x), __FILE__, __LINE__)

/**
 *	@brief calls cuParamSeti with given parameters and makes sure
 *		it returns CUDA_SUCCESS (prints error and calls exit(-1) if it doesn't)
 *	@param[in] k is kernel function handle
 *	@param[in] o is parameter offset (in bytes)
 *	@param[in] v is value of the parameter
 *	@return Returns result of call to cuParamSeti().
 *	@note This macro has the same order and count of parameters as cuParamSeti() does.
 *	@note The result is checked by wrapping the call in __SafeParamSet().
 */
#define __SafeParamSeti(k,o,v) __SafeParamSet(cuParamSeti(k,o,v))

/**
 *	@brief calls cuParamSetf with given parameters and makes sure
 *		it returns CUDA_SUCCESS (prints error and calls exit(-1) if it doesn't)
 *	@param[in] k is kernel function handle
 *	@param[in] o is parameter offset (in bytes)
 *	@param[in] v is value of the parameter
 *	@return Returns result of call to cuParamSetf().
 *	@note This macro has the same order and count of parameters as cuParamSetf() does.
 *	@note The result is checked by wrapping the call in __SafeParamSet().
 */
#define __SafeParamSetf(k,o,v) __SafeParamSet(cuParamSetf(k,o,v))

/**
 *	@brief calls cuParamSetv with given parameters and makes sure
 *		it returns CUDA_SUCCESS (prints error and calls exit(-1) if it doesn't)
 *	@param[in] k is kernel function handle
 *	@param[in] o is parameter offset (in bytes)
 *	@param[in] v is pointer to value of the parameter
 *	@param[in] s is size of the parameter
 *	@return Returns result of call to cuParamSetv().
 *	@note This macro has the same order and count of parameters as cuParamSetv() does.
 *	@note The result is checked by wrapping the call in __SafeParamSet().
 */
#define __SafeParamSetv(k,o,v,s) __SafeParamSet(cuParamSetv(k,o,v,s))

/**
 *	@brief calls cuParamSetSize with given parameters and makes sure
 *		it returns CUDA_SUCCESS (prints error and calls exit(-1) if it doesn't)
 *	@param[in] k is kernel function handle
 *	@param[in] s is size of all parameters (in bytes)
 *	@return Returns result of call to cuParamSetSize().
 *	@note This macro has the same order and count of parameters as cuParamSetSize() does.
 *	@note The result is checked by wrapping the call in __SafeParamSet().
 */
#define __SafeParamSetSize(k,s) __SafeParamSet(cuParamSetSize(k,s))

/**
 *	@brief CUDA driver api argument loader for kernel functions
 *	@tparam n_offset is offset at which the next parameter is loaded (in bytes, before
 *		alignment); it equals the size of all the parameters loaded so far
 *	@note This class shouldn't be used directly. Use cuParamsSet() macro instead.
 */
template <int n_offset>
class CCuArgLoader {
protected:
	CUfunction m_h_func; /**< @brief handle of the function the parameters are being set for */

public:
	/**
	 *	@brief default constructor; stores the function handle
	 *	@param[in] h_func is CUDA function handle parameters are being set for
	 */
	inline CCuArgLoader(CUfunction h_func)
		:m_h_func(h_func)
	{}

	/**
	 *	@brief gets the size of all the parameters loaded so far (in bytes)
	 *	@return Returns size of all the parameters (in bytes),
	 *		which is the value of the n_offset template argument.
	 */
	inline int n_Size() const
	{
		return n_offset;
	}

	/**
	 *	@brief loads a single integer parameter n_value
	 *
	 *	The parameter is written at n_offset aligned for int, using cuParamSeti().
	 *
	 *	@param[in] n_value is integer value (of the loaded parameter)
	 *	@return Returns loader with offset of the next parameter
	 *		(aligned offset of this parameter plus sizeof(int)).
	 */
	inline CCuArgLoader<n_AlignedOffset(n_offset, n_AlignOf(int)) + sizeof(int)> operator ,(int n_value)
	{
		enum {
			n_param_offset = n_AlignedOffset(n_offset, n_AlignOf(int)) // where this parameter goes
		};
		typedef CCuArgLoader<n_param_offset + sizeof(int)> TNextLoader; // same type as the return type

		__SafeParamSeti(m_h_func, n_param_offset, n_value);
		return TNextLoader(m_h_func);
	}

	/**
	 *	@brief loads a single float parameter f_value
	 *
	 *	The parameter is written at n_offset aligned for float, using cuParamSetf().
	 *
	 *	@param[in] f_value is float value (of the loaded parameter)
	 *	@return Returns loader with offset of the next parameter
	 *		(aligned offset of this parameter plus sizeof(float)).
	 */
	inline CCuArgLoader<n_AlignedOffset(n_offset, n_AlignOf(float)) + sizeof(float)> operator ,(float f_value)
	{
		enum {
			n_param_offset = n_AlignedOffset(n_offset, n_AlignOf(float)) // where this parameter goes
		};
		typedef CCuArgLoader<n_param_offset + sizeof(float)> TNextLoader; // same type as the return type

		__SafeParamSetf(m_h_func, n_param_offset, f_value);
		return TNextLoader(m_h_func);
	}

	/**
	 *	@brief loads a single pointer parameter p_value
	 *
	 *	The value of the pointer itself (sizeof(void*) bytes, not the data it points to)
	 *		is written at n_offset aligned for void*, using cuParamSetv().
	 *
	 *	@param[in] p_value is pointer value (of the loaded parameter)
	 *	@return Returns loader with offset of the next parameter
	 *		(aligned offset of this parameter plus sizeof(void*)).
	 *	@note in case device pointers are used, they need to be cast
	 *		to size_t and then to void* (or use the DevPtr() macro).
	 */
	inline CCuArgLoader<n_AlignedOffset(n_offset, n_AlignOf(void*)) + sizeof(void*)> operator ,(void *p_value)
	{
		enum {
			n_param_offset = n_AlignedOffset(n_offset, n_AlignOf(void*)) // where this parameter goes
		};
		typedef CCuArgLoader<n_param_offset + sizeof(void*)> TNextLoader; // same type as the return type

		__SafeParamSetv(m_h_func, n_param_offset, &p_value, sizeof(void*));
		return TNextLoader(m_h_func);
	}
};

/**
 *	@brief converts device pointer (CUdeviceptr) to void* properly
 *
 *	The value is first cast to size_t and only then to void*, the result
 *		can be passed to the void* overload of the CCuArgLoader comma operator.
 *
 *	@param[in] dp_ptr is device pointer (CUdeviceptr)
 *	@return Returns value of dp_ptr cast to size_t and then to void*.
 *	@note This macro is supposed to be used with cuParamsSet*() macros.
 */
#define DevPtr(dp_ptr) ((void*)(size_t)(dp_ptr))

#if !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *
 *	The arguments are loaded one by one using CCuArgLoader (starting at offset 0
 *		and chained by its overloaded comma operator), the accumulated size
 *		of the arguments is then passed to cuParamSetSize().
 *
 *	@param[in] h_func is handle to the kernel function (CUfunction); it is expanded
 *		twice in the macro body, so it should be an expression without side effects
 *	@param[in] ... is list of the rest of parameters, those may be int, float
 *		or void* (use DevPtr() macro to pass CUdeviceptr parameters)
 *	@return Returns result of call to cuParamSetSize().
 *	@note This properly aligns arguments, as long as n_AlignOf() macro returns
 *		correct align. To ensure pointer correctness, use "nvcc --machine 32" (or 64).
 *	@note This requires quite recent compiler with variadic macros support.
 *		On older compilers cuParamsSet0() through cuParamsSet16() may be used
 *		(longer argument lists are also possible, but longer macros aren't implemented).
 */
#define cuParamsSet(h_func, ...) __SafeParamSetSize(h_func, (CCuArgLoader<0>(h_func), __VA_ARGS__).n_Size())

/**
 *	@brief calls all cuda functions required to pass no parameters to a kernel
 *
 *	No argument loader is involved here, the size of the parameters
 *		is set directly to 0.
 *
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet0(h_func) __SafeParamSetSize(h_func, 0)

/**
 *	@brief calls all cuda functions required to pass 1 parameter to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet1(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 2 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet2(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 3 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet3(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 4 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet4(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 5 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet5(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 6 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet6(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 7 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet7(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 8 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet8(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 9 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet9(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 10 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet10(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 11 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet11(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 12 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet12(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 13 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet13(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 14 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet14(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 15 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet15(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

/**
 *	@brief calls all cuda functions required to pass 16 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] ... is list of function arguments (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note This only forwards to cuParamsSet(), the number of arguments is not checked.
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet16(h_func, ...) cuParamsSet(h_func, __VA_ARGS__)

#else // !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@brief calls all cuda functions required to pass no parameters to a kernel
 *
 *	An argument loader starting at offset 0 is constructed and nothing is loaded
 *		with it, so the size passed to cuParamSetSize() is 0.
 *
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet0(h_func) __SafeParamSetSize(h_func, (CCuArgLoader<0>(h_func)).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet1(h_func,a) __SafeParamSetSize(h_func,(CCuArgLoader<0>(h_func),a).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet2(h_func,a,b) __SafeParamSetSize(h_func,(CCuArgLoader<0>(h_func),a,b).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet3(h_func,a,b,c) __SafeParamSetSize(h_func,(CCuArgLoader<0>(h_func),a,b,c).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet4(h_func,a,b,c,d) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet5(h_func,a,b,c,d,e) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet6(h_func,a,b,c,d,e,f) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet7(h_func,a,b,c,d,e,f,g) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] h is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet8(h_func,a,b,c,d,e,f,g,h) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g,h).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] h is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] i is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet9(h_func,a,b,c,d,e,f,g,h,i) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g,h,i).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] h is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] i is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] j is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet10(h_func,a,b,c,d,e,f,g,h,i,j) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g,h,i,j).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] h is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] i is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] j is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] k is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet11(h_func,a,b,c,d,e,f,g,h,i,j,k) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g,h,i,j,k).n_Size())

/**
 *	@brief calls all cuda functions required to pass parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] h is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] i is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] j is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] k is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] l is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 */
#define cuParamsSet12(h_func,a,b,c,d,e,f,g,h,i,j,k,l) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g,h,i,j,k,l).n_Size())

/**
 *	@def cuParamsSet13
 *	@brief calls all cuda functions required to pass 13 parameters to a kernel
 *
 *	Expands to __SafeParamSetSize(h_func, size), where size is n_Size() of an expression
 *		that starts with a CCuArgLoader<0> temporary constructed from h_func and chains
 *		the arguments a through m after it using the comma operator.
 *
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] h is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] i is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] j is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] k is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] l is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] m is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 *	@note h_func appears twice in the expansion, so it may be evaluated more than once;
 *		pass an expression without side effects.
 */
#define cuParamsSet13(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g,h,i,j,k,l,m).n_Size())

/**
 *	@def cuParamsSet14
 *	@brief calls all cuda functions required to pass 14 parameters to a kernel
 *
 *	Expands to __SafeParamSetSize(h_func, size), where size is n_Size() of an expression
 *		that starts with a CCuArgLoader<0> temporary constructed from h_func and chains
 *		the arguments a through n after it using the comma operator.
 *
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] h is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] i is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] j is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] k is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] l is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] m is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] n is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 *	@note h_func appears twice in the expansion, so it may be evaluated more than once;
 *		pass an expression without side effects.
 */
#define cuParamsSet14(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g,h,i,j,k,l,m,n).n_Size())

/**
 *	@def cuParamsSet15
 *	@brief calls all cuda functions required to pass 15 parameters to a kernel
 *
 *	Expands to __SafeParamSetSize(h_func, size), where size is n_Size() of an expression
 *		that starts with a CCuArgLoader<0> temporary constructed from h_func and chains
 *		the arguments a through o after it using the comma operator.
 *
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] h is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] i is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] j is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] k is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] l is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] m is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] n is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] o is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 *	@note h_func appears twice in the expansion, so it may be evaluated more than once;
 *		pass an expression without side effects.
 */
#define cuParamsSet15(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g,h,i,j,k,l,m,n,o).n_Size())

/**
 *	@def cuParamsSet16
 *	@brief calls all cuda functions required to pass 16 parameters to a kernel
 *
 *	Expands to __SafeParamSetSize(h_func, size), where size is n_Size() of an expression
 *		that starts with a CCuArgLoader<0> temporary constructed from h_func and chains
 *		the arguments a through p after it using the comma operator.
 *
 *	@param[in] h_func is handle to the kernel function (CUfunction)
 *	@param[in] a is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] b is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] c is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] d is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] e is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] f is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] g is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] h is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] i is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] j is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] k is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] l is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] m is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] n is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] o is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@param[in] p is kernel function argument (int, float or void* / DevPtr(CUdeviceptr))
 *	@return Returns result of call to cuParamSetSize().
 *	@note See cuParamsSet() macro documentation for more details.
 *	@note h_func appears twice in the expansion, so it may be evaluated more than once;
 *		pass an expression without side effects.
 */
#define cuParamsSet16(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) __SafeParamSetSize(h_func,\
	(CCuArgLoader<0>(h_func),a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p).n_Size())

#endif // !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@brief short-lived holder of a device pointer value, convertible
 *		to the address of that value
 *
 *	Keeps the device pointer value in a void* member and hands out the address
 *		of that member. Intended to be used through the DevPtrParam macro.
 */
class CDevPtrRef {
public:
	/**
	 *	@brief constructor; stores the device pointer value
	 *	@param[in] dp_pointer is device pointer
	 *
	 *	The value of dp_pointer is converted to size_t and then to void*,
	 *		and kept in this object.
	 */
	CDevPtrRef(CUdeviceptr dp_pointer)
		:m_p_pointer(reinterpret_cast<void*>(static_cast<size_t>(dp_pointer)))
	{}

	/**
	 *	@brief conversion to the address of the stored pointer
	 *	@return Returns address of the member holding the device pointer value
	 *		(pointer to be used by cuParamSetv()).
	 *	@note The returned address refers to a member of this object,
	 *		it is only valid while this object exists.
	 */
	operator void*()
	{
		void **p_stored = &m_p_pointer; // address of the member, not the device address itself
		return p_stored;
	}

protected:
	void *m_p_pointer; /**< @brief device pointer value, stored as void* */
};

/**
 *	@def DevPtrParam
 *	@brief helper for cuParamSetv
 *	@param[in] dp_ptr is device pointer (CUdeviceptr type)
 *
 *	Expands to two comma-separated function arguments: a CDevPtrRef temporary
 *		constructed from dp_ptr, followed by sizeof(void*). It therefore needs to be
 *		written directly inside an argument list, in place of a (pointer, size) pair.
 *
 *	Use in the following way:
 *@code
 *	CUfunction h_kernel; // some function
 *	int n_offset = 0; // some offset
 *	CUdeviceptr dp_pointer; cuMemAlloc(&dp_pointer, 1024); // device-side pointer
 *
 *	cuParamSetv(h_kernel, n_offset, DevPtrParam(dp_pointer));@endcode
 *
 *	@note The CDevPtrRef object is a temporary; the pointer obtained from it
 *		should not be kept beyond the statement the macro is used in.
 */
#define DevPtrParam(dp_ptr) CDevPtrRef(dp_ptr), sizeof(void*)

#endif // __CUDA_UTILS_INCLUDED
