/*
								+--------------------------------+
								|                                |
								|   ***  OpenCL utilities  ***   |
								|                                |
								|  Copyright  -tHE SWINe- 2010  |
								|                                |
								|           ClUtils.h            |
								|                                |
								+--------------------------------+
*/

/**
 *	@file ClUtils.h
 *	@author -tHE SWINe-
 *	@brief OpenCL utilities
 *	@date 2010
 *
 *	@date 2010-08-10
 *
 *	changed OpenCL program binaries size in CCLUtils from uint64_t to size_t
 *	as Visual Studio 2008 has some problems with new[uint64_t].
 *
 *	@date 2010-08-13
 *
 *	added CCLProgramCompiler::Get_BuildLog() to enable build error reporting
 *
 *	added new CCLDeviceParams::CCLDeviceParams() taking just cl_device_id device handle
 *
 *	added static version of CCLDeviceParams::b_IsExtensionSupported(), enabling caller
 *	to determine whether a particular device supports given extension without the need to
 *	instantiate the whole CCLDeviceParams.
 *
 *	added static version of CCLDeviceParams::n_GetDeviceInfoString(), enabling caller
 *	to get device info string (eg. the device name) without the need to instantiate
 *	the whole CCLDeviceParams.
 *
 *	added static CCLDeviceParams::n_Query_DeviceProperties(), enabling caller to get
 *	device properties structure without the need to instantiate the whole CCLDeviceParams.
 *
 *	@date 2010-09-28
 *
 *	renamed clParamsSet*() to clSetKernelArgs*() (clParamSet() is CUDA function name
 *	while clSetKernelArg() is OpenCL function name).
 *
 *	added CCLLocalMem so it is possible to specify local memory buffers for kernels.
 *	consider the following code example use:
 *
 *	@code
 *	cl_kernel h_kernel;
 *	cl_mem p_dest, p_src;
 *	size_t n_local_block_size = 128;
 *	clSetKernelArgs(h_kernel, p_dest, p_src, CCLLocalMem(n_local_block_size * sizeof(float)));
 *	// each thread block will have 128 floats (512 bytes) of local memory@endcode
 */

#ifndef __CUDA_UTILS_INCLUDED
#define __CUDA_UTILS_INCLUDED

#include <CL/opencl.h>
#include "../Hash.h"

/**
 *	@def CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS
 *	@brief if defined, CCLProgramCompiler::n_CompileProgram() and
 *		CCLProgramCompiler::n_CompileProgramFile() show the build log after an unsuccessful build
 *	@note The macro is defined unconditionally in this header and carries no value;
 *		only its presence is meaningful.
 */
#define CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS

/**
 *	@def CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS
 *	@brief if defined, CCLProgramCompiler::n_CompileProgram() and
 *		CCLProgramCompiler::n_CompileProgramFile() show the build log even
 *		after a successful build (handy for debugging)
 *	@note The macro is defined unconditionally in this header and carries no value;
 *		only its presence is meaningful.
 */
#define CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS

/**
 *	@brief collection of simple OpenCL helper functions
 *
 *	The class only contains static member functions and holds no data.
 */
class CCLUtils {
public:
	/**
	 *	@brief initializes OpenCL and creates a context
	 *
	 *	@param[out] p_context points to a handle which receives the new
	 *		OpenCL context upon successful return
	 *	@param[in] n_device_type is the required device type (eg. CL_DEVICE_TYPE_GPU)
	 *	@param[in] b_implementation_profile_selection selects between
	 *		"OpenCL implementation-specific" profile selection (true), or profile
	 *		selection based on implemented features (false)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 *
	 *	@note NOTE(review): n_device_type is declared as int while OpenCL defines
	 *		cl_device_type as a 64-bit bitfield; a value such as CL_DEVICE_TYPE_ALL
	 *		(0xFFFFFFFF) does not fit in a 32-bit int - verify how the
	 *		implementation converts it before passing such values.
	 */
	static int n_OpenCL_Init(
		cl_context *p_context,
		int n_device_type = CL_DEVICE_TYPE_GPU,
		bool b_implementation_profile_selection = false);

	/**
	 *	@brief retrieves the list of devices available in a context
	 *
	 *	@param[in] h_context is handle to OpenCL GPU-type context
	 *	@param[out] r_device_list is the list to be filled with device id's
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static int n_GetDeviceList(
		cl_context h_context,
		std::vector<cl_device_id> &r_device_list);

	/**
	 *	@brief finds the device with maximum (theoretical) computing
	 *		power, or the device with more memory
	 *
	 *	@param[in] h_context is handle to OpenCL GPU-type context
	 *
	 *	@return Returns index of the most powerful device, or -1 on error.
	 */
	static int n_Get_MaxGFlops_DeviceId(cl_context h_context);

	/**
	 *	@brief finds the device with maximum (theoretical) computing
	 *		power, or the device with more memory, and yields its handle
	 *
	 *	@param[out] p_device_id points to a device id which receives the handle
	 *		of the most powerful device upon successful return
	 *	@param[in] h_context is handle to OpenCL GPU-type context
	 *
	 *	@return Returns index of the most powerful device, or -1 on error.
	 */
	static int n_Get_MaxGFlops_DeviceId(
		cl_device_id *p_device_id,
		cl_context h_context);
};

/**
 *	@brief OpenCL device parameters wrapper
 *
 *	Reads parameters of a single device and keeps them in memory
 *	in a user-friendly form. Use b_Status() after construction to see
 *	whether the parameters were read successfully.
 */
class CCLDeviceParams {
public:
	/**
	 *	@brief device properties structure (modelled after the CUDA device properties)
	 */
	struct CLdevprop {
		int maxThreadsPerBlock; /**< maximum number of threads per block */
		int maxThreadsDim[3]; /**< maximum size of each dimension of a block */
		int maxGridSize[3]; /**< maximum size of each dimension of a grid */
		int sharedMemPerBlock; /**< shared memory available per block in bytes */
		int totalConstantMemory; /**< constant memory available on device in bytes */
		int SIMDWidth; /**< warp size in threads */
		int memPitch; /**< maximum pitch in bytes allowed by memory copies */
		int regsPerBlock; /**< 32-bit registers available per block */
		int clockRate; /**< clock frequency in kilohertz */
		int textureAlign; /**< alignment requirement for textures */
	};

protected:
	cl_device_id m_h_device; /**< device handle (null if the constructor failed, see b_Status()) */
	std::string m_s_name; /**< device name */
	int m_p_device_caps[2]; /**< device revision; major number first, minor number second */
	int m_n_multiprocessor_num; /**< number of multiprocessors */
	unsigned int m_n_memory_size; /**< device memory size (NOTE(review): unsigned int may be too narrow for devices with 4 GB of memory or more - verify against QueryDeviceParams()) */
	bool m_b_kernel_exec_timeout; /**< kernel execution timeout flag */
	CLdevprop m_t_devprop; /**< device properties structure */

public:
	/**
	 *	@brief constructor; reads parameters of a device given by its handle
	 *
	 *	@param[in] h_device is device handle
	 *
	 *	@note OpenCL must be initialized before calling this function.
	 *	@note It is recommended to call b_Status() afterwards to see if constructor succeeded.
	 */
	CCLDeviceParams(cl_device_id h_device);

	/**
	 *	@brief constructor; reads parameters of a device given by its index in a context
	 *
	 *	@param[in] h_context is handle to OpenCL GPU-type context
	 *	@param[in] n_device_index is zero-based device index (presumably it must
	 *		be less than the number of devices in h_context; the original
	 *		documentation referred to CUDA's cuDeviceGetCount() here)
	 *
	 *	@note OpenCL must be initialized before calling this function.
	 *	@note It is recommended to call b_Status() afterwards to see if constructor succeeded.
	 */
	CCLDeviceParams(cl_context h_context, int n_device_index);

	/**
	 *	@brief prints some basic information about the device
	 *	@param[in] p_fw is output stream (stdout by default)
	 */
	void Dump(FILE *p_fw = stdout);

	/**
	 *	@brief gets device info string of this device
	 *
	 *	@param[out] r_s_str is reference to the output string
	 *	@param[in] n_name is name of the requested value (e.g. CL_DEVICE_EXTENSIONS)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	int n_GetDeviceInfoString(std::string &r_s_str, int n_name);

	/**
	 *	@brief gets device info string of an arbitrary device
	 *		(no CCLDeviceParams instance is required)
	 *
	 *	@param[out] r_s_str is reference to the output string
	 *	@param[in] h_device is target device
	 *	@param[in] n_name is name of the requested value (e.g. CL_DEVICE_EXTENSIONS)
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static int n_GetDeviceInfoString(std::string &r_s_str,
		cl_device_id h_device, int n_name);

	/**
	 *	@brief determines whether this device supports the specified extension
	 *	@param[in] p_s_extension_name is name of the queried extension
	 *	@return Returns true if the extension is supported, or false if it's not.
	 */
	bool b_IsExtensionSupported(const char *p_s_extension_name);

	/**
	 *	@brief determines whether an arbitrary device supports the specified
	 *		extension (no CCLDeviceParams instance is required)
	 *
	 *	@param[in] h_device is target device
	 *	@param[in] p_s_extension_name is name of the queried extension
	 *
	 *	@return Returns true if the extension is supported, or false if it's not.
	 */
	static bool b_IsExtensionSupported(cl_device_id h_device,
		const char *p_s_extension_name);

	/**
	 *	@brief fills device properties structure for an arbitrary device
	 *		(no CCLDeviceParams instance is required)
	 *
	 *	@param[out] r_t_devprop is reference to the output structure
	 *	@param[in] h_device is target device
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static int n_Query_DeviceProperties(CLdevprop &r_t_devprop,
		cl_device_id h_device);

	/**
	 *	@brief determines whether the constructor succeeded
	 *
	 *	If this returns false, the object doesn't contain valid device
	 *	parameters and may not be further used. The test is simply
	 *	whether the stored device handle is non-null.
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool b_Status() const { return !(m_h_device == 0); }

	/**
	 *	@brief gets device handle
	 *	@return Returns device handle.
	 */
	cl_device_id h_Device() const { return m_h_device; }

	/**
	 *	@brief gets device major revision number
	 *	@return Returns device major revision number (the stored int converted to unsigned).
	 */
	unsigned int n_Revision_Major() const { return (unsigned int)m_p_device_caps[0]; }

	/**
	 *	@brief gets device minor revision number
	 *	@return Returns device minor revision number (the stored int converted to unsigned).
	 */
	unsigned int n_Revision_Minor() const { return (unsigned int)m_p_device_caps[1]; }

	/**
	 *	@brief gets device multiprocessor count
	 *	@return Returns device multiprocessor count.
	 */
	size_t n_Multiprocessor_Num() const { return (size_t)m_n_multiprocessor_num; }

	/**
	 *	@brief gets device memory size
	 *	@return Returns device memory size.
	 */
	size_t n_Memory_Size() const { return (size_t)m_n_memory_size; }

	/**
	 *	@brief determines whether the device has kernel execution timeout
	 *	@return Returns true if device has kernel execution timeout, otherwise returns false.
	 */
	bool b_Has_KernelExecTimeout() const { return m_b_kernel_exec_timeout; }

	/**
	 *	@brief gets device properties structure
	 *	@return Returns const reference to the device properties structure
	 *		(valid as long as this object exists).
	 */
	const CLdevprop &t_Properties() const { return m_t_devprop; }

	/**
	 *	@brief gets device name
	 *	@return Returns const reference to the device name
	 *		(valid as long as this object exists).
	 */
	const std::string &s_Name() const { return m_s_name; }

	/**
	 *	@brief gets null-terminated string containing device name
	 *	@return Returns null-terminated string containing device name
	 *		(points to the internal storage of the name string).
	 */
	const char *p_s_Name() const { return m_s_name.c_str(); }

	/**
	 *	@brief determines whether a problem of given size can be executed in a single kernel call
	 *
	 *	@param[in] n_width is problem width
	 *	@param[in] n_height is problem height
	 *	@param[in] n_depth is problem depth
	 *
	 *	@return Returns true if problem fits, otherwise returns
	 *		false (problem needs to be subdivided first).
	 *
	 *	@note This is declared inline, the definition is not a part of the class body.
	 */
	inline bool b_ProblemFitsAtOnce(int n_width, int n_height, int n_depth) const;

	/**
	 *	@brief determines whether a problem of given size can be executed in a single
	 *		kernel call, and if it can, calculates thread block and grid sizes
	 *
	 *	@param[out] p_block_size is pointer where thread block width, height and depth
	 *		is written upon successful return (must be allocated)
	 *	@param[out] p_grid_size is pointer where grid width, height and depth
	 *		is written upon successful return (must be allocated)
	 *	@param[in] n_width is problem width
	 *	@param[in] n_height is problem height
	 *	@param[in] n_depth is problem depth
	 *
	 *	@return Returns true if problem fits, otherwise returns
	 *		false (problem needs to be subdivided first).
	 *
	 *	@note This is declared inline, the definition is not a part of the class body.
	 */
	inline bool CalculateGridParams(int *p_block_size, int *p_grid_size,
		int n_width, int n_height, int n_depth) const;

protected:
	/**
	 *	@brief fills-in device parameters
	 *	@return Returns true on success, false on failure.
	 */
	bool QueryDeviceParams();
};

/**
 *	@brief utility class for calling OpenCL compiler and caching of compiled programs in a file
 *
 *	The class only contains static member functions and the status word flags.
 */
class CCLProgramCompiler {
public:
	/**
	 *	@brief status word flag names
	 *
	 *	Each flag occupies a single distinct bit, so that the flags can be
	 *	combined in and tested against a single integer status word.
	 */
	enum {
		cache_ReadAttempted = 1 << 0, /**< reading of program binaries from cache file was attempted */
		cache_ReadSucceeded = 1 << 1, /**< reading of program binaries from cache file was successful */
		cache_ReadFailed_FileNotFound = 1 << 2, /**< reading of program binaries failed because specified file couldn't be opened */
		cache_ReadFailed_IO = 1 << 3, /**< reading of program binaries failed because of i/o error */
		cache_ReadFailed_OutOfMemory = 1 << 4, /**< reading of program binaries failed because of memory allocation error */
		cache_ReadFailed_SourceChecksum = 1 << 5, /**< reading of program binaries failed because source code is different (cache miss) */
		cache_ReadFailed_BinaryChecksum = 1 << 6, /**< reading of program binaries failed because it's corrupt */
		cache_ReadFailed_CreateProgram = 1 << 7, /**< reading of program binaries failed because clCreateProgramWithBinary() failed (wrong device?) */
		prog_CompiledFromSource = 1 << 8, /**< program needed to be compiled from source code at runtime */
		prog_CreateSucceeded = 1 << 9, /**< program was compiled, clCreateProgramWithSource() succeeded */
		prog_BuildSucceeded = 1 << 10, /**< program was compiled, clBuildProgram() succeeded */
		cache_WriteAttempted = 1 << 11, /**< attempted to write cache file with program binaries */
		cache_WriteSucceeded = 1 << 12 /**< writing program binaries was successful (4096; this used to be mistyped as 4069, which overlapped other flags) */
	};

	/**
	 *	@brief prints human-readable information from status word to specified stream
	 *
	 *	@param[in] n_status_word is status word, returned in last
	 *		parameter of n_CompileProgram() or n_CompileProgramFile()
	 *	@param[in] p_fw is output stream (stdout by default)
	 */
	static void Dump_StatusWord(int n_status_word, FILE *p_fw = stdout);

	/**
	 *	@brief compiles OpenCL "C" program, while allowing for its binary to be cached
	 *
	 *	In case program is compiled from source code and build fails, build log is printed
	 *	for each device to stderr (CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS must be defined).
	 *	In case CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS is defined, build log
	 *	is displayed even if build succeeds (only if it's not empty).
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[out] p_program is pointer to program handle, which is written upon successful return
	 *	@param[in] p_s_source is program source code
	 *	@param[in] n_device_num is number of devices program is compiled for, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_device_list is list of devices program is compiled for (or NULL
	 *		if n_device_num is 0), for more information refer to clBuildProgram() function documentation
	 *	@param[in] p_s_build_options is string with OpenCL build options, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_s_cache_file is either null, or filename where compiled
	 *		binary (along with source checksum) should be found/stored
	 *	@param[out] p_status_word is pointer to integer where status word
	 *		(combination of the cache_* and prog_* flags) should be written, or NULL
	 *
	 *	@return Returns CL_SUCCESS on success, or other OpenCL error code on failure. The function
	 *		succeeds regardless of whether program failed to load from file, or whether binaries
	 *		couldn't be written to cache file.
	 *
	 *	@note Without specifying at least one device in the list, caching feature
	 *		will not be able to save/load the program.
	 *	@note Without specifying at least one device in the list, build log won't be displayed on build error.
	 */
	static int n_CompileProgram(cl_context h_context, cl_program *p_program,
		const char *p_s_source, size_t n_device_num = 0, const cl_device_id *p_device_list = 0,
		const char *p_s_build_options = "", const char *p_s_cache_file = 0, int *p_status_word = 0);

	/**
	 *	@brief compiles OpenCL "C" program from file, while allowing for its binary to be cached
	 *
	 *	In case program is compiled from source code and build fails, build log is printed
	 *	for each device to stderr (CL_PROGRAM_COMPILER_DISPLAY_BUILD_ERRORS must be defined).
	 *	In case CL_PROGRAM_COMPILER_DISPLAY_SUCCESSFUL_BUILD_WARNINGS is defined, build log
	 *	is displayed even if build succeeds (only if it's not empty).
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[out] p_program is pointer to program handle, which is written upon successful return
	 *	@param[in] p_s_source_file is name of file with program source code
	 *	@param[in] n_device_num is number of devices program is compiled for, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_device_list is list of devices program is compiled for (or NULL
	 *		if n_device_num is 0), for more information refer to clBuildProgram() function documentation
	 *	@param[in] p_s_build_options is string with OpenCL build options, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_s_cache_file is either null, or filename where compiled
	 *		binary (along with source checksum) should be found/stored
	 *	@param[out] p_status_word is pointer to integer where status word
	 *		(combination of the cache_* and prog_* flags) should be written, or NULL
	 *
	 *	@return Returns CL_SUCCESS on success, or other OpenCL error code on failure. The function
	 *		succeeds regardless of whether program failed to load from file, or whether binaries
	 *		couldn't be written to cache file.
	 *
	 *	@note Without specifying at least one device in the list, caching feature
	 *		will not be able to save/load the program.
	 *	@note Without specifying at least one device in the list, build log won't be displayed on build error.
	 */
	static int n_CompileProgramFile(cl_context h_context, cl_program *p_program,
		const char *p_s_source_file, size_t n_device_num = 0, const cl_device_id *p_device_list = 0,
		const char *p_s_build_options = "", const char *p_s_cache_file = 0, int *p_status_word = 0);

	/**
	 *	@brief gets build status and build log for specified program and device
	 *
	 *	@param[out] r_s_build_log is string where build log is written upon successful return
	 *	@param[out] r_n_build_status is reference to build status value,
	 *		which is written upon successful return
	 *	@param[in] h_program is program in question
	 *	@param[in] h_device is (one of) target device(s) for which program was built
	 *
	 *	@return Returns OpenCL error code (CL_SUCCESS on success).
	 */
	static int Get_BuildLog(std::string &r_s_build_log, cl_build_status &r_n_build_status,
		cl_program h_program, cl_device_id h_device);

	/**
	 *	@brief calculates SHA1 hash of source code and build options strings
	 *
	 *	@param[in] p_s_source is OpenCL "C" source code
	 *	@param[in] p_s_build_options is OpenCL compiler options string (may be NULL)
	 *
	 *	@return Returns SHA1 of both strings.
	 */
	static TSHA1 t_Hash_ProgramSource_BuildOptions(const char *p_s_source, const char *p_s_build_options = 0);

	/**
	 *	@brief reads program binaries and creates cl_program
	 *
	 *	@param[in] p_s_filename is file, containing binaries (saved using WriteProgramBinaries())
	 *	@param[in] t_hash is hash of source code and build options (obtained using t_Hash_ProgramSource_BuildOptions())
	 *	@param[in] h_context is OpenCL context
	 *	@param[out] p_program is pointer to program handle, which is written upon successful return
	 *	@param[in] n_device_num is number of devices program is compiled for, for more
	 *		information refer to clBuildProgram() function documentation
	 *	@param[in] p_device_list is list of devices program is compiled for (or NULL
	 *		if n_device_num is 0), for more information refer to clBuildProgram() function documentation
	 *
	 *	@return Returns cache_ReadSucceeded on success, or one of cache_ReadFailed_* on failure.
	 */
	static int n_ReadProgramBinaries(const char *p_s_filename, TSHA1 t_hash, cl_context h_context,
		cl_program *p_program, size_t n_device_num, const cl_device_id *p_device_list);

	/**
	 *	@brief writes binaries of cl_program to a file
	 *
	 *	@param[in] h_program is handle to an existing OpenCL program
	 *	@param[in] t_hash is hash of program source code and build options (obtained using t_Hash_ProgramSource_BuildOptions())
	 *	@param[in] p_s_filename is output file name
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool WriteProgramBinaries(cl_program h_program, TSHA1 t_hash, const char *p_s_filename);
};

/**
 *	@brief error-checking helper for the kernel argument loading macros
 *
 *	This is the OpenCL counterpart of the CUDA cutilDrvSafeCall() facility;
 *	it is used through the __SafeParamSet() macro.
 */
class CCLArgLoaderHelper {
public:
	/**
	 *	@brief checks result of an OpenCL call (counterpart of CUDA __cuSafeCall())
	 *
	 *	@param[in] n_error_code is OpenCL error code
	 *	@param[in] p_s_file is filename of source file (preprocessor __FILE__)
	 *	@param[in] n_line is source file line (preprocessor __LINE__)
	 *
	 *	@return Returns value of n_error_code.
	 *
	 *	@note According to the documentation of the macros using this function,
	 *		an error is printed and exit(-1) is called if n_error_code is not
	 *		CL_SUCCESS (the implementation is not a part of this header).
	 */
	static int __SafeCall(
		int n_error_code,
		const char *p_s_file,
		int n_line);
};

/**
 *	@brief checks errors in clSetKernelArg() calls
 *
 *	Passes the result of the call, along with the current source file
 *	and line, to CCLArgLoaderHelper::__SafeCall().
 *
 *	@param[in] x is (return value of) call to clSetKernelArg()
 */
#define __SafeParamSet(x) CCLArgLoaderHelper::__SafeCall((x), __FILE__, __LINE__)

/**
 *	@brief calls clSetKernelArg with given parameters and makes sure
 *		it returns CL_SUCCESS (prints error and calls exit(-1) if it doesn't)
 *
 *	@param[in] k is kernel function handle
 *	@param[in] i is parameter zero-based index
 *	@param[in] v is value of the parameter (must be an lvalue, its address
 *		is taken and its size is determined using sizeof)
 *
 *	@return Returns result of call to clSetKernelArg().
 *
 *	@note The macro arguments are parenthesized in the expansion,
 *		so that compound expressions can be passed safely.
 */
#define __SafeParamSeti(k,i,v) __SafeParamSet(clSetKernelArg((k),(i),sizeof(v),&(v)))

/**
 *	@brief calls clSetKernelArg with given parameters and makes sure
 *		it returns CL_SUCCESS (prints error and calls exit(-1) if it doesn't)
 *
 *	@param[in] k is kernel function handle
 *	@param[in] i is parameter zero-based index
 *	@param[in] v is value of the parameter (must be an lvalue, its address
 *		is taken and its size is determined using sizeof)
 *
 *	@return Returns result of call to clSetKernelArg().
 *
 *	@note This expands to exactly the same code as __SafeParamSeti(), the two
 *		names exist to mirror the integer / float variants of the CUDA api.
 */
#define __SafeParamSetf(k,i,v) __SafeParamSet(clSetKernelArg((k),(i),sizeof(v),&(v)))

/**
 *	@brief calls clSetKernelArg with given parameters and makes sure
 *		it returns CL_SUCCESS (prints error and calls exit(-1) if it doesn't)
 *
 *	@param[in] k is kernel function handle
 *	@param[in] i is parameter zero-based index
 *	@param[in] s is size of the parameter
 *	@param[in] v is pointer to value of the parameter
 *
 *	@return Returns result of call to clSetKernelArg().
 *
 *	@note This macro has the same order and count of parameters as clSetKernelArg() does.
 */
#define __SafeParamSetv(k,i,s,v) __SafeParamSet(clSetKernelArg((k),(i),(s),(v)))

/**
 *	@brief local memory allocation size tag
 *
 *	A thin wrapper around a byte count; passing it to clSetKernelArgs() marks
 *	the corresponding kernel argument as a local memory buffer of that size.
 */
class CCLLocalMem {
protected:
	size_t m_n_size; /**< local memory allocation size */

public:
	/**
	 *	@brief constructor; stores the allocation size
	 *	@param[in] n_size is local memory allocation size
	 */
	CCLLocalMem(size_t n_size)
		:m_n_size(n_size)
	{}

	/**
	 *	@brief gets local memory allocation size
	 *	@return Returns the size passed to the constructor.
	 */
	size_t n_Size() const { return m_n_size; }
};

/**
 *	@brief OpenCL argument loader for kernel functions
 *
 *	Each overloaded comma operator sets kernel argument with index n_index
 *	and returns a loader for index n_index + 1, so that a chain of comma
 *	operators sets consecutive kernel arguments.
 *
 *	@tparam n_index is zero-based index of the kernel argument set by this loader
 *
 *	@note This class shouldn't be used directly. Use clSetKernelArgs() macro instead.
 */
template <int n_index>
class CCLArgLoader {
protected:
	cl_kernel m_h_func; /**< kernel the arguments are being set for */

public:
	/**
	 *	@brief constructor; stores the kernel handle
	 *	@param[in] h_func is handle of OpenCL kernel the arguments are being set for
	 */
	CCLArgLoader(cl_kernel h_func)
		:m_h_func(h_func)
	{}

	/**
	 *	@brief gets count of arguments
	 *	@return Returns n_index + 1.
	 *	@note NOTE(review): a loader with index n_index has been preceded by only
	 *		n_index set arguments, so this might be off by one with respect to
	 *		"count of loaded parameters" - confirm the intent before relying on it.
	 */
	int n_Count() const { return 1 + n_index; }

	/**
	 *	@brief sets a single integer argument
	 *	@param[in] n_value is value of the argument
	 *	@return Returns loader for the next argument.
	 */
	CCLArgLoader<n_index + 1> operator ,(int n_value)
	{
		__SafeParamSetv(m_h_func, n_index, sizeof(int), &n_value);
		return CCLArgLoader<n_index + 1>(m_h_func);
	}

	/**
	 *	@brief sets a single float argument
	 *	@param[in] f_value is value of the argument
	 *	@return Returns loader for the next argument.
	 */
	CCLArgLoader<n_index + 1> operator ,(float f_value)
	{
		__SafeParamSetv(m_h_func, n_index, sizeof(float), &f_value);
		return CCLArgLoader<n_index + 1>(m_h_func);
	}

	/**
	 *	@brief sets a single memory object argument
	 *	@param[in] p_value is memory object handle (the handle itself is passed
	 *		to clSetKernelArg(), by address and with size of cl_mem)
	 *	@return Returns loader for the next argument.
	 */
	CCLArgLoader<n_index + 1> operator ,(cl_mem p_value)
	{
		__SafeParamSetv(m_h_func, n_index, sizeof(cl_mem), &p_value);
		return CCLArgLoader<n_index + 1>(m_h_func);
	}

	/**
	 *	@brief sets a single local memory argument
	 *	@param[in] t_local_mem_cfg is local memory allocation size (the size is
	 *		passed to clSetKernelArg() along with null value pointer)
	 *	@return Returns loader for the next argument.
	 */
	CCLArgLoader<n_index + 1> operator ,(CCLLocalMem t_local_mem_cfg)
	{
		const size_t n_local_size = t_local_mem_cfg.n_Size();
		__SafeParamSetv(m_h_func, n_index, n_local_size, 0);
		return CCLArgLoader<n_index + 1>(m_h_func);
	}
};

#if !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@brief calls all OpenCL functions required to pass parameters to a kernel
 *
 *	Expands to a comma expression beginning with CCLArgLoader<0>(h_func); each
 *	of the following arguments is consumed by one of the overloaded
 *	CCLArgLoader comma operators, which sets kernel argument with the
 *	corresponding zero-based index.
 *
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of the kernel arguments, those may be
 *		int, float, cl_mem or CCLLocalMem
 *
 *	@note This requires quite recent compiler with variadic macros support.
 *		On older compilers clSetKernelArgs0() through clSetKernelArgs16() may be used
 *		(longer argument lists are also possible, but longer macros aren't implemented).
 */
#define clSetKernelArgs(h_func, ...) (CCLArgLoader<0>(h_func), __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass no parameters to a kernel
 *
 *	Expands to an empty statement; h_func is not evaluated.
 *
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs0(h_func) do { /* do nothing */ } while(0)

/**
 *	@brief calls all OpenCL functions required to pass 1 parameter to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs1(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 2 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs2(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 3 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs3(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 4 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs4(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 5 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs5(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 6 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs6(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 7 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs7(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 8 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs8(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 9 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs9(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 10 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs10(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 11 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs11(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 12 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs12(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 13 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs13(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 14 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs14(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 15 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs15(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

/**
 *	@brief calls all OpenCL functions required to pass 16 parameters to a kernel
 *	@param[in] h_func is handle to the kernel function (cl_kernel)
 *	@param[in] ... is list of kernel arguments (int, float, cl_mem or CCLLocalMem)
 *	@note This simply forwards to clSetKernelArgs(), the number of arguments is not checked.
 *	@note See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs16(h_func, ...) clSetKernelArgs(h_func, __VA_ARGS__)

#else // !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@brief counterpart of clSetKernelArgs1() and up for kernels without arguments
 *	@param[in] h_kernel is handle to the kernel (cl_kernel); it is not evaluated
 *	@note Expands to an empty do-while statement, so it needs to be used
 *		as a statement (it has no value and can not appear in an expression).
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs0(h_kernel) \
	do { /* there is nothing to set */ } while(0)

/**
 *	@brief passes one argument to a kernel
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the only kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the argument.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs1(h_kernel, arg0) \
	(CCLArgLoader<0>(h_kernel), arg0)

/**
 *	@brief passes two arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs2(h_kernel, arg0, arg1) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1)

/**
 *	@brief passes three arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs3(h_kernel, arg0, arg1, arg2) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2)

/**
 *	@brief passes four arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs4(h_kernel, arg0, arg1, arg2, arg3) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3)

/**
 *	@brief passes five arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs5(h_kernel, arg0, arg1, arg2, arg3, arg4) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4)

/**
 *	@brief passes six arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs6(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5)

/**
 *	@brief passes seven arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs7(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6)

/**
 *	@brief passes eight arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@param[in] arg7 is the eighth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs8(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7)

/**
 *	@brief passes nine arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@param[in] arg7 is the eighth kernel argument (int, float or cl_mem)
 *	@param[in] arg8 is the ninth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs9(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8)

/**
 *	@brief passes ten arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@param[in] arg7 is the eighth kernel argument (int, float or cl_mem)
 *	@param[in] arg8 is the ninth kernel argument (int, float or cl_mem)
 *	@param[in] arg9 is the tenth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs10(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9)

/**
 *	@brief passes eleven arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@param[in] arg7 is the eighth kernel argument (int, float or cl_mem)
 *	@param[in] arg8 is the ninth kernel argument (int, float or cl_mem)
 *	@param[in] arg9 is the tenth kernel argument (int, float or cl_mem)
 *	@param[in] arg10 is the eleventh kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs11(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10)

/**
 *	@brief passes twelve arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@param[in] arg7 is the eighth kernel argument (int, float or cl_mem)
 *	@param[in] arg8 is the ninth kernel argument (int, float or cl_mem)
 *	@param[in] arg9 is the tenth kernel argument (int, float or cl_mem)
 *	@param[in] arg10 is the eleventh kernel argument (int, float or cl_mem)
 *	@param[in] arg11 is the twelfth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs12(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11)

/**
 *	@brief passes thirteen arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@param[in] arg7 is the eighth kernel argument (int, float or cl_mem)
 *	@param[in] arg8 is the ninth kernel argument (int, float or cl_mem)
 *	@param[in] arg9 is the tenth kernel argument (int, float or cl_mem)
 *	@param[in] arg10 is the eleventh kernel argument (int, float or cl_mem)
 *	@param[in] arg11 is the twelfth kernel argument (int, float or cl_mem)
 *	@param[in] arg12 is the thirteenth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs13(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11, arg12) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11, arg12)

/**
 *	@brief passes fourteen arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@param[in] arg7 is the eighth kernel argument (int, float or cl_mem)
 *	@param[in] arg8 is the ninth kernel argument (int, float or cl_mem)
 *	@param[in] arg9 is the tenth kernel argument (int, float or cl_mem)
 *	@param[in] arg10 is the eleventh kernel argument (int, float or cl_mem)
 *	@param[in] arg11 is the twelfth kernel argument (int, float or cl_mem)
 *	@param[in] arg12 is the thirteenth kernel argument (int, float or cl_mem)
 *	@param[in] arg13 is the fourteenth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs14(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11, arg12, arg13) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11, arg12, arg13)

/**
 *	@brief passes fifteen arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@param[in] arg7 is the eighth kernel argument (int, float or cl_mem)
 *	@param[in] arg8 is the ninth kernel argument (int, float or cl_mem)
 *	@param[in] arg9 is the tenth kernel argument (int, float or cl_mem)
 *	@param[in] arg10 is the eleventh kernel argument (int, float or cl_mem)
 *	@param[in] arg11 is the twelfth kernel argument (int, float or cl_mem)
 *	@param[in] arg12 is the thirteenth kernel argument (int, float or cl_mem)
 *	@param[in] arg13 is the fourteenth kernel argument (int, float or cl_mem)
 *	@param[in] arg14 is the fifteenth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs15(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11, arg12, arg13, arg14) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11, arg12, arg13, arg14)

/**
 *	@brief passes sixteen arguments to a kernel, in the order given
 *	@param[in] h_kernel is handle to the kernel (cl_kernel)
 *	@param[in] arg0 is the first kernel argument (int, float or cl_mem)
 *	@param[in] arg1 is the second kernel argument (int, float or cl_mem)
 *	@param[in] arg2 is the third kernel argument (int, float or cl_mem)
 *	@param[in] arg3 is the fourth kernel argument (int, float or cl_mem)
 *	@param[in] arg4 is the fifth kernel argument (int, float or cl_mem)
 *	@param[in] arg5 is the sixth kernel argument (int, float or cl_mem)
 *	@param[in] arg6 is the seventh kernel argument (int, float or cl_mem)
 *	@param[in] arg7 is the eighth kernel argument (int, float or cl_mem)
 *	@param[in] arg8 is the ninth kernel argument (int, float or cl_mem)
 *	@param[in] arg9 is the tenth kernel argument (int, float or cl_mem)
 *	@param[in] arg10 is the eleventh kernel argument (int, float or cl_mem)
 *	@param[in] arg11 is the twelfth kernel argument (int, float or cl_mem)
 *	@param[in] arg12 is the thirteenth kernel argument (int, float or cl_mem)
 *	@param[in] arg13 is the fourteenth kernel argument (int, float or cl_mem)
 *	@param[in] arg14 is the fifteenth kernel argument (int, float or cl_mem)
 *	@param[in] arg15 is the sixteenth kernel argument (int, float or cl_mem)
 *	@note Expands to a comma expression made of CCLArgLoader<0>(h_kernel)
 *		followed by the arguments.
 *		See clSetKernelArgs() macro documentation for more details.
 */
#define clSetKernelArgs16(h_kernel, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15) \
	(CCLArgLoader<0>(h_kernel), arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, \
	arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15)

#endif // !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400

/**
 *	@def DevPtrParam
 *	@brief produces the size / value argument pair of clSetKernelArg()
 *		for a device pointer
 *	@param[in] dp_buffer is device pointer (cl_mem type); its address is taken,
 *		so it needs to be a variable (an lvalue), not a temporary
 *
 *	Expands to two comma-separated expressions: sizeof(cl_mem) and the address
 *	of dp_buffer cast to void*. Use in the following way:
 *@code
 *	cl_kernel h_kernel; // kernel to set the argument of
 *	cl_uint n_index = 0; // zero-based index of the argument
 *	cl_int n_err_num; // memory allocation result
 *	cl_mem dp_pointer = clCreateBuffer(h_gpu_context, CL_MEM_READ_WRITE, 1024, NULL, &n_err_num); // device-side pointer
 *
 *	clSetKernelArg(h_kernel, n_index, DevPtrParam(dp_pointer));@endcode
 */
#define DevPtrParam(dp_buffer) \
	sizeof(cl_mem), (void*)&(dp_buffer)

#endif //__CUDA_UTILS_INCLUDED
