/*
								+--------------------------------+
								|                                |
								| *** CUDA to OpenCL pseudo- *** |
								| *** automatic porting lib  *** |
								|                                |
								|  Copyright  -tHE SWINe- 2010  |
								|                                |
								|          CUDA_to_CL.h          |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __CUDA_TO_OPENCL_CONVERSION_HEADER_INCLUDED
#define __CUDA_TO_OPENCL_CONVERSION_HEADER_INCLUDED

/**
 *	@file CUDA_to_CL.h
 *	@author -tHE SWINe-
 *	@brief CUDA to OpenCL pseudoautomatic porting library
 *	@date 2010
 *
 *	@date 2010-09-28
 *
 *	renamed clParamsSet*() to clSetKernelArgs*() (clParamSet() is CUDA function name
 *	while clSetKernelArg() is OpenCL function name).
 *
 *	@date 2012-06-19
 *
 *	Added \#pragma once.
 *
 */

#include <CL/opencl.h>
#include <stdio.h>
#include "ClUtils.h"

typedef int CUresult; // CUDA result code; holds the CUDA_* values of the anonymous enum declared later in this file
typedef cl_device_id CUdevice; // a CUDA device handle is represented by an OpenCL device id
typedef cl_command_queue CUcontext; // a CUDA context is emulated by an OpenCL command queue (cuCtxCreate() creates one)

//typedef cl_kernel CUfunction; // this is not enough

/**
 *	@brief CUDA function (kernel) handle emulation
 *
 *	A plain cl_kernel cannot stand in for CUfunction, because CUDA associates
 *	a block shape with the function, while OpenCL passes the local work size
 *	at launch time. This class pairs the kernel handle with that block shape.
 */
class CUfunction {
protected:
	cl_kernel m_p_kernel; /**< @brief wrapped OpenCL kernel handle */
	size_t m_n_block_size[3]; /**< @brief emulated CUDA block shape (width, height, depth) */

public:
	/**
	 *	@brief default constructor
	 *
	 *	Stores a null kernel handle and a block shape of 1x1x1.
	 */
	CUfunction()
		:m_p_kernel(0)
	{
		SetBlockDimensions(1, 1, 1); // default block size ... ?
	}

	/**
	 *	@brief constructor from an existing kernel handle
	 *
	 *	Stores the given kernel handle and a block shape of 1x1x1.
	 *	This constructor is not explicit, so a cl_kernel converts implicitly.
	 *
	 *	@param[in] p_kernel is kernel handle
	 */
	CUfunction(cl_kernel p_kernel)
		:m_p_kernel(p_kernel)
	{
		SetBlockDimensions(1, 1, 1); // default block size ... ?
	}

	/**
	 *	@brief conversion to the kernel handle, for use in OpenCL calls
	 *	@return Returns the wrapped kernel handle.
	 */
	inline operator cl_kernel() const
	{
		return m_p_kernel;
	}

	/**
	 *	@brief address-of operator, yielding a pointer to the wrapped kernel handle
	 *
	 *	Taking the address of a CUfunction therefore gives cl_kernel*,
	 *	not CUfunction*.
	 *
	 *	@return Returns pointer to the kernel handle stored in this object.
	 */
	inline cl_kernel *operator &()
	{
		return &m_p_kernel;
	}

	/**
	 *	@brief sets block dimensions
	 *
	 *	@param[in] n_block_width is block width
	 *	@param[in] n_block_height is block height
	 *	@param[in] n_block_depth is block depth
	 *
	 *	@note In CUDA the block shape belongs to the kernel globally; here it
	 *		is stored in this instance only. Refer to the copy-constructor
	 *		documentation for the consequences.
	 */
	inline void SetBlockDimensions(size_t n_block_width, size_t n_block_height, size_t n_block_depth)
	{
		size_t *p_shape = m_n_block_size;
		p_shape[0] = n_block_width;
		p_shape[1] = n_block_height;
		p_shape[2] = n_block_depth;
	}

	/**
	 *	@brief gets block width
	 *	@return Returns block width (first block dimension).
	 */
	inline size_t GetBlockWidth() const
	{
		return m_n_block_size[0];
	}

	/**
	 *	@brief gets block height
	 *	@return Returns block height (second block dimension).
	 */
	inline size_t GetBlockHeight() const
	{
		return m_n_block_size[1];
	}

	/**
	 *	@brief gets block depth
	 *	@return Returns block depth (third block dimension).
	 */
	inline size_t GetBlockDepth() const
	{
		return m_n_block_size[2];
	}

protected:
	/**
	 *	@brief copy-constructor (protected; should be used with caution)
	 *
	 *	Copying decouples the cl_kernel from its associated state: each copy
	 *	carries its own block shape, so setting block dimensions on one
	 *	instance does not change the others. There should be a single
	 *	CUfunction instance per kernel. Handling this properly would need
	 *	a global pool with reference counting, which is considered overkill,
	 *	because CUfunctions are rarely copied and block dimensions are
	 *	usually set just before each kernel invocation.
	 *
	 *	@param[in] r_function is copied function
	 */
	CUfunction(const CUfunction &r_function)
		:m_p_kernel(r_function.m_p_kernel)
	{
		SetBlockDimensions(r_function.m_n_block_size[0],
			r_function.m_n_block_size[1], r_function.m_n_block_size[2]);
	}

	/**
	 *	@brief copy-operator (protected; should be used with caution)
	 *
	 *	Has the same decoupling side-effect as the copy-constructor: the
	 *	kernel handle and the block shape are copied by value, and later
	 *	changes to one instance are not seen by the other.
	 *
	 *	@param[in] r_function is copied function
	 *
	 *	@return Returns reference to this.
	 */
	CUfunction &operator =(const CUfunction &r_function)
	{
		m_p_kernel = r_function.m_p_kernel;
		SetBlockDimensions(r_function.m_n_block_size[0],
			r_function.m_n_block_size[1], r_function.m_n_block_size[2]);

		return *this;
	}
};

typedef cl_mem CUdeviceptr; // a CUDA device pointer is represented by an OpenCL memory object handle (see cuMemAlloc())
typedef cl_program CUmodule; // a CUDA module is represented by an OpenCL program handle (see cuModuleLoad())
// basic data types

/**
 *	@brief holder of the "active" OpenCL context
 *
 *	CUDA functions take no context parameter, while some OpenCL functions
 *	do. So that CUDA code can be ported without editing too much of it, the
 *	OpenCL context is kept in a single static variable.
 *
 *	@note The OpenCL specification has no notion of an active context. Most
 *	applications get by with a single context (which may contain several
 *	compute devices); multiple contexts are only needed when the application
 *	uses several different platforms. Since CUDA is exclusively for GPUs,
 *	a single platform, and hence a single context per application, should
 *	be sufficient.
 */
class COpenCLActiveContext {
protected:
	static cl_context m_h_active_context; /**< @brief the active context handle (declared here only; no definition is visible in this header) */

public:
	/**
	 *	@brief gets the active context
	 *	@return Returns the stored context handle.
	 */
	static inline cl_context h_Get()
	{
		return m_h_active_context;
	}

	/**
	 *	@brief replaces the active context
	 *	@param[in] h_context is the new active context handle (the previous
	 *		handle is overwritten, not released)
	 */
	static inline void Set(cl_context h_context)
	{
		m_h_active_context = h_context;
	}

	/**
	 *	@brief compares a context handle with the active one
	 *	@param[in] h_context is the context handle to test
	 *	@return Returns true if h_context equals the stored handle, otherwise false.
	 */
	static inline bool b_IsActive(cl_context h_context)
	{
		return h_context == m_h_active_context;
	}

	/**
	 *	@brief determines whether an active context is set
	 *	@return Returns true if the stored handle is not null, otherwise false.
	 */
	static inline bool b_IsContext()
	{
		return !b_IsActive(0);
	}
};

/**
 *	@brief holder of the "active" device (command queue) for memcpy and launch commands
 *
 *	CUDA cuMemcpy*to*() functions have no device handle parameter, while
 *	the OpenCL functions need a command queue handle. The queue to be used
 *	is therefore kept in a single static variable (CUcontext is a typedef
 *	of cl_command_queue in this header).
 *
 *	@remarks When porting code that works with more devices, it may be
 *	necessary to switch the queue explicitly using the
 *	CUDAtoCL_SetActiveDevice macro.
 */
class COpenCLActiveDevice {
protected:
	static CUcontext m_h_active_device; /**< @brief the active command queue handle (declared here only; no definition is visible in this header) */

public:
	/**
	 *	@brief gets the active command queue
	 *	@return Returns the stored command queue handle.
	 */
	static inline CUcontext h_Get()
	{
		return m_h_active_device;
	}

	/**
	 *	@brief replaces the active command queue
	 *	@param[in] h_device is the new active command queue handle (the
	 *		previous handle is overwritten, not released)
	 */
	static inline void Set(CUcontext h_device)
	{
		m_h_active_device = h_device;
	}

	/**
	 *	@brief compares a command queue handle with the active one
	 *	@param[in] h_device is the command queue handle to test
	 *	@return Returns true if h_device equals the stored handle, otherwise false.
	 */
	static inline bool b_IsActive(CUcontext h_device)
	{
		return h_device == m_h_active_device;
	}

	/**
	 *	@brief determines whether an active command queue is set
	 *	@return Returns true if the stored handle is not null, otherwise false.
	 */
	static inline bool b_IsDevice()
	{
		return !b_IsActive(0);
	}
};

/**
 *	@def CUDAtoCL_SetActiveDevice
 *	@brief sets device which should be affected by following cuMemcpy*to*() or cuLaunchGrid() functions
 *
 *	Expands to COpenCLActiveDevice::Set(d), so the argument is a command
 *	queue handle (CUcontext). For more information, refer to
 *	COpenCLActiveDevice documentation.
 *
 *	@param[in] d is the command queue handle to make active
 *
 *	@note This is automatically called in cuCtxCreate()
 */
#define CUDAtoCL_SetActiveDevice(d) COpenCLActiveDevice::Set(d)

/**
 *	@brief CUDA error code enumerants.
 *
 *	Those are copied from original CUDA header so they have the same value
 *	as in CUDA program in order to maintain consistency just in case there's
 *	someone who doesn't use enum names, but error code values directly.
 *
 *	@note The enumeration is anonymous; functions in this header return
 *		these constants through CUresult, which is a typedef of int.
 */
enum {
    CUDA_SUCCESS                    = 0,        ///< No errors
    CUDA_ERROR_INVALID_VALUE        = 1,        ///< Invalid value
    CUDA_ERROR_OUT_OF_MEMORY        = 2,        ///< Out of memory
    CUDA_ERROR_NOT_INITIALIZED      = 3,        ///< Driver not initialized
    CUDA_ERROR_DEINITIALIZED        = 4,        ///< Driver deinitialized

    CUDA_ERROR_NO_DEVICE            = 100,      ///< No CUDA-capable device available
    CUDA_ERROR_INVALID_DEVICE       = 101,      ///< Invalid device

    CUDA_ERROR_INVALID_IMAGE        = 200,      ///< Invalid kernel image
    CUDA_ERROR_INVALID_CONTEXT      = 201,      ///< Invalid context
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT = 202,   ///< Context already current
    CUDA_ERROR_MAP_FAILED           = 205,      ///< Map failed
    CUDA_ERROR_UNMAP_FAILED         = 206,      ///< Unmap failed
    CUDA_ERROR_ARRAY_IS_MAPPED      = 207,      ///< Array is mapped
    CUDA_ERROR_ALREADY_MAPPED       = 208,      ///< Already mapped
    CUDA_ERROR_NO_BINARY_FOR_GPU    = 209,      ///< No binary for GPU
    CUDA_ERROR_ALREADY_ACQUIRED     = 210,      ///< Already acquired
    CUDA_ERROR_NOT_MAPPED           = 211,      ///< Not mapped
    CUDA_ERROR_NOT_MAPPED_AS_ARRAY   = 212,      ///< Mapped resource not available for access as an array
    CUDA_ERROR_NOT_MAPPED_AS_POINTER = 213,      ///< Mapped resource not available for access as a pointer
    CUDA_ERROR_ECC_UNCORRECTABLE    = 214,      ///< Uncorrectable ECC error detected

    CUDA_ERROR_INVALID_SOURCE       = 300,      ///< Invalid source
    CUDA_ERROR_FILE_NOT_FOUND       = 301,      ///< File not found

    CUDA_ERROR_INVALID_HANDLE       = 400,      ///< Invalid handle

    CUDA_ERROR_NOT_FOUND            = 500,      ///< Not found

    CUDA_ERROR_NOT_READY            = 600,      ///< CUDA not ready

    CUDA_ERROR_LAUNCH_FAILED        = 700,      ///< Launch failed
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES = 701,   ///< Launch exceeded resources
    CUDA_ERROR_LAUNCH_TIMEOUT       = 702,      ///< Launch exceeded timeout
    CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING = 703, ///< Launch with incompatible texturing

    CUDA_ERROR_POINTER_IS_64BIT     = 800,      ///< Attempted to retrieve 64-bit pointer via 32-bit API function
    CUDA_ERROR_SIZE_IS_64BIT        = 801,      ///< Attempted to retrieve 64-bit size via 32-bit API function

    CUDA_ERROR_UNKNOWN              = 999       ///< Unknown error
};
// constants

/**
 *	@brief device parameters wrapper
 *
 *	This class is able to read-in device parameters,
 *		and hold them in memory in user-friendly form.
 */
class CEmulatedCuDeviceParams : public CCLDeviceParams {
public:
	typedef CCLDeviceParams::CLdevprop CUdevprop; /**< @brief CUDA device properties structure */

protected:
	int m_n_index;

public:
	/**
	 *	@brief default constructor
	 *
	 *	Reads device parameters.
	 *
	 *	@param[in] n_device_index is device index (must be 0 to cuDeviceGetCount() - 1)
	 *
	 *	@note cuInit() must be called before calling this function.
	 *	@note It is recommended to call b_Status() afterwards to see if constructor succeeded.
	 */
	CEmulatedCuDeviceParams(int n_device_index)
		:CCLDeviceParams(COpenCLActiveContext::h_Get(), n_device_index), m_n_index(n_device_index)
	{}

	/**
	 *	@brief gets (zero-based) device index
	 *	@return Returns zero-based device index.
	 */
	inline int n_DeviceIndex() const
	{
		return m_n_index;
	}

	/**
	 *	@brief gets id of device with maximum (theoretical) computing
	 *		power or device with more memory
	 *
	 *	@return Returns id of the most powerful device.
	 */
	static int n_Get_MaxGFlops_DeviceId()
	{
		if(!COpenCLActiveContext::b_IsContext())
			return -1;
		return CCLUtils::n_Get_MaxGFlops_DeviceId(COpenCLActiveContext::h_Get());
	}
};

// CUDA-facing aliases, so that ported code can keep using the CUDA type names
typedef CEmulatedCuDeviceParams CCuDeviceParams; /**< @brief use CEmulatedCuDeviceParams as CCuDeviceParams */
typedef CEmulatedCuDeviceParams::CUdevprop CUdevprop; /**< @brief CUDA device properties structure */

// Kernel parameter loading macros: each cuParamsSetN(h_func, ...) forwards to
// clSetKernelArgsN(h_func, ...) with the same N arguments (N = 0 to 16).
// The clSetKernelArgs*() targets are not defined in this header (presumably
// they come from ClUtils.h).
//
// The variadic cuParamsSet() form is only defined when the compiler is not
// MSVC, is Metrowerks, or is MSVC with _MSC_VER >= 1400 (presumably because
// older MSVC lacks variadic macros).
// NOTE(review): the variadic form forwards to clParamsSet(), while the file
// changelog says clParamsSet*() was renamed to clSetKernelArgs*() - confirm
// that clParamsSet() still exists, otherwise this macro is stale.
#if !defined(_MSC_VER) || defined(__MWERKS__) || _MSC_VER >= 1400
#define cuParamsSet(h_func, ...) clParamsSet(h_func, __VA_ARGS__)
#endif
#define cuParamsSet0(h_func) clSetKernelArgs0(h_func)
#define cuParamsSet1(h_func,a) clSetKernelArgs1(h_func,a)
#define cuParamsSet2(h_func,a,b) clSetKernelArgs2(h_func,a,b)
#define cuParamsSet3(h_func,a,b,c) clSetKernelArgs3(h_func,a,b,c)
#define cuParamsSet4(h_func,a,b,c,d) clSetKernelArgs4(h_func,a,b,c,d)
#define cuParamsSet5(h_func,a,b,c,d,e) clSetKernelArgs5(h_func,a,b,c,d,e)
#define cuParamsSet6(h_func,a,b,c,d,e,f) clSetKernelArgs6(h_func,a,b,c,d,e,f)
#define cuParamsSet7(h_func,a,b,c,d,e,f,g) clSetKernelArgs7(h_func,a,b,c,d,e,f,g)
#define cuParamsSet8(h_func,a,b,c,d,e,f,g,h) clSetKernelArgs8(h_func,a,b,c,d,e,f,g,h)
#define cuParamsSet9(h_func,a,b,c,d,e,f,g,h,i) clSetKernelArgs9(h_func,a,b,c,d,e,f,g,h,i)
#define cuParamsSet10(h_func,a,b,c,d,e,f,g,h,i,j) clSetKernelArgs10(h_func,a,b,c,d,e,f,g,h,i,j)
#define cuParamsSet11(h_func,a,b,c,d,e,f,g,h,i,j,k) clSetKernelArgs11(h_func,a,b,c,d,e,f,g,h,i,j,k)
#define cuParamsSet12(h_func,a,b,c,d,e,f,g,h,i,j,k,l) clSetKernelArgs12(h_func,a,b,c,d,e,f,g,h,i,j,k,l)
#define cuParamsSet13(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m) clSetKernelArgs13(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m)
#define cuParamsSet14(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n) clSetKernelArgs14(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n)
#define cuParamsSet15(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o) clSetKernelArgs15(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o)
#define cuParamsSet16(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p) clSetKernelArgs16(h_func,a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p)
// kernel parameter loading macros

/**
 *	@brief gets alignment of a given data type
 *	@param[in] _Ty is data type (or an expression; the argument is passed to sizeof)
 *	@return Returns sizeof(_Ty), which is used as the alignment (should be power of two).
 *	@note This is an approximation: the size of a type equals its alignment
 *		only for simple scalar types; for other types the size may not
 *		even be a power of two.
 *	@note this isn't really required for OpenCL, it's just for CUDA
 */
#define n_AlignOf(_Ty) sizeof(_Ty)

/**
 *	@brief calculates offset after alignment
 *	@param[in] n_offset is current offset in bytes
 *	@param[in] n_alignment is alignment for the next item
 *	@return Returns the smallest value not less than n_offset which is an
 *		integer multiple of n_alignment (n_offset itself if it is already aligned).
 *	@note n_alignment must be power of two! The result is computed by bit
 *		masking and is meaningless for other alignments.
 *	@note n_alignment is evaluated twice, so it must not have side effects.
 *	@note this isn't really required for OpenCL, it's just for CUDA
 */
#define n_AlignedOffset(n_offset,n_alignment) (((n_offset) + (n_alignment) - 1) & ~((n_alignment) - 1))

/**
 *	@brief has no effect
 *	@param[in] dp_ptr is device pointer (CUdeviceptr)
 *	@return Returns dp_ptr itself, unchanged (the macro only adds parentheses;
 *		no address is taken and no cast is performed).
 *	@note this macro is supposed to be used with cuParamsSet*() macros
 */
#define DevPtr(dp_ptr) (dp_ptr)

// drop any earlier definition of DevPtrParam, so that the one below is used
#ifdef DevPtrParam
#undef DevPtrParam
#endif

/**
 *	@def DevPtrParam
 *	@brief helper for cuParamSetv()
 *	@param[in] dp_ptr is device pointer (CUdeviceptr type); must be an lvalue,
 *		because its address is taken
 *
 *	Expands to two comma-separated arguments: the address of dp_ptr cast
 *	to void*, and sizeof(cl_mem). These fill the last two parameters
 *	(pointer and size) of cuParamSetv().
 *
 *	Use in the following way:
 *@code
 *	CUfunction h_kernel; // some function
 *	int n_offset = 0; // some parameter offset (in bytes)
 *	int n_err_num; // memory allocation result
 *	CUdeviceptr dp_pointer = clCreateBuffer(h_gpu_context, CL_MEM_READ_WRITE, 1024, NULL, &n_err_num); // device-side pointer
 *
 *	cuParamSetv(h_kernel, n_offset, DevPtrParam(dp_pointer));@endcode
 */
#define DevPtrParam(dp_ptr) (void*)&(dp_ptr), sizeof(cl_mem)

/*
 *								=== functions ===
 */

/**
 *	@brief initializes the emulation layer by creating the "active" OpenCL context
 *
 *	Enumerates the available OpenCL platforms (up to 16) and makes the first
 *	context that can be created for GPU devices the active one
 *	(COpenCLActiveContext). When the first platform has a GPU, this is the
 *	same context as if only the first platform was tried.
 *
 *	Calling this function again while a context is already active is
 *	a no-op; it does not create (and leak) another context, and the handles
 *	created in the existing context stay usable.
 *
 *	@param[in] flags is unused (kept for CUDA source compatibility)
 *
 *	@return Returns CUDA_SUCCESS on success, or CUDA_ERROR_NO_DEVICE if no
 *		platform is available or no GPU context could be created.
 */
inline CUresult cuInit(int flags)
{
	(void)flags;
	// not used by the OpenCL backend

	if(COpenCLActiveContext::b_IsContext())
		return CUDA_SUCCESS;
	// already initialized; creating a second context would leak the first one
	// and invalidate everything that was created in it

	const cl_uint n_max_platform_num = 16;
	cl_platform_id p_platforms[n_max_platform_num];
	cl_uint n_platform_num = 0;
	if(clGetPlatformIDs(n_max_platform_num, p_platforms, &n_platform_num) != CL_SUCCESS ||
	   n_platform_num == 0)
		return CUDA_ERROR_NO_DEVICE;
	if(n_platform_num > n_max_platform_num)
		n_platform_num = n_max_platform_num;
	// n_platform_num is the number of platforms available, which can
	// be more than what fits in the array

	for(cl_uint i = 0; i < n_platform_num; ++ i) {
		cl_context_properties p_props[3] = {cl_context_properties(CL_CONTEXT_PLATFORM),
			cl_context_properties(p_platforms[i]), 0};
		cl_int n_result;
		cl_context h_context = clCreateContextFromType(p_props, CL_DEVICE_TYPE_GPU, 0, 0, &n_result);
		if(n_result != CL_SUCCESS)
			continue;
		// this platform has no usable GPU, try the next one

		COpenCLActiveContext::Set(h_context);
		// make context "active"

		return CUDA_SUCCESS;
	}
	// take the first platform which is able to provide a GPU context

	return CUDA_ERROR_NO_DEVICE;
}

/**
 *	@brief gets the "driver" version
 *
 *	Writes the value of the CL_VERSION_1_0 macro, which describes the OpenCL
 *	header this file was compiled with rather than the installed driver.
 *	Works without an active context.
 *
 *	@param[out] p_version is pointer to the integer receiving the version
 *
 *	@return Returns CUDA_SUCCESS on success, or CUDA_ERROR_INVALID_VALUE
 *		if p_version is null.
 */
inline CUresult cuDriverGetVersion(int *p_version)
{
	/*if(!COpenCLActiveContext::b_IsContext())
		return CUDA_ERROR_NOT_INITIALIZED;*/

	if(!p_version)
		return CUDA_ERROR_INVALID_VALUE;
	// nowhere to store the result

	*p_version = CL_VERSION_1_0; // version of header actually

	return CUDA_SUCCESS;
}

/**
 *	@brief gets the number of devices in the active OpenCL context
 *
 *	@param[out] p_count is pointer to the integer receiving the device count
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active context (cuInit() was not called), CUDA_ERROR_INVALID_VALUE
 *		if p_count is null, CUDA_ERROR_INVALID_CONTEXT if OpenCL rejects
 *		the active context, or CUDA_ERROR_UNKNOWN if the query fails otherwise.
 */
inline CUresult cuDeviceGetCount(int *p_count)
{
	if(!COpenCLActiveContext::b_IsContext())
		return CUDA_ERROR_NOT_INITIALIZED;
	if(!p_count)
		return CUDA_ERROR_INVALID_VALUE;

	size_t n_dev_size = 0;
	cl_int n_result = clGetContextInfo(COpenCLActiveContext::h_Get(),
		CL_CONTEXT_DEVICES, 0, 0, &n_dev_size);
	if(n_result == CL_INVALID_CONTEXT)
		return CUDA_ERROR_INVALID_CONTEXT;
	if(n_result != CL_SUCCESS)
		return CUDA_ERROR_UNKNOWN;
	// the size is only valid if the query succeeded

	*p_count = int(n_dev_size / sizeof(cl_device_id)); // it's in bytes

	return CUDA_SUCCESS;
}

/**
 *	@brief gets handle of a device in the active OpenCL context
 *
 *	@param[out] p_device is pointer to the device handle to be written
 *	@param[in] n_index is zero-based device index
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active context, CUDA_ERROR_INVALID_VALUE if p_device is
 *		null, or CUDA_ERROR_NO_DEVICE if the device list cannot be obtained
 *		or n_index is out of range.
 */
inline CUresult cuDeviceGet(CUdevice *p_device, int n_index)
{
	if(!COpenCLActiveContext::b_IsContext())
		return CUDA_ERROR_NOT_INITIALIZED;
	if(!p_device)
		return CUDA_ERROR_INVALID_VALUE;

	std::vector<cl_device_id> device_list;
	if(CCLUtils::n_GetDeviceList(COpenCLActiveContext::h_Get(), device_list) != CL_SUCCESS ||
	   n_index < 0 || size_t(n_index) >= device_list.size())
		return CUDA_ERROR_NO_DEVICE;
	// the index is known to be non-negative before it is converted to unsigned

	*p_device = device_list[size_t(n_index)];

	return CUDA_SUCCESS;
}

// context creation flag; cuCtxCreate() tests this bit in n_flags and, if set,
// requests an out-of-order command queue
// NOTE(review): the CUDA driver API header is believed to define
// CU_CTX_SCHED_AUTO as 0 - confirm that the value 1 is intentional here
#define CU_CTX_SCHED_AUTO 1

// when defined, cuCtxCreate() creates command queues with CL_QUEUE_PROFILING_ENABLE;
// it is defined unconditionally here, so profiling is always requested
#define __CUDA_TO_CL_ENABLE_PROFILING__

/**
 *	@brief creates a "CUDA context", which is an OpenCL command queue here
 *
 *	The queue is created in the active OpenCL context for the given device,
 *	and it is also made the active device (COpenCLActiveDevice), so the
 *	following memcpy / launch calls use it.
 *
 *	@param[out] p_context is pointer to the handle receiving the new command queue
 *	@param[in] n_flags is context flags; if the CU_CTX_SCHED_AUTO bit is set,
 *		the queue is created with out-of-order execution enabled
 *	@param[in] h_device is the device to create the queue for
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active context, or CUDA_ERROR_NO_DEVICE if the queue
 *		could not be created.
 */
inline CUresult cuCtxCreate(CUcontext *p_context, int n_flags, CUdevice h_device)
{
	if(!COpenCLActiveContext::b_IsContext())
		return CUDA_ERROR_NOT_INITIALIZED;

#ifdef __CUDA_TO_CL_ENABLE_PROFILING__
	cl_command_queue_properties n_queue_props = CL_QUEUE_PROFILING_ENABLE;
#else // __CUDA_TO_CL_ENABLE_PROFILING__
	cl_command_queue_properties n_queue_props = 0;
#endif // __CUDA_TO_CL_ENABLE_PROFILING__
	if((n_flags & CU_CTX_SCHED_AUTO) != 0)
		n_queue_props |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE;
	// assemble queue properties

	cl_int n_status;
	cl_command_queue h_queue = clCreateCommandQueue(COpenCLActiveContext::h_Get(),
		h_device, n_queue_props, &n_status);
	if(n_status != CL_SUCCESS)
		return CUDA_ERROR_NO_DEVICE;
	// create command queue

	*p_context = h_queue;
	COpenCLActiveDevice::Set(h_queue);
	// hand the queue to the caller and make it the active one

	return CUDA_SUCCESS;
}

/**
 *	@brief detaches from a context; a no-op in this emulation
 *
 *	The argument is ignored and nothing is released (use cuCtxDestroy()
 *	to release the command queue).
 *
 *	@return Always returns CUDA_SUCCESS.
 */
inline CUresult cuCtxDetach(CUcontext /*t_context*/)
{
	const CUresult n_result = CUDA_SUCCESS;
	return n_result;
}

/**
 *	@brief destroys a "CUDA context", i.e. releases the OpenCL command queue
 *
 *	If the queue being destroyed is the active device (COpenCLActiveDevice),
 *	the active device is cleared first, so that later cuMemcpy*() / cuLaunchGrid() /
 *	cuCtxSynchronize() calls fail with CUDA_ERROR_NOT_INITIALIZED instead
 *	of using a released queue handle.
 *
 *	@param[in] t_context is the command queue to be released
 *
 *	@return Always returns CUDA_SUCCESS (the result of the release is not
 *		propagated, as before).
 */
inline CUresult cuCtxDestroy(CUcontext t_context)
{
	if(COpenCLActiveDevice::b_IsActive(t_context))
		COpenCLActiveDevice::Set(0);
	// do not leave a dangling handle registered as the active device

	clReleaseCommandQueue(t_context);
	// destroy command queue (note this is asynchronous)

	return CUDA_SUCCESS;
}

/**
 *	@brief loads a "module": reads OpenCL source from a file and builds it
 *
 *	The extension of the given file name (".ptx", ".cubin", ...) is replaced
 *	by ".cl" (or ".cl" is appended if the file name has no extension), the
 *	file is read to memory, and an OpenCL program is created from it and built
 *	with empty build options in the active context.
 *
 *	@param[out] p_module is pointer to the handle receiving the built program
 *	@param[in] p_s_cubin_filename is null-terminated name of the CUDA module file
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active context, CUDA_ERROR_INVALID_VALUE if an argument
 *		is null or the program cannot be created or built, CUDA_ERROR_OUT_OF_MEMORY
 *		if there is not enough host memory, CUDA_ERROR_FILE_NOT_FOUND if the
 *		".cl" file cannot be opened, or CUDA_ERROR_NOT_FOUND if it cannot be read.
 */
inline CUresult cuModuleLoad(CUmodule *p_module, const char *p_s_cubin_filename)
{
	if(!COpenCLActiveContext::b_IsContext())
		return CUDA_ERROR_NOT_INITIALIZED;
	if(!p_module || !p_s_cubin_filename)
		return CUDA_ERROR_INVALID_VALUE;

	std::string s_cl_filename;
	try {
		s_cl_filename = p_s_cubin_filename;
		const std::string::size_type n_dot = s_cl_filename.rfind('.');
		if(n_dot != std::string::npos &&
		   s_cl_filename.find_first_of("/\\", n_dot) == std::string::npos)
			s_cl_filename.erase(n_dot);
		// only strip the dot if it belongs to the file name, not to a directory
		// (otherwise "./kernel" would become ".cl")
		s_cl_filename += ".cl";
	} catch(std::bad_alloc&) {
		return CUDA_ERROR_OUT_OF_MEMORY;
	}
	// replace ".ptx" or ".cubin" with ".cl"

	std::string s_program;
	{
		FILE *p_fr;
		if(!(p_fr = fopen(s_cl_filename.c_str(), "rb")))
			return CUDA_ERROR_FILE_NOT_FOUND;
		long n_tell = -1;
		if(!fseek(p_fr, 0, SEEK_END))
			n_tell = ftell(p_fr);
		if(n_tell < 0 || fseek(p_fr, 0, SEEK_SET)) {
			fclose(p_fr);
			return CUDA_ERROR_NOT_FOUND; // same code as for the failed read below
		}
		// ftell() returns -1 on failure, which must not be used as a size
		const size_t n_file_size = size_t(n_tell);
		try {
			s_program.resize(n_file_size);
		} catch(std::bad_alloc&) {
			fclose(p_fr);
			return CUDA_ERROR_OUT_OF_MEMORY;
		}
		if(n_file_size && fread(&s_program[0], 1, n_file_size, p_fr) != n_file_size) {
			fclose(p_fr);
			return CUDA_ERROR_NOT_FOUND; // not really great, but what the hell
		}
		fclose(p_fr);
	}
	// read program from a file

	const char *p_s_program = s_program.c_str();
	cl_int n_result;
	cl_program p_prog = clCreateProgramWithSource(COpenCLActiveContext::h_Get(),
		1, &p_s_program, NULL, &n_result);
	// null lengths array means the string is null-terminated (the lengths
	// are size_t, pointer to unsigned int cannot be passed there)
	if(n_result != CL_SUCCESS)
		return CUDA_ERROR_INVALID_VALUE;
	if(clBuildProgram(p_prog, 0, NULL, "", NULL, NULL) != CL_SUCCESS) {
		clReleaseProgram(p_prog);
		// do not leak the program which failed to build
		return CUDA_ERROR_INVALID_VALUE;
	}
	// create program

	*p_module = p_prog;

	return CUDA_SUCCESS;
}

/**
 *	@brief unloads a module, i.e. releases the OpenCL program
 *
 *	@param[in] h_module is the program handle to be released
 *
 *	@return Always returns CUDA_SUCCESS (the result of clReleaseProgram()
 *		is not checked).
 */
inline CUresult cuModuleUnload(CUmodule h_module)
{
	(void)clReleaseProgram(h_module);
	// result intentionally discarded

	const CUresult n_result = CUDA_SUCCESS;
	return n_result;
}

/**
 *	@brief gets a kernel of the given name from a module
 *
 *	@param[out] p_func is pointer to the kernel handle to be written
 *		(only written on success)
 *	@param[in] h_module is the program containing the kernel
 *	@param[in] p_s_name is null-terminated kernel name
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_INVALID_HANDLE if
 *		the program is invalid or has no executable, CUDA_ERROR_NOT_FOUND if
 *		there is no kernel of that name, or CUDA_ERROR_UNKNOWN on other errors.
 *
 *	@note clReleaseKernel() should be called for the kernel, but this layer
 *		never does, since CUDA doesn't need to free functions.
 */
inline CUresult cuModuleGetFunction(cl_kernel *p_func, CUmodule h_module, const char *p_s_name)
{
	cl_int n_status;
	cl_kernel h_kernel = clCreateKernel(h_module, p_s_name, &n_status);

	switch(n_status) {
	case CL_SUCCESS:
		*p_func = h_kernel;
		return CUDA_SUCCESS;
	case CL_INVALID_PROGRAM:
	case CL_INVALID_PROGRAM_EXECUTABLE:
		return CUDA_ERROR_INVALID_HANDLE;
	case CL_INVALID_KERNEL_NAME:
		return CUDA_ERROR_NOT_FOUND;
	default:
		return CUDA_ERROR_UNKNOWN;
	}
	// translate OpenCL status to CUDA result
}

/**
 *	@brief allocates device memory as a read-write OpenCL buffer in the active context
 *
 *	@param[out] p_pointer is pointer to the handle receiving the new buffer
 *		(only written on success)
 *	@param[in] n_size is buffer size, in bytes
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active context, CUDA_ERROR_INVALID_VALUE if OpenCL
 *		rejects the buffer size, or CUDA_ERROR_OUT_OF_MEMORY on any other failure.
 */
inline CUresult cuMemAlloc(CUdeviceptr *p_pointer, size_t n_size)
{
	if(!COpenCLActiveContext::b_IsContext())
		return CUDA_ERROR_NOT_INITIALIZED;

	cl_int n_status;
	cl_mem h_buffer = clCreateBuffer(COpenCLActiveContext::h_Get(),
		CL_MEM_READ_WRITE, n_size, NULL, &n_status);

	switch(n_status) {
	case CL_SUCCESS:
		break;
	case CL_INVALID_BUFFER_SIZE:
		return CUDA_ERROR_INVALID_VALUE;
	default:
		return CUDA_ERROR_OUT_OF_MEMORY;
	}
	// every failure other than bad size is reported as out of memory

	*p_pointer = h_buffer;

	return CUDA_SUCCESS;
}

/**
 *	@brief frees device memory, i.e. releases the OpenCL memory object
 *
 *	@param[in] h_pointer is the memory object handle to be released
 *
 *	@return Always returns CUDA_SUCCESS (the result of clReleaseMemObject()
 *		is not checked).
 */
inline CUresult cuMemFree(CUdeviceptr h_pointer)
{
	(void)clReleaseMemObject(h_pointer);
	// result intentionally discarded

	const CUresult n_result = CUDA_SUCCESS;
	return n_result;
}

/**
 *	@brief copies memory from device to host (blocking read on the active command queue)
 *
 *	@param[out] p_host_dest is pointer to the destination host memory
 *	@param[in] h_device_src is the source buffer (read from offset 0)
 *	@param[in] n_size is number of bytes to copy
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active device, or CUDA_ERROR_UNKNOWN if the read fails.
 */
inline CUresult cuMemcpyDtoH(void *p_host_dest, CUdeviceptr h_device_src, size_t n_size)
{
	if(!COpenCLActiveDevice::b_IsDevice())
		return CUDA_ERROR_NOT_INITIALIZED;

	const cl_int n_status = clEnqueueReadBuffer(COpenCLActiveDevice::h_Get(),
		h_device_src, CL_TRUE, 0, n_size, p_host_dest, 0, NULL, NULL);
	// CL_TRUE makes the read blocking

	return (n_status == CL_SUCCESS)? CUDA_SUCCESS : CUDA_ERROR_UNKNOWN;
}

/**
 *	@brief copies memory from host to device (blocking write on the active command queue)
 *
 *	@param[in] h_device_dest is the destination buffer (written from offset 0)
 *	@param[in] p_host_src is pointer to the source host memory; it is only
 *		read, so pointers to const data are accepted
 *	@param[in] n_size is number of bytes to copy
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active device, or CUDA_ERROR_UNKNOWN if the write fails.
 */
inline CUresult cuMemcpyHtoD(CUdeviceptr h_device_dest, const void *p_host_src, size_t n_size)
{
	if(!COpenCLActiveDevice::b_IsDevice())
		return CUDA_ERROR_NOT_INITIALIZED;
	cl_command_queue h_command_queue = COpenCLActiveDevice::h_Get();

	int n_result = clEnqueueWriteBuffer(h_command_queue, h_device_dest, CL_TRUE, 0, n_size, p_host_src, 0, NULL, NULL);
	// CL_TRUE makes the write blocking
	if(n_result != CL_SUCCESS)
		return CUDA_ERROR_UNKNOWN;

	return CUDA_SUCCESS;
}

/**
 *	@brief copies memory from device to device on the active command queue
 *
 *	The copy starts at offset 0 in both buffers. It is only enqueued; this
 *	function does not wait for it to finish.
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] h_device_src is the source buffer
 *	@param[in] n_size is number of bytes to copy
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active device, or CUDA_ERROR_UNKNOWN if the copy
 *		could not be enqueued.
 */
inline CUresult cuMemcpyDtoD(CUdeviceptr h_device_dest, CUdeviceptr h_device_src, size_t n_size)
{
	if(!COpenCLActiveDevice::b_IsDevice())
		return CUDA_ERROR_NOT_INITIALIZED;

	const cl_int n_status = clEnqueueCopyBuffer(COpenCLActiveDevice::h_Get(),
		h_device_src, h_device_dest, 0, 0, n_size, 0, NULL, NULL);
	// note the OpenCL argument order: source first, destination second

	return (n_status == CL_SUCCESS)? CUDA_SUCCESS : CUDA_ERROR_UNKNOWN;
}

/**
 *	@brief copies memory from device to host, reading from an offset in the
 *		source buffer (blocking read on the active command queue)
 *
 *	@param[out] p_host_dest is pointer to the destination host memory
 *	@param[in] h_device_src is the source buffer
 *	@param[in] n_offset_src is offset in the source buffer, in bytes
 *	@param[in] n_size is number of bytes to copy
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active device, or CUDA_ERROR_UNKNOWN if the read fails.
 */
inline CUresult cuMemcpyDtoH_off(void *p_host_dest, CUdeviceptr h_device_src, size_t n_offset_src, size_t n_size)
{
	if(!COpenCLActiveDevice::b_IsDevice())
		return CUDA_ERROR_NOT_INITIALIZED;

	const cl_int n_status = clEnqueueReadBuffer(COpenCLActiveDevice::h_Get(),
		h_device_src, CL_TRUE, n_offset_src, n_size, p_host_dest, 0, NULL, NULL);
	// CL_TRUE makes the read blocking

	return (n_status == CL_SUCCESS)? CUDA_SUCCESS : CUDA_ERROR_UNKNOWN;
}

/**
 *	@brief copies memory from host to device, writing at an offset in the
 *		destination buffer (blocking write on the active command queue)
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_offset_dest is offset in the destination buffer, in bytes
 *	@param[in] p_host_src is pointer to the source host memory; it is only
 *		read, so pointers to const data are accepted
 *	@param[in] n_size is number of bytes to copy
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active device, or CUDA_ERROR_UNKNOWN if the write fails.
 */
inline CUresult cuMemcpyHtoD_off(CUdeviceptr h_device_dest, size_t n_offset_dest, const void *p_host_src, size_t n_size)
{
	if(!COpenCLActiveDevice::b_IsDevice())
		return CUDA_ERROR_NOT_INITIALIZED;
	cl_command_queue h_command_queue = COpenCLActiveDevice::h_Get();

	int n_result = clEnqueueWriteBuffer(h_command_queue, h_device_dest, CL_TRUE, n_offset_dest, n_size, p_host_src, 0, NULL, NULL);
	// CL_TRUE makes the write blocking
	if(n_result != CL_SUCCESS)
		return CUDA_ERROR_UNKNOWN;

	return CUDA_SUCCESS;
}

/**
 *	@brief copies memory from device to device, with offsets in both buffers
 *
 *	The copy is only enqueued on the active command queue; this function
 *	does not wait for it to finish.
 *
 *	@param[in] h_device_dest is the destination buffer
 *	@param[in] n_offset_dest is offset in the destination buffer, in bytes
 *	@param[in] h_device_src is the source buffer
 *	@param[in] n_offset_src is offset in the source buffer, in bytes
 *	@param[in] n_size is number of bytes to copy
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active device, or CUDA_ERROR_UNKNOWN if the copy
 *		could not be enqueued.
 */
inline CUresult cuMemcpyDtoD_off(CUdeviceptr h_device_dest, size_t n_offset_dest, CUdeviceptr h_device_src, size_t n_offset_src, size_t n_size)
{
	if(!COpenCLActiveDevice::b_IsDevice())
		return CUDA_ERROR_NOT_INITIALIZED;

	const cl_int n_status = clEnqueueCopyBuffer(COpenCLActiveDevice::h_Get(),
		h_device_src, h_device_dest, n_offset_src, n_offset_dest, n_size, 0, NULL, NULL);
	// note the OpenCL argument order: source first, destination second

	return (n_status == CL_SUCCESS)? CUDA_SUCCESS : CUDA_ERROR_UNKNOWN;
}

/**
 *	@brief class, responsible for kernel parameter index calculation
 *
 *	In CUDA, kernel parameters are specified by relatively low-level api,
 *	which just memcpy() them to some buffer, hence its parameters are offset
 *	and size (both in bytes).
 *	In contrast, OpenCL has higher level api, which requires parameter index,
 *	which needs to be calculated. This class checks whether parameters go in
 *	order as they lie in memory and returns incrementing index for each parameter
 *	to be used by OpenCL calls.
 *
 *	@note All the state is kept in static members, so there is a single
 *		parameter sequence being tracked at any time. The members are only
 *		declared here; no definition is visible in this header.
 *	@note The consistency checks use _ASSERTE, which is not defined in this
 *		file (presumably the MSVC CRT debug macro or a substitute provided
 *		by ClUtils.h - confirm); whether they are active depends on that definition.
 */
class CCuParamReindexer {
protected:
	static cl_kernel m_p_func; // kernel the parameters are currently being specified for
	static size_t m_n_param_off; // offset (in bytes) expected for the next parameter
	static int m_n_param_index; // OpenCL argument index to be returned for the next parameter
	static bool m_b_setsize_called; // set by OnSetSize_Called(), cleared when CheckFunction() sees a different kernel

public:
	/**
	 *	@brief resets parameter index counters, checks if parameter size is calculated properly
	 *
	 *	Asserts that either no parameters were specified yet (the running
	 *	offset is zero), or that the running offset equals n_size. The counters
	 *	are only reset if p_func differs from the kernel tracked so far.
	 *
	 *	@param[in] p_func is kernel handle
	 *	@param[in] n_size is size of all parameters for the particular kernel call
	 */
	static inline void OnSetSize_Called(cl_kernel p_func, size_t n_size)
	{
		_ASSERTE(!m_n_param_off || m_n_param_off == n_size);
		// cuParamSetSize() should be called ?at the beginning? or at the end of cuParamSet*() block, size should check out
		// this assertion is triggered when it isn't

		if(m_p_func != p_func) {
			m_n_param_off = 0; // !!
			m_n_param_index = 0; // !!
		}
		m_p_func = p_func;
		m_b_setsize_called = true;
	}

	/**
	 *	@brief returns (increasing) parameter index, checks whether parameter offsets are increasing and contiguous
	 *
	 *	A parameter at offset 0 restarts the sequence (index 0). After each
	 *	call, the expected offset of the next parameter is n_param_offset
	 *	aligned up to n_param_align, plus n_param_size.
	 *
	 *	@param[in] p_func is kernel handle
	 *	@param[in] n_param_offset is parameter offset in parameter buffer
	 *	@param[in] n_param_align is parameter data type alignment (probably equal to size);
	 *		it is passed to n_AlignedOffset(), which requires a power of two
	 *	@param[in] n_param_size is parameter data type size
	 *
	 *	@return Returns zero-based index of this parameter, to be passed to clSetKernelArg().
	 */
	static inline int n_ParamIndex(cl_kernel p_func, size_t n_param_offset, size_t n_param_align, size_t n_param_size)
	{
		CheckFunction(p_func);

		if(!n_param_offset) {
			m_n_param_off = 0;
			m_n_param_index = 0;
		}
		// it is probably legal to specify parameters for a single kernel multiple times, without calling cuParamSetSize(),
		// so parameter index must be reset everytime first parameter is specified (it will still catch
		// ill-ordered parameter specification code)

		_ASSERTE(m_n_param_off == n_param_offset);
		// this assertion is triggered when parameters do not go in natural order, or if offsets are not calculated properly

		m_n_param_off = n_AlignedOffset(n_param_offset, n_param_align) + n_param_size;
		// increase parameter offset

		return m_n_param_index ++;
		// return parameter index, increase it (parameters must go in order, which is checked by assertion above)
	}

protected:
	/**
	 *	@brief resets parameter index counters, makes sure parameters come
	 *	in such order their indices can be clearly determined
	 *
	 *	If p_func differs from the kernel tracked so far, it becomes the
	 *	tracked kernel, the counters are zeroed and the "set size called"
	 *	flag is cleared.
	 *
	 *	@param[in] p_func is kernel handle
	 */
	static inline void CheckFunction(cl_kernel p_func)
	{
		_ASSERTE(!m_p_func || m_p_func == p_func || m_b_setsize_called); // cuParamSetSize() must be called before calling another function
		// this assertion is triggered when cuParamSet*() are called for different kernels, and aren't separated by call to cuParamSetSize()

		if(m_p_func != p_func) {
			m_p_func = p_func;
			m_b_setsize_called = false;
			m_n_param_off = 0; // !!
			m_n_param_index = 0; // !!
		}
	}
};

/**
 *	@brief sets a kernel parameter of arbitrary type
 *
 *	The CUDA byte offset is translated to an OpenCL argument index by
 *	CCuParamReindexer, and the value is passed to clSetKernelArg().
 *
 *	@param[in] p_func is kernel handle
 *	@param[in] n_offset is parameter offset in the (emulated) parameter buffer, in bytes
 *	@param[in] p_param is pointer to the parameter value
 *	@param[in] n_size is parameter size, in bytes (also used as its alignment)
 *
 *	@return Returns CUDA_SUCCESS on success, or CUDA_ERROR_INVALID_VALUE if
 *		OpenCL rejects the argument.
 *
 *	@note NOTE(review): n_size is used as the alignment, and the alignment
 *		arithmetic requires a power of two - confirm that callers do not
 *		pass parameters of other sizes (e.g. 12-byte structures).
 */
inline CUresult cuParamSetv(cl_kernel p_func, size_t n_offset, const void *p_param, size_t n_size)
{
	int n_param_index = CCuParamReindexer::n_ParamIndex(p_func, n_offset, n_size, n_size);

	if(clSetKernelArg(p_func, n_param_index, n_size, p_param) != CL_SUCCESS)
		return CUDA_ERROR_INVALID_VALUE;
	// report the failure here, rather than as an unexplained launch failure later

	return CUDA_SUCCESS;
}

/**
 *	@brief sets an integer kernel parameter
 *
 *	The CUDA byte offset is translated to an OpenCL argument index by
 *	CCuParamReindexer, and the value is passed to clSetKernelArg().
 *
 *	@param[in] p_func is kernel handle
 *	@param[in] n_offset is parameter offset in the (emulated) parameter buffer, in bytes
 *	@param[in] n_param is the parameter value
 *
 *	@return Returns CUDA_SUCCESS on success, or CUDA_ERROR_INVALID_VALUE if
 *		OpenCL rejects the argument.
 */
inline CUresult cuParamSeti(cl_kernel p_func, size_t n_offset, int n_param)
{
	int n_param_index = CCuParamReindexer::n_ParamIndex(p_func, n_offset, n_AlignOf(int), sizeof(int));

	if(clSetKernelArg(p_func, n_param_index, sizeof(int), &n_param) != CL_SUCCESS)
		return CUDA_ERROR_INVALID_VALUE;
	// report the failure here, rather than as an unexplained launch failure later

	return CUDA_SUCCESS;
}

/**
 *	@brief sets a floating-point kernel parameter
 *
 *	The CUDA byte offset is translated to an OpenCL argument index by
 *	CCuParamReindexer, and the value is passed to clSetKernelArg().
 *
 *	@param[in] p_func is kernel handle
 *	@param[in] n_offset is parameter offset in the (emulated) parameter buffer, in bytes
 *	@param[in] f_param is the parameter value
 *
 *	@return Returns CUDA_SUCCESS on success, or CUDA_ERROR_INVALID_VALUE if
 *		OpenCL rejects the argument.
 */
inline CUresult cuParamSetf(cl_kernel p_func, size_t n_offset, float f_param)
{
	int n_param_index = CCuParamReindexer::n_ParamIndex(p_func, n_offset, n_AlignOf(float), sizeof(float));

	if(clSetKernelArg(p_func, n_param_index, sizeof(float), &f_param) != CL_SUCCESS)
		return CUDA_ERROR_INVALID_VALUE;
	// report the failure here, rather than as an unexplained launch failure later

	return CUDA_SUCCESS;
}

/**
 *	@brief sets the total size of kernel parameters
 *
 *	OpenCL has no use for the size; the call is only forwarded to
 *	CCuParamReindexer::OnSetSize_Called() to keep parameter index
 *	tracking consistent.
 *
 *	@param[in] p_func is kernel handle
 *	@param[in] n_size is size of all the parameters of the kernel, in bytes
 *
 *	@return Always returns CUDA_SUCCESS.
 */
inline CUresult cuParamSetSize(cl_kernel p_func, size_t n_size)
{
	CCuParamReindexer::OnSetSize_Called(p_func, n_size);
	// bookkeeping only, nothing is passed to OpenCL

	const CUresult n_result = CUDA_SUCCESS;
	return n_result;
}

/**
 *	@brief sets block (work-group) shape to be used when launching the kernel
 *
 *	The shape is only stored in the given CUfunction instance; it is not
 *	validated until the kernel is launched.
 *
 *	@param[in,out] r_p_func is the function to set the block shape of
 *	@param[in] n_block_width is block width
 *	@param[in] n_block_height is block height
 *	@param[in] n_block_depth is block depth
 *
 *	@return Always returns CUDA_SUCCESS.
 */
inline CUresult cuFuncSetBlockShape(CUfunction &r_p_func, size_t n_block_width, size_t n_block_height, size_t n_block_depth)
{
	CUfunction &r_target = r_p_func;
	r_target.SetBlockDimensions(n_block_width, n_block_height, n_block_depth);
	// remember block dimensions for that particular kernel

	const CUresult n_result = CUDA_SUCCESS; // can't tell whether it's wrong by now
	return n_result;
}

/**
 *	@brief launches a kernel on a 2D grid of blocks, using the active command queue
 *
 *	The kernel is enqueued as a 3D range: the local work size is the block
 *	shape stored in p_func, and the global work size is the grid size
 *	multiplied by the block size (grid depth is 1). This function does not
 *	wait for the kernel to finish.
 *
 *	@param[in] p_func is the function to launch
 *	@param[in] n_grid_width is grid width, in blocks
 *	@param[in] n_grid_height is grid height, in blocks
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active device, CUDA_ERROR_OUT_OF_MEMORY if OpenCL reports
 *		an allocation failure, or CUDA_ERROR_UNKNOWN on other errors.
 */
inline CUresult cuLaunchGrid(const CUfunction &p_func, size_t n_grid_width, size_t n_grid_height)
{
	if(!COpenCLActiveDevice::b_IsDevice())
		return CUDA_ERROR_NOT_INITIALIZED;

	const size_t n_block_w = p_func.GetBlockWidth();
	const size_t n_block_h = p_func.GetBlockHeight();
	const size_t n_block_d = p_func.GetBlockDepth();
	size_t p_local_size[3] = {n_block_w, n_block_h, n_block_d};
	size_t p_global_size[3] = {n_grid_width * n_block_w, n_grid_height * n_block_h, n_block_d};
	// OpenCL global size is in work-items, not in blocks

	const cl_int n_status = clEnqueueNDRangeKernel(COpenCLActiveDevice::h_Get(), p_func, 3,
		NULL, p_global_size, p_local_size, 0, NULL, NULL);
	// global work offset is null (current OpenCL limitation - can't contain values)

	if(n_status == CL_SUCCESS)
		return CUDA_SUCCESS;
	if(n_status == CL_MEM_OBJECT_ALLOCATION_FAILURE || n_status == CL_OUT_OF_HOST_MEMORY)
		return CUDA_ERROR_OUT_OF_MEMORY;
	return CUDA_ERROR_UNKNOWN;
	// todo - do more fine-grained error code translation
}

/**
 *	@brief waits until all the commands in the active command queue are finished
 *
 *	Calls clFlush() followed by clFinish() on the active command queue;
 *	clFinish() is not called if clFlush() fails.
 *
 *	@return Returns CUDA_SUCCESS on success, CUDA_ERROR_NOT_INITIALIZED if
 *		there is no active device, CUDA_ERROR_OUT_OF_MEMORY if OpenCL runs
 *		out of host memory, or CUDA_ERROR_UNKNOWN on other errors.
 */
inline CUresult cuCtxSynchronize()
{
	if(!COpenCLActiveDevice::b_IsDevice())
		return CUDA_ERROR_NOT_INITIALIZED;
	cl_command_queue h_queue = COpenCLActiveDevice::h_Get();

	cl_int n_status = clFlush(h_queue);
	if(n_status == CL_SUCCESS)
		n_status = clFinish(h_queue);
	// the first failure (if any) is the one being reported

	if(n_status == CL_SUCCESS)
		return CUDA_SUCCESS;
	return (n_status == CL_OUT_OF_HOST_MEMORY)? CUDA_ERROR_OUT_OF_MEMORY : CUDA_ERROR_UNKNOWN;
}

/*
 *								=== ~functions ===
 */

#endif // __CUDA_TO_OPENCL_CONVERSION_HEADER_INCLUDED
