/*
								+---------------------------------+
								|                                 |
								| ***  OpenCL vector kernels  *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2013  |
								|                                 |
								|         VectorKernels.h         |
								|                                 |
								+---------------------------------+
*/

#pragma once
#ifndef __OPENCL_VECTOR_KERNELS_INCLUDED
#define __OPENCL_VECTOR_KERNELS_INCLUDED

/**
 *	@file VectorKernels.h
 *	@author -tHE SWINe-
 *	@brief OpenCL vector kernels class
 *	@date 2013
 */

#include <assert.h>
// _ASSERTE is the Microsoft CRT debug assertion macro; where the toolchain does not
// provide it, fall back to the standard assert(). The guard avoids a macro
// redefinition when the CRT debug header has already defined _ASSERTE.
#ifndef _ASSERTE
#define _ASSERTE(x) assert(x)
#endif // !_ASSERTE
#include "ClUtils.h"

/**
 *	@brief gets the element count of a square triangular matrix, diagonal included
 *
 *	@param[in] n is number of columns of the square triangular matrix, in elements
 *
 *	@return Returns n * (n + 1) / 2, which is the sum 1 + 2 + ... + n.
 *
 *	@note The product is evaluated in size_t arithmetic before it is halved.
 */
inline size_t n_TriangularSize(size_t n)
{
	const size_t n_doubled_size = n * (n + 1);
	// one of n and n + 1 is even, so (barring wrap-around) halving is exact

	return n_doubled_size / 2;
}

/**
 *	@brief gets the element count of the first m rows of an upper
 *		triangular matrix with n columns (diagonal included)
 *
 *	@param[in] n is number of columns of the triangular matrix, in elements
 *	@param[in] m is number of rows to count, in elements, must not exceed n
 *
 *	@return Returns m * (2 * n + 1 - m) / 2, which is the sum of the row
 *		lengths n, n - 1, ..., n - m + 1.
 */
inline size_t n_TriangularSize2(size_t n, size_t m)
{
	const size_t n_first_plus_last_row = n + n + 1 - m;
	// length of the first row (n) plus length of the m-th row (n + 1 - m)

	return (m * n_first_plus_last_row) / 2;
}

/**
 *	@brief gets the position of an element in a densely packed buffer
 *		holding an upper triangular matrix (diagonal included)
 *
 *	@param[in] r is zero-based index of row, in elements
 *	@param[in] c is zero-based index of column, in elements
 *	@param[in] n is number of columns of a square triangular matrix, in elements
 *
 *	@return Returns r * (2 * n - 1 - r) / 2 + c.
 *
 *	@note The row-dependent part does not depend on c and can be computed
 *		once per row; n_TriangularOffset2() takes 2 * n - 1 precalculated.
 */
inline size_t n_TriangularOffset(size_t r, size_t c, size_t n)
{
	const size_t n_twice_n_minus_one = n + n - 1;
	const size_t n_row_part = (r * (n_twice_n_minus_one - r)) / 2;
	// the part of the offset that depends on the row only

	return n_row_part + c;
}

/**
 *	@brief gets the position of an element in a densely packed buffer
 *		holding an upper triangular matrix (diagonal included),
 *		using a precalculated matrix size term
 *
 *	@param[in] r is zero-based index of row, in elements
 *	@param[in] c is zero-based index of column, in elements
 *	@param[in] _2n1 is twice number of columns of a square triangular matrix (in elements), minus one
 *
 *	@return Returns r * (_2n1 - r) / 2 + c.
 *
 *	@note The row-dependent part of this offset can be precalculated, see n_TriangularRowOffset().
 */
inline size_t n_TriangularOffset2(size_t r, size_t c, size_t _2n1)
{
	const size_t n_row_part = (r * (_2n1 - r)) / 2;
	// the part of the offset that depends on the row only

	return n_row_part + c;
}

/**
 *	@brief gets the row-dependent part of an element position in a densely
 *		packed buffer holding an upper triangular matrix (diagonal included)
 *
 *	@param[in] r is zero-based index of row, in elements
 *	@param[in] _2n1 is twice number of columns of a square triangular matrix (in elements), minus one
 *
 *	@return Returns r * (_2n1 - r) / 2.
 *
 *	@note Adding a zero-based index of a column to the result
 *		yields the offset of the corresponding element.
 */
inline size_t n_TriangularRowOffset(size_t r, size_t _2n1)
{
	const size_t n_doubled_offset = r * (_2n1 - r);
	return n_doubled_offset / 2;
}

/**
 *	@brief GPU vector kernels wrapper object
 *
 *	Holds the OpenCL program and kernel handles together with the configuration
 *	(data type names and operation expression strings) declared below. Only the
 *	declarations are visible in this header; the functions are implemented elsewhere,
 *	so the notes marked NOTE(review) are assumptions to be confirmed there.
 *
 *	@note The copy-constructor and the assignment operator are protected and
 *		do not copy any state; the object is meant to be non-copyable.
 *
 *	@todo document the individual functions in detail
 */
class CCLVectorKernels {
protected:
	bool m_b_status; /**< @brief flag whether the programs were compiled */

	size_t n_reduction_local_work_size; /**< @brief size of reduction work size (32 or 64) */
	const char *p_s_scalar_type; /**< @brief data type for storing vector scalars */
	const char *p_s_reduction_accum_type; /**< @brief data type for reduction accumulator */
	const char *p_s_reduction_type; /**< @brief data type for reduction results */
	const char *p_s_factor_type; /**< @brief data type for vector scaling factors */
	size_t m_n_scalar_type_size; /**< @brief size of the data type for storing vector scalars, in bytes */
	size_t m_n_factor_type; /**< @brief size of the data type for vector scaling factors, in bytes */
	const char *p_s_crosscorelelem_operation; /**< @brief per-element input operation in cross-correlation */
	const char *p_s_crosscorelreduce_operation; /**< @brief per-node reduction operation in cross-correlation */
	const char *p_s_crosscorelfinal_operation; /**< @brief final reduction operation in cross-correlation */
	const char *p_s_corelelem_operation; /**< @brief per-element input operation in correlation */
	const char *p_s_corelreduce_operation; /**< @brief per-node reduction operation in correlation */
	const char *p_s_corelfinal_operation; /**< @brief final reduction operation in correlation */
	const char *p_s_reductionelem_operation; /**< @brief per-element input operation in reduction */
	const char *p_s_reductionreduce_operation; /**< @brief per-node reduction operation in reduction */
	const char *p_s_reductionfinal_operation; /**< @brief final reduction operation in reduction */
	const char *p_s_scaleelem_operation; /**< @brief per-element operation in scaling */

	cl_program h_program; /**< @brief handle to compiled OpenCL program */
	cl_program h_program_const; /**< @brief handle to second program (to work around the bug with limit of up to one __constant / program) */

	cl_kernel h_length_kernel_v3; /**< @brief kernel that calculates vector reductions */
	cl_kernel h_correl_kernel_v3; /**< @brief kernel that calculates vector cross-correlations */
	cl_kernel h_correl_kernel_v3_upper; /**< @brief kernel that calculates vector cross-correlations (judging by the name, the "upper" variant) */
	cl_kernel h_correl_kernel_v3_upper_pack; /**< @brief kernel that calculates vector cross-correlations (judging by the name, the "upper packed" variant) */
	cl_kernel h_correl_kernel_v3_NM; /**< @brief kernel that calculates vector correlations */
	cl_kernel h_correl_kernel_v4; /**< @brief kernel that calculates vector cross-correlations */
	cl_kernel h_correl_kernel_v4_upper; /**< @brief kernel that calculates vector cross-correlations (judging by the name, the "upper" variant) */
	cl_kernel h_correl_kernel_v4_upper_pack; /**< @brief kernel that calculates vector cross-correlations (judging by the name, the "upper packed" variant) */
	cl_kernel h_correl_kernel_v4_NM; /**< @brief kernel that calculates vector correlations */
	cl_kernel h_scale_kernel_v2_const_npot; /**< @brief kernel that calculates scaled vectors */
	cl_kernel h_scale_kernel_v2_const_pot; /**< @brief kernel that calculates scaled vectors with power of two lengths */

	size_t n_SM_num; /**< @brief number of streaming multiprocessors on the device */
	size_t n_const_memory_size; /**< @brief amount of constant memory on the device, in bytes */
	size_t n_local_memory_size; /**< @brief amount of shared memory per SM on the device, in bytes */

public:
	/**
	 *	@brief default constructor; initializes the kernel operations and data types
	 */
	CCLVectorKernels();

	/**
	 *	@brief destructor; deletes the programs if compiled
	 */
	~CCLVectorKernels();

	/**
	 *	@brief sets the data type used by the kernels
	 *
	 *	@param[in] p_s_data_type is name of the data type ("float" by default)
	 *	@param[in] n_size_of_data_type is size of the data type
	 *		(4 by default; presumably in bytes, matching the default "float")
	 *
	 *	@note NOTE(review): presumably this fills the data type name and size
	 *		members and needs to precede Compile(); confirm in the implementation.
	 */
	void Set_DataTypes(const char *p_s_data_type = "float", int n_size_of_data_type = 4);

	/**
	 *	@brief sets the operations used in vector reduction
	 *
	 *	@param[in] p_s_element_operation is per-element input operation
	 *	@param[in] p_s_reduce_operation is per-node reduction operation
	 *	@param[in] p_s_final_operation is final reduction operation
	 *
	 *	@note The default expressions read as: square each element, add, take
	 *		the square root. NOTE(review): how the strings (and the symbols x, y
	 *		in them) get substituted into the kernel source is not visible here.
	 */
	void Configure_ReductionOp(const char *p_s_element_operation = "((x)*(x))",
		const char *p_s_reduce_operation = "((x)+(y))",
		const char *p_s_final_operation = "(sqrt(x))");

	/**
	 *	@brief sets the operations used in vector correlation
	 *
	 *	@param[in] p_s_element_operation is per-element input operation
	 *	@param[in] p_s_reduce_operation is per-node reduction operation
	 *	@param[in] p_s_final_operation is final reduction operation
	 *
	 *	@note The default expressions read as: square the difference of each pair
	 *		of elements, add, take the square root. NOTE(review): how the strings
	 *		get substituted into the kernel source is not visible here.
	 */
	void Configure_CorelationOp(const char *p_s_element_operation = "(((x)-(y))*((x)-(y)))",
		const char *p_s_reduce_operation = "((x)+(y))",
		const char *p_s_final_operation = "(sqrt(x))");

	/**
	 *	@brief sets the operations used in vector cross-correlation
	 *
	 *	@param[in] p_s_element_operation is per-element input operation
	 *	@param[in] p_s_reduce_operation is per-node reduction operation
	 *	@param[in] p_s_final_operation is final reduction operation
	 *
	 *	@note The defaults are the same strings as in Configure_CorelationOp().
	 */
	void Configure_CrossCorelationOp(const char *p_s_element_operation = "(((x)-(y))*((x)-(y)))",
		const char *p_s_reduce_operation = "((x)+(y))",
		const char *p_s_final_operation = "(sqrt(x))");

	/**
	 *	@brief sets the operation used in vector scaling
	 *	@param[in] p_s_element_operation is per-element operation (a division of x by y by default)
	 */
	void Configure_ScalingOp(const char *p_s_element_operation = "((x)/(y))");

	/**
	 *	@brief compiles the kernels for the given context and device
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_device is OpenCL device
	 *	@param[in] p_s_kernels_src is the kernels source
	 *		("Kernels.c" by default; presumably a file name)
	 *	@param[in] b_verbose is verbosity flag (off by default)
	 *
	 *	@return NOTE(review): presumably returns true on success
	 *		and false on failure; confirm in the implementation.
	 */
	bool Compile(cl_context h_context, cl_device_id h_device,
		const char *p_s_kernels_src = "Kernels.c", bool b_verbose = false);

	/**
	 *	@brief calculates vector reductions
	 *
	 *	@param[in] h_cmd_queue is OpenCL command queue
	 *	@param[in] n_vector_length is length of the vectors
	 *	@param[in] n_vector_num is number of the vectors
	 *	@param[in] dp_vector_lengths is a buffer (not const-qualified; presumably the output)
	 *	@param[in] dp_vectors is a buffer (const-qualified; presumably the input vectors)
	 *	@param[in] n_max_pass_size is maximum pass size (1 << 24 by default; units to be confirmed)
	 *	@param[in] n_SM_overload is a tuning parameter (16 by default; meaning to be confirmed)
	 *
	 *	@return NOTE(review): presumably returns true on success, false on failure.
	 */
	bool Calculate_VectorReduction(cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num,
		cl_mem dp_vector_lengths, const cl_mem dp_vectors,
		size_t n_max_pass_size = 1 << 24, size_t n_SM_overload = 16) const;

	/**
	 *	@brief calculates correlations between two sets of vectors
	 *
	 *	@param[in] h_cmd_queue is OpenCL command queue
	 *	@param[in] n_vector_length is length of the vectors
	 *	@param[in] n_vector_num is number of the vectors in dp_vectors (presumably)
	 *	@param[in] n_matrix_stride is stride of the output matrix (units to be confirmed)
	 *	@param[in] n_vector_num2 is number of the vectors in dp_vectors2 (presumably)
	 *	@param[in] dp_corel_full_matrix is a buffer (not const-qualified; presumably the output matrix)
	 *	@param[in] dp_vectors is a buffer (const-qualified; presumably the first set of input vectors)
	 *	@param[in] dp_vectors2 is a buffer (const-qualified; presumably the second set of input vectors)
	 *	@param[in] n_max_pass_size is maximum pass size (1 << 12 by default; units to be confirmed)
	 *	@param[in] n_SM_overload is a tuning parameter (16 by default; meaning to be confirmed)
	 *
	 *	@return NOTE(review): presumably returns true on success, false on failure.
	 */
	bool Calculate_VectorsCorelation(cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
		size_t n_vector_num2, cl_mem dp_corel_full_matrix, const cl_mem dp_vectors,
		const cl_mem dp_vectors2, size_t n_max_pass_size = 1 << 12, size_t n_SM_overload = 16) const;

	/**
	 *	@brief "cached" variant of Calculate_VectorsCorelation()
	 *
	 *	Takes the same parameters with the same defaults as Calculate_VectorsCorelation().
	 *
	 *	@note NOTE(review): what is being cached is not visible in this header.
	 */
	bool Calculate_VectorsCorelation_Cached(cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
		size_t n_vector_num2, cl_mem dp_corel_full_matrix, const cl_mem dp_vectors,
		const cl_mem dp_vectors2, size_t n_max_pass_size = 1 << 12, size_t n_SM_overload = 16) const;

	/**
	 *	@brief calculates cross-correlations of a set of vectors
	 *
	 *	@param[in] h_cmd_queue is OpenCL command queue
	 *	@param[in] n_vector_length is length of the vectors
	 *	@param[in] n_vector_num is number of the vectors
	 *	@param[in] n_matrix_stride is stride of the output matrix (units to be confirmed)
	 *	@param[in] dp_crosscorel_full_matrix is a buffer (not const-qualified; presumably the output matrix)
	 *	@param[in] dp_vectors is a buffer (const-qualified; presumably the input vectors)
	 *	@param[in] n_max_pass_size is maximum pass size (1 << 24 by default; units to be confirmed)
	 *	@param[in] n_SM_overload is a tuning parameter (16 by default; meaning to be confirmed)
	 *
	 *	@return NOTE(review): presumably returns true on success, false on failure.
	 */
	bool Calculate_VectorsCrossCorelation(cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
		cl_mem dp_crosscorel_full_matrix, const cl_mem dp_vectors,
		size_t n_max_pass_size = 1 << 24, size_t n_SM_overload = 16) const;

	/**
	 *	@brief "upper" variant of Calculate_VectorsCrossCorelation()
	 *
	 *	Takes the same parameters with the same defaults as Calculate_VectorsCrossCorelation().
	 *
	 *	@note NOTE(review): presumably only the upper triangle of the full
	 *		matrix is calculated; confirm in the implementation.
	 */
	bool Calculate_VectorsCrossCorelation_Upper(cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
		cl_mem dp_crosscorel_full_matrix, const cl_mem dp_vectors,
		size_t n_max_pass_size = 1 << 24, size_t n_SM_overload = 16) const;

	/**
	 *	@brief "upper packed" variant of Calculate_VectorsCrossCorelation()
	 *
	 *	Takes the same parameter list as Calculate_VectorsCrossCorelation(),
	 *	except that the output buffer is named dp_crosscorel_triu_matrix.
	 *
	 *	@note NOTE(review): presumably the output is a densely packed upper
	 *		triangular matrix, as addressed by n_TriangularOffset(); confirm
	 *		in the implementation (also the role of n_matrix_stride in that case).
	 */
	bool Calculate_VectorsCrossCorelation_UpperPacked(cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
		cl_mem dp_crosscorel_triu_matrix, const cl_mem dp_vectors,
		size_t n_max_pass_size = 1 << 24, size_t n_SM_overload = 16) const;

	/**
	 *	@brief "cached" variant of Calculate_VectorsCrossCorelation()
	 *
	 *	@param[in] h_cmd_queue is OpenCL command queue
	 *	@param[in] n_vector_length is length of the vectors
	 *	@param[in] n_vector_num is number of the vectors
	 *	@param[in] n_matrix_stride is stride of the output matrix (units to be confirmed)
	 *	@param[in] dp_crosscorel_full_matrix is a buffer (not const-qualified; presumably the output matrix)
	 *	@param[in] dp_vectors is a buffer (const-qualified; presumably the input vectors)
	 *	@param[in] n_max_pass_size is maximum pass size (1 << 14 by default; units to be confirmed)
	 *	@param[in] n_max_vpass_size is maximum "vpass" size (1 << 14 by default; meaning to be confirmed)
	 *	@param[in] n_SM_overload is a tuning parameter (8 by default; meaning to be confirmed)
	 *
	 *	@return NOTE(review): presumably returns true on success, false on failure.
	 *
	 *	@note NOTE(review): what is being cached is not visible in this header.
	 */
	bool Calculate_VectorsCrossCorelation_Cached(cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
		cl_mem dp_crosscorel_full_matrix, const cl_mem dp_vectors,
		size_t n_max_pass_size = 1 << 14, size_t n_max_vpass_size = 1 << 14, size_t n_SM_overload = 8) const;

	/**
	 *	@brief "upper", "cached" variant of Calculate_VectorsCrossCorelation()
	 *
	 *	Takes the same parameters with the same defaults
	 *	as Calculate_VectorsCrossCorelation_Cached().
	 */
	bool Calculate_VectorsCrossCorelation_Upper_Cached(cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
		cl_mem dp_crosscorel_full_matrix, const cl_mem dp_vectors,
		size_t n_max_pass_size = 1 << 14, size_t n_max_vpass_size = 1 << 14, size_t n_SM_overload = 8) const;

	/**
	 *	@brief "upper packed", "cached" variant of Calculate_VectorsCrossCorelation()
	 *
	 *	Takes the same parameter list as Calculate_VectorsCrossCorelation_Cached(),
	 *	except that the output buffer is named dp_crosscorel_triu_matrix.
	 */
	bool Calculate_VectorsCrossCorelation_UpperPacked_Cached(cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
		cl_mem dp_crosscorel_triu_matrix, const cl_mem dp_vectors,
		size_t n_max_pass_size = 1 << 14, size_t n_max_vpass_size = 1 << 14, size_t n_SM_overload = 8) const;

	/**
	 *	@brief calculates scaled vectors
	 *
	 *	@param[in] h_context is OpenCL context (this is the only Calculate_*() function taking one)
	 *	@param[in] h_cmd_queue is OpenCL command queue
	 *	@param[in] n_vector_length is length of the vectors
	 *	@param[in] n_vector_num is number of the vectors
	 *	@param[in] dp_vectors_dst is a buffer (not const-qualified; presumably the output vectors)
	 *	@param[in] dp_vectors is a buffer (const-qualified; presumably the input vectors)
	 *	@param[in] dp_vector_lengths is a buffer (const-qualified; presumably the scaling
	 *		factors, e.g. as produced by Calculate_VectorReduction())
	 *	@param[in] n_max_pass_size is maximum pass size (1 << 24 by default; units to be confirmed)
	 *
	 *	@return NOTE(review): presumably returns true on success, false on failure.
	 */
	bool Calculate_ScaledVectors(cl_context h_context, cl_command_queue h_cmd_queue,
		size_t n_vector_length, size_t n_vector_num,
		cl_mem dp_vectors_dst, const cl_mem dp_vectors, const cl_mem dp_vector_lengths,
		size_t n_max_pass_size = 1 << 24) const;

protected:
	// the copy operations below are deliberately inaccessible from the outside and
	// have empty bodies (no member is copied); UNUSED() is not defined in this file
	inline CCLVectorKernels(const CCLVectorKernels &UNUSED(r_other)) {} // this object is not copy-able; use pointers instead
	inline CCLVectorKernels &operator =(const CCLVectorKernels &UNUSED(r_other)) { return *this; } // this object is not copy-able; use pointers instead
};

#endif // __OPENCL_VECTOR_KERNELS_INCLUDED
