/*
								+---------------------------------+
								|                                 |
								| ***  OpenCL vector kernels  *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2013  |
								|                                 |
								|        VectorKernels.cpp        |
								|                                 |
								+---------------------------------+
*/

#include <assert.h>
#define _ASSERTE(x) assert(x)
#include "VectorKernels.h"
#include "StlUtils.h"
#include "MinMax.h"

#if defined(_MSC_VER) && !defined(__MWERKS__) && !defined(for)
#define for if(0) {} else for
#endif

/*
 *								=== CCLVectorKernels ===
 */

/**
 *	@brief default constructor; sets default data types and operations
 *
 *	All types default to "float" and all OpenCL handles to 0; nothing is
 *	compiled here, that is done by Compile().
 */
CCLVectorKernels::CCLVectorKernels()
	:m_b_status(false),
	n_reduction_local_work_size(64),
	p_s_scalar_type("float"),
	p_s_reduction_accum_type("float"),
	p_s_reduction_type("float"),
	p_s_factor_type("float"),
	p_s_crosscorelelem_operation("(((x)-(y))*((x)-(y)))"),
	p_s_crosscorelreduce_operation("((x)+(y))"),
	p_s_crosscorelfinal_operation("((_TyReduction)sqrt((float)x))"),
	p_s_corelelem_operation("(((x)-(y))*((x)-(y)))"),
	p_s_corelreduce_operation("((x)+(y))"),
	p_s_corelfinal_operation("((_TyReduction)sqrt((float)x))"),
	p_s_reductionelem_operation("((x)*(x))"),
	p_s_reductionreduce_operation("((x)+(y))"),
	p_s_reductionfinal_operation("((_TyReduction)sqrt((float)x))"),
	p_s_scaleelem_operation("((x)/(y))"),
	h_program(0), h_program_const(0),
	h_length_kernel_v3(0), h_correl_kernel_v3(0), h_correl_kernel_v3_upper(0),
	h_correl_kernel_v3_upper_pack(0), h_correl_kernel_v3_NM(0),
	h_correl_kernel_v4(0), h_correl_kernel_v4_upper(0),
	h_correl_kernel_v4_upper_pack(0), h_correl_kernel_v4_NM(0),
	h_scale_kernel_v2_const_npot(0), h_scale_kernel_v2_const_pot(0),
	n_SM_num(1), n_const_memory_size(16384), n_local_memory_size(16384) // placeholders, Compile() overwrites these with the device values
{
	m_n_scalar_type_size = int(sizeof(float));
	m_n_factor_type = int(sizeof(float));
	// keep the type sizes in sync with the default "float" type names; without this they
	// stay uninitialized unless Set_DataTypes() is called, yet the cached correlation
	// functions read m_n_scalar_type_size to size the local memory
	// (assigned in the body rather than in the initializer list so that this does
	// not depend on the member declaration order, which is not visible here)
}

/**
 *	@brief destructor; releases all the OpenCL kernels and programs that were created
 *
 *	Compile() can fail after a program was built but before (all) the kernels
 *	were created, so each handle is checked individually rather than assuming
 *	that a valid program implies valid kernels. Null handles are never passed
 *	to the OpenCL runtime.
 */
CCLVectorKernels::~CCLVectorKernels()
{
	const cl_kernel p_kernel_list[] = {
		h_length_kernel_v3,
		h_correl_kernel_v3,
		h_correl_kernel_v3_NM,
		h_correl_kernel_v3_upper,
		h_correl_kernel_v3_upper_pack,
		h_correl_kernel_v4,
		h_correl_kernel_v4_NM,
		h_correl_kernel_v4_upper,
		h_correl_kernel_v4_upper_pack,
		h_scale_kernel_v2_const_npot,
		h_scale_kernel_v2_const_pot
	};
	const size_t n_kernel_num = sizeof(p_kernel_list) / sizeof(p_kernel_list[0]);
	for(size_t i = 0; i < n_kernel_num; ++ i) {
		if(p_kernel_list[i])
			clReleaseKernel(p_kernel_list[i]);
	}
	// release the kernels first (only those that were actually created)

	if(h_program)
		clReleaseProgram(h_program);
	if(h_program_const)
		clReleaseProgram(h_program_const);
	// release the programs
}

/**
 *	@brief sets a single data type for scalars, reduction accumulators, reduction results and factors
 *
 *	@param[in] p_s_data_type is the type name; it must not contain spaces and the pointer
 *		is stored as is (no copy is made here), so it needs to stay valid while this object uses it
 *	@param[in] n_size_of_data_type is size of the type in bytes (asserted to be that of float or double)
 *
 *	@note The final operations (the ones containing the casts) are not touched by this function.
 */
void CCLVectorKernels::Set_DataTypes(const char *p_s_data_type /*= "float"*/, int n_size_of_data_type /*= 4*/)
{
	_ASSERTE(!strchr(p_s_data_type, ' ')); // the name goes to a "-D NAME=%s" option, spaces are not supported
	_ASSERTE(n_size_of_data_type >= 4); // OpenCL cannot operate on bytes and shorts
	_ASSERTE(n_size_of_data_type == sizeof(float) ||
		n_size_of_data_type == sizeof(double)); // the code is still full of sizeof(float)
	// debug-only checks of the arguments

	p_s_factor_type = p_s_reduction_type =
		p_s_reduction_accum_type = p_s_scalar_type = p_s_data_type;
	// all four type names are the same

	m_n_factor_type = m_n_scalar_type_size = n_size_of_data_type;
	// and so are the sizes
}

/**
 *	@brief sets the expressions used by the reduction kernels
 *
 *	The strings are handed over to the OpenCL compiler as the ReductionElemOp,
 *	ReductionReduceOp and ReductionFinalOp definitions when Compile() is called.
 *	The pointers are stored as they are, no copies are made here.
 *
 *	@param[in] p_s_element_operation is the per-element expression (no spaces allowed)
 *	@param[in] p_s_reduce_operation is the expression combining two values (no spaces allowed)
 *	@param[in] p_s_final_operation is the expression applied to the result (no spaces allowed)
 */
void CCLVectorKernels::Configure_ReductionOp(const char *p_s_element_operation /*= "((x)*(x))"*/,
	const char *p_s_reduce_operation /*= "((x)+(y))"*/,
	const char *p_s_final_operation /*= "(sqrt(x))"*/)
{
	_ASSERTE(!strchr(p_s_element_operation, ' '));
	_ASSERTE(!strchr(p_s_reduce_operation, ' '));
	_ASSERTE(!strchr(p_s_final_operation, ' '));
	// none of the expressions may contain spaces at the moment

	p_s_reductionfinal_operation = p_s_final_operation;
	p_s_reductionreduce_operation = p_s_reduce_operation;
	p_s_reductionelem_operation = p_s_element_operation;
	// remember the expressions for Compile()
}

/**
 *	@brief sets the expressions used by the correlation kernels
 *
 *	The strings are handed over to the OpenCL compiler as the CorelElemOp,
 *	CorelReduceOp and CorelFinalOp definitions when Compile() is called.
 *	The pointers are stored as they are, no copies are made here.
 *
 *	@param[in] p_s_element_operation is the per-element expression (no spaces allowed)
 *	@param[in] p_s_reduce_operation is the expression combining two values (no spaces allowed)
 *	@param[in] p_s_final_operation is the expression applied to the result (no spaces allowed)
 */
void CCLVectorKernels::Configure_CorelationOp(const char *p_s_element_operation /*= "(((x)-(y))*((x)-(y)))"*/,
	const char *p_s_reduce_operation /*= "((x)+(y))"*/,
	const char *p_s_final_operation /*= "(sqrt(x))"*/)
{
	_ASSERTE(!strchr(p_s_element_operation, ' '));
	_ASSERTE(!strchr(p_s_reduce_operation, ' '));
	_ASSERTE(!strchr(p_s_final_operation, ' '));
	// none of the expressions may contain spaces at the moment

	p_s_corelfinal_operation = p_s_final_operation;
	p_s_corelreduce_operation = p_s_reduce_operation;
	p_s_corelelem_operation = p_s_element_operation;
	// remember the expressions for Compile()
}

/**
 *	@brief sets the expressions used by the cross-correlation kernels
 *
 *	The strings are handed over to the OpenCL compiler as the CrossCorelElemOp,
 *	CrossCorelReduceOp and CrossCorelFinalOp definitions when Compile() is called.
 *	The pointers are stored as they are, no copies are made here.
 *
 *	@param[in] p_s_element_operation is the per-element expression (no spaces allowed)
 *	@param[in] p_s_reduce_operation is the expression combining two values (no spaces allowed)
 *	@param[in] p_s_final_operation is the expression applied to the result (no spaces allowed)
 */
void CCLVectorKernels::Configure_CrossCorelationOp(const char *p_s_element_operation /*= "(((x)-(y))*((x)-(y)))"*/,
	const char *p_s_reduce_operation /*= "((x)+(y))"*/,
	const char *p_s_final_operation /*= "(sqrt(x))"*/)
{
	_ASSERTE(!strchr(p_s_element_operation, ' '));
	_ASSERTE(!strchr(p_s_reduce_operation, ' '));
	_ASSERTE(!strchr(p_s_final_operation, ' '));
	// none of the expressions may contain spaces at the moment

	p_s_crosscorelfinal_operation = p_s_final_operation;
	p_s_crosscorelreduce_operation = p_s_reduce_operation;
	p_s_crosscorelelem_operation = p_s_element_operation;
	// remember the expressions for Compile()
}

/**
 *	@brief sets the expression used by the scaling kernels
 *
 *	The string is handed over to the OpenCL compiler as the ScaleElemOp
 *	definition when Compile() is called. The pointer is stored as it is,
 *	no copy is made here.
 *
 *	@param[in] p_s_element_operation is the per-element expression (no spaces allowed)
 */
void CCLVectorKernels::Configure_ScalingOp(const char *p_s_element_operation /*= "((x)/(y))"*/)
{
	_ASSERTE(!strchr(p_s_element_operation, ' '));
	// the expression may not contain spaces at the moment

	p_s_scaleelem_operation = p_s_element_operation;
	// remember the expression for Compile()
}

/**
 *	@brief builds the OpenCL programs and looks up all the kernels
 *
 *	The source file is compiled twice: once with the configured types and
 *	operations passed as -D options, and once more with " -D WANT_SECOND_CONST"
 *	appended (the second program only supplies the "ScaleVectors_v2_const_POT" kernel).
 *	Device parameters (multiprocessor count, constant and local memory size) are
 *	queried along the way. Does nothing if a previous call already succeeded.
 *
 *	@param[in] h_context is OpenCL context to compile for
 *	@param[in] h_device is the target device
 *	@param[in] p_s_kernels_src is path to the file with the kernels source code
 *	@param[in] b_verbose is verbosity flag (if set, prints progress to stdout)
 *
 *	@return Returns true on success (or if already compiled), false on failure.
 */
bool CCLVectorKernels::Compile(cl_context h_context, cl_device_id h_device,
	const char *p_s_kernels_src /*= "Kernels.c"*/, bool b_verbose /*= false*/)
{
	if(m_b_status)
		return true;
	// nothing to do, already compiled

	_ASSERTE(b_Is_POT(n_reduction_local_work_size));
	// the reduction work-group size must be a power of two

	std::string s_build_options;
	if(!stl_ut::Format(s_build_options,
	   "-D REDUCTION_LOCAL_WORK_SIZE=" PRIsize " "
	   "-D SCALAR_TYPE=%s "
	   "-D REDUCTION_ACCUM_TYPE=%s "
	   "-D REDUCTION_TYPE=%s "
	   "-D FACTOR_TYPE=%s "
	   "-D CrossCorelElemOp=%s "
	   "-D CrossCorelReduceOp=%s "
	   "-D CrossCorelFinalOp=%s "
	   "-D CorelElemOp=%s "
	   "-D CorelReduceOp=%s "
	   "-D CorelFinalOp=%s "
	   "-D ReductionElemOp=%s "
	   "-D ReductionReduceOp=%s "
	   "-D ReductionFinalOp=%s "
	   "-D ScaleElemOp=%s",
	   n_reduction_local_work_size,
	   p_s_scalar_type, p_s_reduction_accum_type, p_s_reduction_type, p_s_factor_type,
	   p_s_crosscorelelem_operation, p_s_crosscorelreduce_operation, p_s_crosscorelfinal_operation,
	   p_s_corelelem_operation, p_s_corelreduce_operation, p_s_corelfinal_operation,
	   p_s_reductionelem_operation, p_s_reductionreduce_operation, p_s_reductionfinal_operation,
	   p_s_scaleelem_operation))
		return false;
	// put together the compiler command line (the configuration is passed as macros)

	int n_status_word;
	if(b_verbose)
		printf("loading \'%s\' ... ", p_s_kernels_src);
	int n_build_result = CCLProgramCompiler::n_CompileProgramFile(h_context, &h_program,
		p_s_kernels_src, 1, &h_device, s_build_options.c_str(), "Kernels.clbin", &n_status_word);
	if(b_verbose)
		CCLProgramCompiler::Dump_StatusWord(n_status_word); // see the result
	if(n_build_result != CL_SUCCESS) {
		fprintf(stderr, "error: failed to load OpenCL program\n");
		return false;
	}
	// build the first program

	if(b_verbose)
		printf("loading \'%s\' ... ", p_s_kernels_src);
	if(!stl_ut::AppendCStr(s_build_options, " -D WANT_SECOND_CONST"))
		return false;
	n_build_result = CCLProgramCompiler::n_CompileProgramFile(h_context, &h_program_const,
		p_s_kernels_src, 1, &h_device, s_build_options.c_str(), "Kernels1.clbin", &n_status_word);
	if(b_verbose)
		CCLProgramCompiler::Dump_StatusWord(n_status_word); // see the result
	if(n_build_result != CL_SUCCESS) {
		fprintf(stderr, "error: failed to load OpenCL program\n");
		return false;
	}
	// build the second program from the same source, with one more macro defined

	CCLDeviceParams device_params(h_device);
	n_SM_num = device_params.n_Multiprocessor_Num();
	n_const_memory_size = device_params.t_Properties().totalConstantMemory;
	n_local_memory_size = device_params.t_Properties().sharedMemPerBlock;
	// get device params

	struct TKernelBinding {
		cl_kernel *p_handle; // where to store the kernel
		cl_program h_owner; // the program that contains it
		const char *p_s_name; // kernel function name
	};
	const TKernelBinding p_binding_list[] = {
		{&h_length_kernel_v3, h_program, "VectorLengths_v3_multi"},
		{&h_correl_kernel_v3, h_program, "VectorCorelN_to_N"},
		{&h_correl_kernel_v3_NM, h_program, "VectorCorelN_to_M"},
		{&h_correl_kernel_v3_upper, h_program, "VectorCorelN_to_N_upper"},
		{&h_correl_kernel_v3_upper_pack, h_program, "VectorCorelN_to_N_upper_packed"},
		{&h_correl_kernel_v4, h_program, "VectorCorelN_to_N_v2"},
		{&h_correl_kernel_v4_NM, h_program, "VectorCorelN_to_M_v2"},
		{&h_correl_kernel_v4_upper, h_program, "VectorCorelN_to_N_v2_upper"},
		{&h_correl_kernel_v4_upper_pack, h_program, "VectorCorelN_to_N_v2_upper_pack"},
		{&h_scale_kernel_v2_const_npot, h_program, "ScaleVectors_v2_const_NPOT"},
		{&h_scale_kernel_v2_const_pot, h_program_const, "ScaleVectors_v2_const_POT"}
	};
	const size_t n_binding_num = sizeof(p_binding_list) / sizeof(p_binding_list[0]);
	// the kernels to get, in the order they are created

	for(size_t i = 0; i < n_binding_num; ++ i) {
		const TKernelBinding &r_binding = p_binding_list[i];

		cl_int n_create_result;
		*r_binding.p_handle = clCreateKernel(r_binding.h_owner, r_binding.p_s_name, &n_create_result);
		if(n_create_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return false; // stop at the first failure
		}
	}
	// get OpenCL kernel(s)

	m_b_status = true;
	// compiled

	return true;
}

/**
 *	@brief enqueues the reduction kernel ("VectorLengths_v3_multi") for all the vectors
 *
 *	The vectors are processed in one or more passes; each pass is a single kernel
 *	launch over a contiguous range of vectors. The kernels are only enqueued,
 *	this function does not wait for them to finish.
 *
 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels to
 *	@param[in] n_vector_length is length of the vectors (passed to the kernel as an int)
 *	@param[in] n_vector_num is number of vectors (zero is allowed, nothing is enqueued then)
 *	@param[in] dp_vector_lengths is the buffer passed as the first kernel argument
 *		(presumably receives one result per vector)
 *	@param[in] dp_vectors is the buffer passed as the second kernel argument (presumably the input vectors)
 *	@param[in] n_max_pass_size is the desired maximum number of vectors per pass (must be nonzero);
 *		note that the last pass takes the remainder of the division and can be slightly larger
 *	@param[in] n_SM_overload is the number of work-groups to schedule per multiprocessor
 *
 *	@return Returns true on success, false on failure (not compiled,
 *		zero n_max_pass_size or a failed kernel launch).
 *
 *	@note Vector indices are converted to int for the kernel, so the inputs must fit in that range.
 */
bool CCLVectorKernels::Calculate_VectorReduction(cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num,
	cl_mem dp_vector_lengths, const cl_mem dp_vectors,
	size_t n_max_pass_size /*= 1 << 24*/, size_t n_SM_overload /*= 16*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	_ASSERTE(n_max_pass_size > 0);
	if(!n_max_pass_size)
		return false;
	// zero pass size would cause division by zero below

	if(!n_vector_num)
		return true;
	// nothing to do (also avoids zero passes and the division by zero that follows)

	const size_t n_pass_num = (n_vector_num + n_max_pass_size - 1) / n_max_pass_size; // at least 1 here
	const size_t n_nominal_pass_size = n_vector_num / n_pass_num;
	const size_t n_last_pass_size = n_vector_num - (n_pass_num - 1) * n_nominal_pass_size;
	// decide number of passes (max # vectors in one pass disregards GPU size,
	// it is probably a good thing if it's a power of two)

	size_t n_local_work_size = n_reduction_local_work_size;
	// the same in all the passes

	for(size_t n_pass = 0, n_first_vector = 0; n_pass < n_pass_num; ++ n_pass) {
		const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;

		size_t n_global_work_size = min(size_t(n_SM_overload * n_SM_num),
			n_Make_POT(n_pass_size)) * n_local_work_size;
		// changes how many groups are scheduled; probably several times GPU size
		// is good (8x or 16x; 32x doesn't add any more gains)

		clSetKernelArgs5(h_length_kernel_v3, dp_vector_lengths, dp_vectors,
			int(n_first_vector), int(n_vector_length), int(n_first_vector + n_pass_size));
		// set kernel parameters (the range of vectors is [first, first + pass size))

		int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_length_kernel_v3, 1,
			NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 8 (gs %lu, ls %lu, result %d)\n",
				static_cast<unsigned long>(n_global_work_size),
				static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
			return false;
		}
		// launch kernel

		n_first_vector += n_pass_size;
	}
	// process all the vectors, one pass at a time

	return true;
}

/**
 *	@brief enqueues the "VectorCorelN_to_M" kernel for all pairs of vectors from two sets
 *
 *	The n_vector_num x n_vector_num2 problem is split to rectangular tiles;
 *	each tile is a single kernel launch. The kernels are only enqueued,
 *	this function does not wait for them to finish.
 *
 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels to
 *	@param[in] n_vector_length is length of the vectors (passed to the kernel as an int)
 *	@param[in] n_vector_num is number of vectors in the first set (indexes matrix columns)
 *	@param[in] n_matrix_stride is the matrix row stride, in elements (must not be below n_vector_num)
 *	@param[in] n_vector_num2 is number of vectors in the second set (indexes matrix rows)
 *	@param[in] dp_corel_full_matrix is the matrix buffer, passed as the first kernel argument
 *		(presumably receives the results)
 *	@param[in] dp_vectors is the buffer with the first set of vectors
 *	@param[in] dp_vectors2 is the buffer with the second set of vectors
 *	@param[in] n_max_pass_size is the desired maximum tile size in either direction (must be nonzero);
 *		note that the last tile takes the remainder of the division and can be slightly larger
 *	@param[in] n_SM_overload is the number of work-groups to schedule per multiprocessor
 *
 *	@return Returns true on success (also if either of the sets is empty), false on failure
 *		(not compiled, zero n_max_pass_size or a failed kernel launch).
 *
 *	@note Indices and offsets are converted to int for the kernel, so they must fit in that range.
 */
bool CCLVectorKernels::Calculate_VectorsCorelation(cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
	size_t n_vector_num2, cl_mem dp_corel_full_matrix, const cl_mem dp_vectors,
	const cl_mem dp_vectors2, size_t n_max_pass_size /*= 1 << 12*/, size_t n_SM_overload /*= 16*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	_ASSERTE(n_matrix_stride >= n_vector_num);

	_ASSERTE(n_max_pass_size > 0);
	if(!n_max_pass_size)
		return false;
	// zero pass size would cause division by zero below

	if(!n_vector_num || !n_vector_num2)
		return true;
	// nothing to do (also avoids zero passes and the division by zero that follows)

	const size_t n_pass_num = (n_vector_num + n_max_pass_size - 1) / n_max_pass_size; // at least 1 here
	const size_t n_nominal_pass_size = n_vector_num / n_pass_num;
	const size_t n_last_pass_size = n_vector_num - (n_pass_num - 1) * n_nominal_pass_size;
	const size_t n_vpass_num = (n_vector_num2 + n_max_pass_size - 1) / n_max_pass_size; // at least 1 here
	const size_t n_nominal_vpass_size = n_vector_num2 / n_vpass_num;
	const size_t n_last_vpass_size = n_vector_num2 - (n_vpass_num - 1) * n_nominal_vpass_size;
	// decide number of passes (max # vectors in one pass disregards GPU size,
	// it is probably a good thing if it's a power of two)

	size_t n_local_work_size = n_reduction_local_work_size;
	// the same in all the tiles

	for(size_t n_vpass = 0, n_first_vector2 = 0; n_vpass < n_vpass_num; ++ n_vpass) {
		const size_t n_vpass_size = (n_vpass + 1 == n_vpass_num)? n_last_vpass_size : n_nominal_vpass_size;
		// vertical passes

		for(size_t n_pass = 0, n_first_vector = 0; n_pass < n_pass_num; ++ n_pass) {
			const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;

			size_t n_global_work_size = min(size_t(n_SM_overload * n_SM_num),
				n_Make_POT(n_vpass_size * n_pass_size)) * n_local_work_size;
			// changes how many groups are scheduled; probably several times GPU size
			// is good (8x or 16x; 32x doesn't add any more gains)

			clSetKernelArgs10(h_correl_kernel_v3_NM, dp_corel_full_matrix,
				int(n_first_vector + n_first_vector2 * n_matrix_stride), int(n_matrix_stride), // first (points to a scanline in the matrix), stride
				dp_vectors, int(n_first_vector), int(n_vector_length), int(n_first_vector + n_pass_size),
				dp_vectors2, int(n_first_vector2), int(n_first_vector2 + n_vpass_size));
			// set kernel parameters

			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_correl_kernel_v3_NM, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 8 (gs %lu, ls %lu, result %d)\n",
					static_cast<unsigned long>(n_global_work_size),
					static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
				return false;
			}
			// launch kernel

			n_first_vector += n_pass_size;
		}

		n_first_vector2 += n_vpass_size;
	}
	// process all the pairs of vectors (works in tiles, last block row and block
	// column is possibly different from the others)

	return true;
}

/**
 *	@brief enqueues the "VectorCorelN_to_M_v2" kernel for all pairs of vectors from two sets
 *
 *	Same tiling as in Calculate_VectorsCorelation(), but this kernel additionally gets
 *	a local memory buffer for one vector (n_vector_length elements of the scalar type).
 *	The kernels are only enqueued, this function does not wait for them to finish.
 *
 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels to
 *	@param[in] n_vector_length is length of the vectors (passed to the kernel as an int)
 *	@param[in] n_vector_num is number of vectors in the first set (indexes matrix columns)
 *	@param[in] n_matrix_stride is the matrix row stride, in elements (must not be below n_vector_num)
 *	@param[in] n_vector_num2 is number of vectors in the second set (indexes matrix rows)
 *	@param[in] dp_corel_full_matrix is the matrix buffer, passed as the first kernel argument
 *		(presumably receives the results)
 *	@param[in] dp_vectors is the buffer with the first set of vectors
 *	@param[in] dp_vectors2 is the buffer with the second set of vectors
 *	@param[in] n_max_pass_size is the desired maximum tile size in either direction (must be nonzero);
 *		note that the last tile takes the remainder of the division and can be slightly larger
 *	@param[in] n_SM_overload is the number of work-groups to schedule per multiprocessor
 *
 *	@return Returns true on success (also if either of the sets is empty), false on failure
 *		(not compiled, vectors too long for the local memory, zero n_max_pass_size
 *		or a failed kernel launch).
 *
 *	@note If this fails because the vectors are too long, use the uncached variant instead.
 *	@note Indices and offsets are converted to int for the kernel, so they must fit in that range.
 */
bool CCLVectorKernels::Calculate_VectorsCorelation_Cached(cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
	size_t n_vector_num2, cl_mem dp_corel_full_matrix, const cl_mem dp_vectors,
	const cl_mem dp_vectors2, size_t n_max_pass_size /*= 1 << 12*/, size_t n_SM_overload /*= 16*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	if((n_reduction_local_work_size + n_vector_length) * m_n_scalar_type_size > n_local_memory_size)
		return false;
	// the vectors are too big, can not be cached - use the uncached variant

	_ASSERTE(n_matrix_stride >= n_vector_num);

	_ASSERTE(n_max_pass_size > 0);
	if(!n_max_pass_size)
		return false;
	// zero pass size would cause division by zero below

	if(!n_vector_num || !n_vector_num2)
		return true;
	// nothing to do (also avoids zero passes and the division by zero that follows)

	const size_t n_pass_num = (n_vector_num + n_max_pass_size - 1) / n_max_pass_size; // at least 1 here
	const size_t n_nominal_pass_size = n_vector_num / n_pass_num;
	const size_t n_last_pass_size = n_vector_num - (n_pass_num - 1) * n_nominal_pass_size;
	const size_t n_vpass_num = (n_vector_num2 + n_max_pass_size - 1) / n_max_pass_size; // at least 1 here
	const size_t n_nominal_vpass_size = n_vector_num2 / n_vpass_num;
	const size_t n_last_vpass_size = n_vector_num2 - (n_vpass_num - 1) * n_nominal_vpass_size;
	// decide number of passes (max # vectors in one pass disregards GPU size,
	// it is probably a good thing if it's a power of two)

	size_t n_local_work_size = n_reduction_local_work_size;
	// the same in all the tiles

	for(size_t n_vpass = 0, n_first_vector2 = 0; n_vpass < n_vpass_num; ++ n_vpass) {
		const size_t n_vpass_size = (n_vpass + 1 == n_vpass_num)? n_last_vpass_size : n_nominal_vpass_size;
		// vertical passes

		for(size_t n_pass = 0, n_first_vector = 0; n_pass < n_pass_num; ++ n_pass) {
			const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;

			size_t n_global_work_size = min(size_t(n_SM_overload * n_SM_num),
				n_Make_POT(n_vpass_size * n_pass_size)) * n_local_work_size;
			// changes how many groups are scheduled; probably several times GPU size
			// is good (8x or 16x; 32x doesn't add any more gains)

			clSetKernelArgs11(h_correl_kernel_v4_NM, dp_corel_full_matrix,
				int(n_first_vector + n_first_vector2 * n_matrix_stride), int(n_matrix_stride), // first (points to a scanline in the matrix), stride
				dp_vectors, int(n_first_vector), int(n_vector_length), int(n_first_vector + n_pass_size),
				dp_vectors2, int(n_first_vector2), int(n_first_vector2 + n_vpass_size),
				CCLLocalMem(n_vector_length * m_n_scalar_type_size)); // local memory for a single vector
			// set kernel parameters

			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_correl_kernel_v4_NM, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 8 (gs %lu, ls %lu, result %d)\n",
					static_cast<unsigned long>(n_global_work_size),
					static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
				return false;
			}
			// launch kernel

			n_first_vector += n_pass_size;
		}

		n_first_vector2 += n_vpass_size;
	}
	// process all the pairs of vectors (works in tiles, last block row and block
	// column is possibly different from the others)

	return true;
}

/**
 *	@brief enqueues the "VectorCorelN_to_N" kernel for all pairs of vectors from a single set
 *
 *	The n_vector_num x n_vector_num problem is split to tiles (the same split is used
 *	in both directions); each tile is a single kernel launch. The kernels are only
 *	enqueued, this function does not wait for them to finish.
 *
 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels to
 *	@param[in] n_vector_length is length of the vectors (passed to the kernel as an int)
 *	@param[in] n_vector_num is number of vectors
 *	@param[in] n_matrix_stride is the matrix row stride, in elements (must not be below n_vector_num)
 *	@param[in] dp_crosscorel_full_matrix is the matrix buffer, passed as the first kernel argument
 *		(presumably receives the results)
 *	@param[in] dp_vectors is the buffer with the vectors
 *	@param[in] n_max_pass_size is the desired maximum tile size in either direction (must be nonzero);
 *		note that the last tile takes the remainder of the division and can be slightly larger
 *	@param[in] n_SM_overload is the number of work-groups to schedule per multiprocessor
 *
 *	@return Returns true on success (also if there are no vectors), false on failure
 *		(not compiled, zero n_max_pass_size or a failed kernel launch).
 *
 *	@note Indices and offsets are converted to int for the kernel, so they must fit in that range.
 */
bool CCLVectorKernels::Calculate_VectorsCrossCorelation(cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
	cl_mem dp_crosscorel_full_matrix, const cl_mem dp_vectors,
	size_t n_max_pass_size /*= 1 << 24*/, size_t n_SM_overload /*= 16*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	_ASSERTE(n_matrix_stride >= n_vector_num);

	_ASSERTE(n_max_pass_size > 0);
	if(!n_max_pass_size)
		return false;
	// zero pass size would cause division by zero below

	if(!n_vector_num)
		return true;
	// nothing to do (also avoids zero passes and the division by zero that follows)

	const size_t n_pass_num = (n_vector_num + n_max_pass_size - 1) / n_max_pass_size; // at least 1 here
	const size_t n_nominal_pass_size = n_vector_num / n_pass_num;
	const size_t n_last_pass_size = n_vector_num - (n_pass_num - 1) * n_nominal_pass_size;
	// decide number of passes (max # vectors in one pass disregards GPU size,
	// it is probably a good thing if it's a power of two)

	size_t n_local_work_size = n_reduction_local_work_size;
	// the same in all the tiles

	for(size_t n_vpass = 0, n_first_vector2 = 0; n_vpass < n_pass_num; ++ n_vpass) {
		const size_t n_vpass_size = (n_vpass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;
		// vertical passes

		for(size_t n_pass = 0, n_first_vector = 0; n_pass < n_pass_num; ++ n_pass) {
			const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;

			size_t n_global_work_size = min(size_t(n_SM_overload * n_SM_num),
				n_Make_POT(n_vpass_size * n_pass_size)) * n_local_work_size;
			// changes how many groups are scheduled; probably several times GPU size
			// is good (8x or 16x; 32x doesn't add any more gains)

			clSetKernelArgs9(h_correl_kernel_v3, dp_crosscorel_full_matrix,
				int(n_first_vector + n_first_vector2 * n_matrix_stride), int(n_matrix_stride), // first (points to a scanline in the matrix), stride
				dp_vectors, int(n_first_vector), int(n_vector_length), int(n_first_vector + n_pass_size),
				int(n_first_vector2), int(n_first_vector2 + n_vpass_size));
			// set kernel parameters

			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_correl_kernel_v3, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 8 (gs %lu, ls %lu, result %d)\n",
					static_cast<unsigned long>(n_global_work_size),
					static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
				return false;
			}
			// launch kernel

			n_first_vector += n_pass_size;
		}

		n_first_vector2 += n_vpass_size;
	}
	// process all the pairs of vectors (works in tiles, last block row and block
	// column is possibly different from the others)

	return true;
}

/**
 *	@brief enqueues the "VectorCorelN_to_N_upper" kernel for the tiles that reach above the diagonal
 *
 *	The n_vector_num x n_vector_num problem is split to tiles (the same split is used
 *	in both directions); tiles that lie entirely below the diagonal are skipped, each
 *	of the remaining tiles is a single kernel launch. The matrix uses full storage.
 *	The kernels are only enqueued, this function does not wait for them to finish.
 *
 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels to
 *	@param[in] n_vector_length is length of the vectors (passed to the kernel as an int)
 *	@param[in] n_vector_num is number of vectors
 *	@param[in] n_matrix_stride is the matrix row stride, in elements (must not be below n_vector_num)
 *	@param[in] dp_crosscorel_full_matrix is the matrix buffer, passed as the first kernel argument
 *		(presumably receives the results)
 *	@param[in] dp_vectors is the buffer with the vectors
 *	@param[in] n_max_pass_size is the desired maximum tile size in either direction (must be nonzero);
 *		note that the last tile takes the remainder of the division and can be slightly larger
 *	@param[in] n_SM_overload is the number of work-groups to schedule per multiprocessor
 *
 *	@return Returns true on success (also if there are no vectors), false on failure
 *		(not compiled, zero n_max_pass_size or a failed kernel launch).
 *
 *	@note Indices and offsets are converted to int for the kernel, so they must fit in that range.
 */
bool CCLVectorKernels::Calculate_VectorsCrossCorelation_Upper(cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
	cl_mem dp_crosscorel_full_matrix, const cl_mem dp_vectors,
	size_t n_max_pass_size /*= 1 << 24*/, size_t n_SM_overload /*= 16*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	_ASSERTE(n_matrix_stride >= n_vector_num);

	_ASSERTE(n_max_pass_size > 0);
	if(!n_max_pass_size)
		return false;
	// zero pass size would cause division by zero below

	if(!n_vector_num)
		return true;
	// nothing to do (also avoids zero passes and the division by zero that follows)

	const size_t n_pass_num = (n_vector_num + n_max_pass_size - 1) / n_max_pass_size; // at least 1 here
	const size_t n_nominal_pass_size = n_vector_num / n_pass_num;
	const size_t n_last_pass_size = n_vector_num - (n_pass_num - 1) * n_nominal_pass_size;
	// decide number of passes (max # vectors in one pass disregards GPU size,
	// it is probably a good thing if it's a power of two)

	size_t n_local_work_size = n_reduction_local_work_size;
	// the same in all the tiles

	for(size_t n_vpass = 0, n_first_vector2 = 0; n_vpass < n_pass_num; ++ n_vpass) {
		const size_t n_vpass_size = (n_vpass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;
		// vertical passes

		for(size_t n_pass = 0, n_first_vector = 0; n_pass < n_pass_num; ++ n_pass) {
			const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;

			if(n_first_vector2 >= n_first_vector + n_pass_size) {
				n_first_vector += n_pass_size;
				continue;
			}
			// the whole tile would be below the diagonal, skip it

			size_t n_global_work_size = min(size_t(n_SM_overload * n_SM_num),
				n_Make_POT(n_vpass_size * n_pass_size)) * n_local_work_size;
			// changes how many groups are scheduled; probably several times GPU size
			// is good (8x or 16x; 32x doesn't add any more gains)

			clSetKernelArgs9(h_correl_kernel_v3_upper, dp_crosscorel_full_matrix,
				int(n_first_vector + n_first_vector2 * n_matrix_stride), int(n_matrix_stride), // first (points to a scanline in the matrix), stride
				dp_vectors, int(n_first_vector), int(n_vector_length), int(n_first_vector + n_pass_size),
				int(n_first_vector2), int(n_first_vector2 + n_vpass_size));
			// set kernel parameters

			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_correl_kernel_v3_upper, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 8 (gs %lu, ls %lu, result %d)\n",
					static_cast<unsigned long>(n_global_work_size),
					static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
				return false;
			}
			// launch kernel

			n_first_vector += n_pass_size;
		}

		n_first_vector2 += n_vpass_size;
	}
	// process the pairs of vectors (works in tiles, last block row and block
	// column is possibly different from the others)

	return true;
}

/**
 *	@brief enqueues the "VectorCorelN_to_N_upper_packed" kernel for the tiles that reach above the diagonal
 *
 *	The n_vector_num x n_vector_num problem is split to tiles (the same split is used
 *	in both directions); tiles that lie entirely below the diagonal are skipped, each
 *	of the remaining tiles is a single kernel launch. Unlike in the full-storage variant,
 *	the offset of a tile in the matrix is calculated using n_TriangularOffset().
 *	The kernels are only enqueued, this function does not wait for them to finish.
 *
 *	@param[in] h_cmd_queue is the command queue to enqueue the kernels to
 *	@param[in] n_vector_length is length of the vectors (passed to the kernel as an int)
 *	@param[in] n_vector_num is number of vectors
 *	@param[in] n_matrix_stride is the matrix size passed to n_TriangularOffset() and to the kernel
 *		(must not be below n_vector_num)
 *	@param[in] dp_crosscorel_triu_matrix is the matrix buffer, passed as the first kernel argument
 *		(presumably receives the results, in packed upper-triangular layout)
 *	@param[in] dp_vectors is the buffer with the vectors
 *	@param[in] n_max_pass_size is the desired maximum tile size in either direction (must be nonzero);
 *		note that the last tile takes the remainder of the division and can be slightly larger
 *	@param[in] n_SM_overload is the number of work-groups to schedule per multiprocessor
 *
 *	@return Returns true on success (also if there are no vectors), false on failure
 *		(not compiled, zero n_max_pass_size or a failed kernel launch).
 *
 *	@note Indices and offsets are converted to int for the kernel, so they must fit in that range.
 */
bool CCLVectorKernels::Calculate_VectorsCrossCorelation_UpperPacked(cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
	cl_mem dp_crosscorel_triu_matrix, const cl_mem dp_vectors,
	size_t n_max_pass_size /*= 1 << 24*/, size_t n_SM_overload /*= 16*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	_ASSERTE(n_matrix_stride >= n_vector_num);

	_ASSERTE(n_max_pass_size > 0);
	if(!n_max_pass_size)
		return false;
	// zero pass size would cause division by zero below

	if(!n_vector_num)
		return true;
	// nothing to do (also avoids zero passes and the division by zero that follows)

	const size_t n_pass_num = (n_vector_num + n_max_pass_size - 1) / n_max_pass_size; // at least 1 here
	const size_t n_nominal_pass_size = n_vector_num / n_pass_num;
	const size_t n_last_pass_size = n_vector_num - (n_pass_num - 1) * n_nominal_pass_size;
	// decide number of passes (max # vectors in one pass disregards GPU size,
	// it is probably a good thing if it's a power of two)

	size_t n_local_work_size = n_reduction_local_work_size;
	// the same in all the tiles

	for(size_t n_vpass = 0, n_first_vector2 = 0; n_vpass < n_pass_num; ++ n_vpass) {
		const size_t n_vpass_size = (n_vpass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;
		// vertical passes

		for(size_t n_pass = 0, n_first_vector = 0; n_pass < n_pass_num; ++ n_pass) {
			const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;

			if(n_first_vector2 >= n_first_vector + n_pass_size) {
				n_first_vector += n_pass_size;
				continue;
			}
			// the whole tile would be below the diagonal, skip it

			size_t n_global_work_size = min(size_t(n_SM_overload * n_SM_num),
				n_Make_POT(n_vpass_size * n_pass_size)) * n_local_work_size;
			// changes how many groups are scheduled; probably several times GPU size
			// is good (8x or 16x; 32x doesn't add any more gains)

			clSetKernelArgs9(h_correl_kernel_v3_upper_pack, dp_crosscorel_triu_matrix,
				int(n_TriangularOffset(n_first_vector2, n_first_vector, n_matrix_stride)), int(n_matrix_stride), // first (offset of the tile in the packed matrix), stride
				dp_vectors, int(n_first_vector), int(n_vector_length), int(n_first_vector + n_pass_size),
				int(n_first_vector2), int(n_first_vector2 + n_vpass_size));
			// set kernel parameters

			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_correl_kernel_v3_upper_pack, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 8 (gs %lu, ls %lu, result %d)\n",
					static_cast<unsigned long>(n_global_work_size),
					static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
				return false;
			}
			// launch kernel

			n_first_vector += n_pass_size;
		}

		n_first_vector2 += n_vpass_size;
	}
	// process the pairs of vectors (works in tiles, last block row and block
	// column is possibly different from the others)

	return true;
}

/*
 *	Fills the full (square) result matrix by launching h_correl_kernel_v4 once per tile.
 *
 *	h_cmd_queue is the command queue the kernels are enqueued to (nothing is waited for here),
 *	n_vector_length is number of elements of each vector, n_vector_num is number of vectors
 *	(the result has n_vector_num x n_vector_num entries), n_matrix_stride is the row stride
 *	of the result matrix in elements (must not be smaller than n_vector_num),
 *	dp_crosscorel_full_matrix is the destination buffer, dp_vectors is the buffer with the vectors
 *	(presumably stored one after another, n_vector_length elements each - confirm against the kernel),
 *	n_max_pass_size / n_max_vpass_size limit the tile width / height in vectors
 *	and n_SM_overload is the number of work-groups scheduled per multiprocessor.
 *
 *	Returns true on success (also if there are no vectors), false if the program did not compile,
 *	if a vector does not fit in local memory, if a tile size limit is zero or if a launch fails.
 */
bool CCLVectorKernels::Calculate_VectorsCrossCorelation_Cached(cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
	cl_mem dp_crosscorel_full_matrix, const cl_mem dp_vectors,
	size_t n_max_pass_size /*= 1 << 14*/, size_t n_max_vpass_size /*= 1 << 14*/, size_t n_SM_overload /*= 8*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	if((n_reduction_local_work_size + n_vector_length) * m_n_scalar_type_size/*sizeof(float)*/ > n_local_memory_size)
		return false;
	// the vectors are too big, can not be cached - use the uncached variant

	cl_mem dp_vector_correlations = dp_crosscorel_full_matrix; // alias
	const size_t n_term_num = n_vector_num; // alias
	const size_t n_term_num_POT = n_matrix_stride;
	_ASSERTE(n_term_num_POT >= n_term_num);

	if(!n_max_pass_size || !n_max_vpass_size)
		return false;
	// a zero tile size limit is an invalid argument (and would divide by zero below)

	if(!n_term_num)
		return true;
	// no vectors, no tiles; the pass count would be zero and the division below would fail

	/*size_t n_single_thread_elems = max(size_t(1),
		(n_vector_length + n_reduction_local_work_size - 1) / n_reduction_local_work_size); // just debug
	printf("parallel multi-reduction: %d dimensions, %d local work-items,"
		" %d elems processed by a single thread\n", n_vector_length, n_reduction_local_work_size,
		n_single_thread_elems);*/ // debug

	const size_t n_pass_num = (n_term_num + n_max_pass_size - 1) / n_max_pass_size;
	const size_t n_nominal_pass_size = n_term_num / n_pass_num;
	const size_t n_last_pass_size = n_term_num - (n_pass_num - 1) * n_nominal_pass_size;
	const size_t n_vpass_num = (n_term_num + n_max_vpass_size - 1) / n_max_vpass_size;
	const size_t n_nominal_vpass_size = n_term_num / n_vpass_num;
	const size_t n_last_vpass_size = n_term_num - (n_vpass_num - 1) * n_nominal_vpass_size;
	// decide number of passes (the last pass takes the division remainder,
	// so it can be a few vectors larger than the nominal one)

	//printf("have %d x %d passes\n", n_pass_num, n_pass_num);
	for(size_t n_vpass = 0, n_first_vector2 = 0; n_vpass < n_vpass_num; ++ n_vpass) {
		const size_t n_vpass_size = (n_vpass + 1 == n_vpass_num)? n_last_vpass_size : n_nominal_vpass_size;
		// vertical passes (rows of tiles)

		for(size_t n_pass = 0, n_first_vector = 0; n_pass < n_pass_num; ++ n_pass) {
			const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;
			// horizontal passes (tiles in the current row of tiles)

			size_t n_local_work_size = n_reduction_local_work_size;
			size_t n_global_work_size = min(size_t(n_SM_overload * n_SM_num),
				n_Make_POT(n_vpass_size * n_pass_size)) * n_local_work_size;
			// changes how many groups are scheduled; probably several times GPU size
			// is good (8x or 16x; 32x doesn't add any more gains)

			/*printf("correlation tile (%d - %d) * (%d - %d)\n",
				n_first_vector, n_first_vector + n_pass_size,
				n_first_vector2, n_first_vector2 + n_vpass_size); // debug
			size_t n_work_group_num = n_global_work_size / n_local_work_size;
			printf("pass(local work size: %d, work group num: %d, pass size: %d, kernel iterations: %d)\n",
				n_local_work_size, n_work_group_num, n_pass_size,
				(n_pass_size + n_work_group_num - 1) / n_work_group_num);*/ // debug

			clSetKernelArgs10(h_correl_kernel_v4, dp_vector_correlations,
				int(n_first_vector + n_first_vector2 * n_term_num_POT), int(n_term_num_POT), // first (points to a scanline in the matrix), stride
				dp_vectors, int(n_first_vector), int(n_vector_length), int(n_first_vector + n_pass_size),
				int(n_first_vector2), int(n_first_vector2 + n_vpass_size),
				CCLLocalMem(n_vector_length * m_n_scalar_type_size/*sizeof(float)*/)); // the size must match type in shader
			// set kernel parameters

			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_correl_kernel_v4, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 8 (gs %lu, ls %lu, result %d)\n",
					static_cast<unsigned long>(n_global_work_size),
					static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
				return false;
			}
			// launch kernel

			n_first_vector += n_pass_size;
			//break; // debug - do only the first tile
		}

		n_first_vector2 += n_vpass_size;
		//break; // debug - do only the first tile
	}
	// calculate correlations of all the vectors (works in tiles, last block row and block
	// column is possibly different from the others)

	return true;
}

/*
 *	Fills the result matrix by launching h_correl_kernel_v4_upper once per tile, skipping the tiles
 *	that lie entirely below the diagonal. The matrix is addressed as a full matrix with row stride
 *	n_matrix_stride.
 *
 *	h_cmd_queue is the command queue the kernels are enqueued to (nothing is waited for here),
 *	n_vector_length is number of elements of each vector, n_vector_num is number of vectors,
 *	n_matrix_stride is the row stride of the result matrix in elements (must not be smaller
 *	than n_vector_num), dp_crosscorel_full_matrix is the destination buffer, dp_vectors is the buffer
 *	with the vectors (presumably stored one after another, n_vector_length elements each - confirm
 *	against the kernel), n_max_pass_size / n_max_vpass_size limit the tile width / height in vectors
 *	and n_SM_overload is the number of work-groups scheduled per multiprocessor.
 *
 *	Returns true on success (also if there are no vectors), false if the program did not compile,
 *	if a vector does not fit in local memory, if a tile size limit is zero or if a launch fails.
 */
bool CCLVectorKernels::Calculate_VectorsCrossCorelation_Upper_Cached(cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
	cl_mem dp_crosscorel_full_matrix, const cl_mem dp_vectors,
	size_t n_max_pass_size /*= 1 << 14*/, size_t n_max_vpass_size /*= 1 << 14*/, size_t n_SM_overload /*= 8*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	if((n_reduction_local_work_size + n_vector_length) * m_n_scalar_type_size/*sizeof(float)*/ > n_local_memory_size)
		return false;
	// the vectors are too big, can not be cached - use the uncached variant

	cl_mem dp_vector_correlations = dp_crosscorel_full_matrix; // alias
	const size_t n_term_num = n_vector_num; // alias
	const size_t n_term_num_POT = n_matrix_stride;
	_ASSERTE(n_term_num_POT >= n_term_num);

	if(!n_max_pass_size || !n_max_vpass_size)
		return false;
	// a zero tile size limit is an invalid argument (and would divide by zero below)

	if(!n_term_num)
		return true;
	// no vectors, no tiles; the pass count would be zero and the division below would fail

	/*size_t n_single_thread_elems = max(size_t(1),
		(n_vector_length + n_reduction_local_work_size - 1) / n_reduction_local_work_size); // just debug
	printf("parallel multi-reduction: %d dimensions, %d local work-items,"
		" %d elems processed by a single thread\n", n_vector_length, n_reduction_local_work_size,
		n_single_thread_elems);*/ // debug

	const size_t n_pass_num = (n_term_num + n_max_pass_size - 1) / n_max_pass_size;
	const size_t n_nominal_pass_size = n_term_num / n_pass_num;
	const size_t n_last_pass_size = n_term_num - (n_pass_num - 1) * n_nominal_pass_size;
	const size_t n_vpass_num = (n_term_num + n_max_vpass_size - 1) / n_max_vpass_size;
	const size_t n_nominal_vpass_size = n_term_num / n_vpass_num;
	const size_t n_last_vpass_size = n_term_num - (n_vpass_num - 1) * n_nominal_vpass_size;
	// decide number of passes (the last pass takes the division remainder,
	// so it can be a few vectors larger than the nominal one)

	//printf("have %d x %d passes\n", n_pass_num, n_pass_num);
	for(size_t n_vpass = 0, n_first_vector2 = 0; n_vpass < n_vpass_num; ++ n_vpass) {
		const size_t n_vpass_size = (n_vpass + 1 == n_vpass_num)? n_last_vpass_size : n_nominal_vpass_size;
		// vertical passes (rows of tiles)

		for(size_t n_pass = 0, n_first_vector = 0; n_pass < n_pass_num; ++ n_pass) {
			const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;
			// horizontal passes (tiles in the current row of tiles)

			if(n_first_vector2 >= n_first_vector + n_pass_size) {
				n_first_vector += n_pass_size;
				continue;
			}
			// would be in the lower diagonal (the first row of the tile is at or below
			// the index one past its last column, so all its entries have column < row)

			size_t n_local_work_size = n_reduction_local_work_size;
			size_t n_global_work_size = min(size_t(n_SM_overload * n_SM_num),
				n_Make_POT(n_vpass_size * n_pass_size)) * n_local_work_size;
			// changes how many groups are scheduled; probably several times GPU size
			// is good (8x or 16x; 32x doesn't add any more gains)

			/*printf("correlation tile (%d - %d) * (%d - %d)\n",
				n_first_vector, n_first_vector + n_pass_size,
				n_first_vector2, n_first_vector2 + n_vpass_size); // debug
			size_t n_work_group_num = n_global_work_size / n_local_work_size;
			printf("pass(local work size: %d, work group num: %d, pass size: %d, kernel iterations: %d)\n",
				n_local_work_size, n_work_group_num, n_pass_size,
				(n_pass_size + n_work_group_num - 1) / n_work_group_num);*/ // debug

			clSetKernelArgs10(h_correl_kernel_v4_upper, dp_vector_correlations,
				int(n_first_vector + n_first_vector2 * n_term_num_POT), int(n_term_num_POT), // first (points to a scanline in the matrix), stride
				dp_vectors, int(n_first_vector), int(n_vector_length), int(n_first_vector + n_pass_size),
				int(n_first_vector2), int(n_first_vector2 + n_vpass_size),
				CCLLocalMem(n_vector_length * m_n_scalar_type_size/*sizeof(float)*/)); // the size must match type in shader
			// set kernel parameters

			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_correl_kernel_v4_upper, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 8 (gs %lu, ls %lu, result %d)\n",
					static_cast<unsigned long>(n_global_work_size),
					static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
				return false;
			}
			// launch kernel

			n_first_vector += n_pass_size;
			//break; // debug - do only the first tile
		}

		n_first_vector2 += n_vpass_size;
		//break; // debug - do only the first tile
	}
	// calculate correlations of all the vectors (works in tiles, last block row and block
	// column is possibly different from the others)

	return true;
}

/*
 *	Fills the result by launching h_correl_kernel_v4_upper_pack once per tile, skipping the tiles
 *	that lie entirely below the diagonal. Unlike the unpacked variant, the position of a tile
 *	in the destination buffer is obtained from n_TriangularOffset() (presumably an offset into
 *	packed upper-triangular storage - confirm against its definition).
 *
 *	h_cmd_queue is the command queue the kernels are enqueued to (nothing is waited for here),
 *	n_vector_length is number of elements of each vector, n_vector_num is number of vectors,
 *	n_matrix_stride is the matrix size passed to n_TriangularOffset() and to the kernel as stride
 *	(must not be smaller than n_vector_num), dp_crosscorel_triu_matrix is the destination buffer,
 *	dp_vectors is the buffer with the vectors (presumably stored one after another, n_vector_length
 *	elements each - confirm against the kernel), n_max_pass_size / n_max_vpass_size limit the tile
 *	width / height in vectors and n_SM_overload is the number of work-groups scheduled
 *	per multiprocessor.
 *
 *	Returns true on success (also if there are no vectors), false if the program did not compile,
 *	if a vector does not fit in local memory, if a tile size limit is zero or if a launch fails.
 */
bool CCLVectorKernels::Calculate_VectorsCrossCorelation_UpperPacked_Cached(cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num, size_t n_matrix_stride,
	cl_mem dp_crosscorel_triu_matrix, const cl_mem dp_vectors,
	size_t n_max_pass_size /*= 1 << 14*/, size_t n_max_vpass_size /*= 1 << 14*/, size_t n_SM_overload /*= 8*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	if((n_reduction_local_work_size + n_vector_length) * m_n_scalar_type_size/*sizeof(float)*/ > n_local_memory_size)
		return false;
	// the vectors are too big, can not be cached - use the uncached variant

	cl_mem dp_vector_correlations = dp_crosscorel_triu_matrix; // alias
	const size_t n_term_num = n_vector_num; // alias
	const size_t n_term_num_POT = n_matrix_stride;
	_ASSERTE(n_term_num_POT >= n_term_num);

	if(!n_max_pass_size || !n_max_vpass_size)
		return false;
	// a zero tile size limit is an invalid argument (and would divide by zero below)

	if(!n_term_num)
		return true;
	// no vectors, no tiles; the pass count would be zero and the division below would fail

	/*size_t n_single_thread_elems = max(size_t(1),
		(n_vector_length + n_reduction_local_work_size - 1) / n_reduction_local_work_size); // just debug
	printf("parallel multi-reduction: %d dimensions, %d local work-items,"
		" %d elems processed by a single thread\n", n_vector_length, n_reduction_local_work_size,
		n_single_thread_elems);*/ // debug

	const size_t n_pass_num = (n_term_num + n_max_pass_size - 1) / n_max_pass_size;
	const size_t n_nominal_pass_size = n_term_num / n_pass_num;
	const size_t n_last_pass_size = n_term_num - (n_pass_num - 1) * n_nominal_pass_size;
	const size_t n_vpass_num = (n_term_num + n_max_vpass_size - 1) / n_max_vpass_size;
	const size_t n_nominal_vpass_size = n_term_num / n_vpass_num;
	const size_t n_last_vpass_size = n_term_num - (n_vpass_num - 1) * n_nominal_vpass_size;
	// decide number of passes (the last pass takes the division remainder,
	// so it can be a few vectors larger than the nominal one)

	//printf("have %d x %d passes\n", n_pass_num, n_pass_num);
	for(size_t n_vpass = 0, n_first_vector2 = 0; n_vpass < n_vpass_num; ++ n_vpass) {
		const size_t n_vpass_size = (n_vpass + 1 == n_vpass_num)? n_last_vpass_size : n_nominal_vpass_size;
		// vertical passes (rows of tiles)

		for(size_t n_pass = 0, n_first_vector = 0; n_pass < n_pass_num; ++ n_pass) {
			const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;
			// horizontal passes (tiles in the current row of tiles)

			if(n_first_vector2 >= n_first_vector + n_pass_size) {
				n_first_vector += n_pass_size;
				continue;
			}
			// would be in the lower diagonal (the first row of the tile is at or below
			// the index one past its last column, so all its entries have column < row)

			size_t n_local_work_size = n_reduction_local_work_size;
			size_t n_global_work_size = min(size_t(n_SM_overload * n_SM_num),
				n_Make_POT(n_vpass_size * n_pass_size)) * n_local_work_size;
			// changes how many groups are scheduled; probably several times GPU size
			// is good (8x or 16x; 32x doesn't add any more gains)

			/*printf("correlation tile (%d - %d) * (%d - %d)\n",
				n_first_vector, n_first_vector + n_pass_size,
				n_first_vector2, n_first_vector2 + n_vpass_size); // debug
			size_t n_work_group_num = n_global_work_size / n_local_work_size;
			printf("pass(local work size: %d, work group num: %d, pass size: %d, kernel iterations: %d)\n",
				n_local_work_size, n_work_group_num, n_pass_size,
				(n_pass_size + n_work_group_num - 1) / n_work_group_num);*/ // debug

			clSetKernelArgs10(h_correl_kernel_v4_upper_pack, dp_vector_correlations,
				int(n_TriangularOffset(n_first_vector2, n_first_vector, n_term_num_POT)), int(n_term_num_POT), // first (points to a scanline in the matrix), stride
				dp_vectors, int(n_first_vector), int(n_vector_length), int(n_first_vector + n_pass_size),
				int(n_first_vector2), int(n_first_vector2 + n_vpass_size),
				CCLLocalMem(n_vector_length * m_n_scalar_type_size/*sizeof(float)*/)); // the size must match type in shader
			// set kernel parameters

			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_correl_kernel_v4_upper_pack, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 8 (gs %lu, ls %lu, result %d)\n",
					static_cast<unsigned long>(n_global_work_size),
					static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
				return false;
			}
			// launch kernel

			n_first_vector += n_pass_size;
			//break; // debug - do only the first tile
		}

		n_first_vector2 += n_vpass_size;
		//break; // debug - do only the first tile
	}
	// calculate correlations of all the vectors (works in tiles, last block row and block
	// column is possibly different from the others)

	return true;
}

/*
 *	Runs the scaling kernel (h_scale_kernel_v2_const_pot or h_scale_kernel_v2_const_npot, depending
 *	on whether n_vector_length is a power of two) over all the elements of all the vectors,
 *	reading from dp_vectors and dp_vector_lengths and writing to dp_vectors_dst. The work is split
 *	to passes of at most n_max_pass_size elements; a pass can begin and end in the middle of a vector.
 *
 *	If all the per-vector factors fit in n_const_memory_size bytes, dp_vector_lengths is passed
 *	to the kernel directly. Otherwise a temporary buffer of n_const_memory_size bytes is allocated
 *	in h_context and the factors of the vectors touched by each pass are copied to it before
 *	the pass is launched (this presumably relies on h_cmd_queue executing the commands in order,
 *	as the same buffer is reused by all the passes - confirm against how the queue is created).
 *
 *	h_context is the context to allocate the temporary buffer in, h_cmd_queue is the command queue
 *	(nothing is waited for here), n_vector_length is number of elements of each vector, n_vector_num
 *	is number of vectors, dp_vectors_dst is the destination buffer, dp_vectors is the source buffer,
 *	dp_vector_lengths holds one factor per vector (m_n_factor_type bytes each) and n_max_pass_size
 *	is the maximal number of vector elements processed by a single kernel launch.
 *
 *	Returns true on success (also if there is nothing to scale), false if the program did not
 *	compile, if the pass size limit ends up being zero or if any OpenCL call fails.
 */
bool CCLVectorKernels::Calculate_ScaledVectors(cl_context h_context, cl_command_queue h_cmd_queue,
	size_t n_vector_length, size_t n_vector_num,
	cl_mem dp_vectors_dst, const cl_mem dp_vectors, const cl_mem dp_vector_lengths,
	size_t n_max_pass_size /*= 1 << 24*/) const
{
	if(!m_b_status)
		return false;
	// if it did not compile, do nothing

	const size_t n_term_num = n_vector_num; // alias

	if(!n_term_num || !n_vector_length)
		return true;
	// there are no elements to scale (this would also lead to division by zero below)

	const size_t n_max_lengths_in_const_memory = n_const_memory_size / m_n_factor_type/*sizeof(float)*/;
	_ASSERTE(n_max_lengths_in_const_memory > 0);
	const size_t n_max_pass_size_limit = (n_max_lengths_in_const_memory >= SIZE_MAX / n_vector_length)? SIZE_MAX :
		n_Align_Up(max(size_t(1), n_max_lengths_in_const_memory - 1) * n_vector_length - 127, size_t(128));
	// calculate maximal pass size. it is limited by constant memory size and vector length
	// (one length is subtracted as a pass starting in the middle of a vector touches one more vector)

	n_max_pass_size = min(n_max_pass_size_limit, n_max_pass_size);
	// max # vector elements in one pass

	if(!n_max_pass_size)
		return false;
	// can't make any progress with empty passes (and would divide by zero below)

	/*printf("max lengths in const memory: %d, max pass size: %d (" PRIsizeB ")\n",
		n_max_lengths_in_const_memory, n_max_pass_size, PRIsizeBparams(n_max_pass_size));*/ // debug

	const uint64_t n_task_size = uint64_t(n_term_num) * n_vector_length;
	// size of the whole task (now we're working per element instead of per vector)
	// note that the operand is widened first so that the product does not overflow 32-bit size_t

	const uint64_t n_pass_num = (n_task_size + n_max_pass_size - 1) / n_max_pass_size;
	_ASSERTE(n_task_size / n_pass_num <= SIZE_MAX); // shouldn't overflow
	const size_t n_nominal_pass_size = (n_pass_num > 10)? n_max_pass_size : size_t(n_task_size / n_pass_num);
	_ASSERTE(n_task_size - (n_pass_num - 1) * n_nominal_pass_size <= SIZE_MAX); // shouldn't overflow
	const size_t n_last_pass_size = size_t(n_task_size - (n_pass_num - 1) * n_nominal_pass_size);
	_ASSERTE(n_nominal_pass_size * (n_pass_num - 1) + n_last_pass_size == n_task_size); // just make sure the math is right
	// decide number of passes

	const size_t n_lengths_size = n_term_num * m_n_factor_type/*sizeof(float)*/;
	const bool b_stage_lengths = n_lengths_size > n_const_memory_size;
	// see if the lengths need to be copied to a smaller buffer, pass by pass

	cl_mem dp_vector_lengths_const = 0; // todo - cache this!
	if(b_stage_lengths) {
		int n_alloc_result;
		dp_vector_lengths_const = clCreateBuffer(h_context, CL_MEM_READ_ONLY,
			n_const_memory_size, NULL, &n_alloc_result);
		if(n_alloc_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to alloc device buffer\n");
			return false;
		}
	}
	// alloc the staging buffer (released on every path below)

	const bool b_POT_length = b_Is_POT(n_vector_length);
	cl_kernel h_scale_kernel_v2 = (b_POT_length)?
		h_scale_kernel_v2_const_pot : h_scale_kernel_v2_const_npot;
	const size_t n_kernel_arg5 = (b_POT_length)? n_Log2(n_vector_length) : n_vector_length;
	// choose the right kernel (the POT one gets base 2 logarithm of the length instead of the length)

	/*printf("using %s version of kernel, arg5 is %d, vector length is %d\n",
		(b_POT_length)? "POT" : "NPOT", n_kernel_arg5, n_vector_length);*/ // debug

	const size_t n_local_work_size = 128;
	size_t n_elems_per_thread = 1;
	while((n_nominal_pass_size / n_local_work_size) / n_elems_per_thread > 32)
		n_elems_per_thread *= 2;
	// determine how much elements per thread (depends only on the nominal
	// pass size so it is the same for all the passes)

	for(uint64_t n_pass = 0, n_first_elem = 0; n_pass < n_pass_num; ++ n_pass) {
		const size_t n_pass_size = (n_pass + 1 == n_pass_num)? n_last_pass_size : n_nominal_pass_size;

		/*printf("pass size: %d, local work size: %d, elements per thread: %d, thread blocks: %d\n",
			n_pass_size, n_local_work_size, n_elems_per_thread,
			(n_pass_size / n_local_work_size) / n_elems_per_thread);*/ // debug

		const size_t n_first_vector = size_t(n_first_elem / n_vector_length);
		const size_t n_last_vector = size_t((n_first_elem + n_pass_size - 1) / n_vector_length + 1);
		_ASSERTE(n_last_vector - n_first_vector <= n_max_lengths_in_const_memory);
		// range of vectors touched by this pass (the last one is exclusive)

		/*printf("pass size: %d, first vector: %d, last vector: %d, cmem usage: " PRIsizeB "B\n",
			n_pass_size, n_first_vector, n_last_vector,
			PRIsizeBparams((n_last_vector - n_first_vector) * m_n_factor_type + 0 * sizeof(float)));*/ // debug

		size_t n_first_vector_arg;
		cl_mem dp_vector_lengths_arg;
		if(b_stage_lengths) {
			int n_copy_result = clEnqueueCopyBuffer(h_cmd_queue, dp_vector_lengths,
				dp_vector_lengths_const, n_first_vector * m_n_factor_type/*sizeof(float)*/, 0,
				(n_last_vector - n_first_vector) * m_n_factor_type/*sizeof(float)*/, 0, NULL, NULL);
			// the size must be in the same units as the offset (this used sizeof(float)
			// before, which only works if the factor type is four bytes long)

			if(n_copy_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to copy data to constant memory (%d)\n", n_copy_result);
				clReleaseMemObject(dp_vector_lengths_const);
				return false;
			}
			dp_vector_lengths_arg = dp_vector_lengths_const;
			n_first_vector_arg = n_first_vector; // memory starts with this vector
		} else {
			dp_vector_lengths_arg = dp_vector_lengths; // use the original array without copying, it fits in const memory
			n_first_vector_arg = 0; // memory starts with the first vector
		}
		// copy lengths to constant memory

		size_t n_global_work_size = n_Align_Up(n_pass_size / n_elems_per_thread, n_local_work_size);

		_ASSERTE(n_first_elem <= INT_MAX);
		_ASSERTE(n_first_elem <= INT_MAX - n_pass_size);
		_ASSERTE(n_first_vector_arg <= INT_MAX); // ...
		clSetKernelArgs7(h_scale_kernel_v2, dp_vector_lengths_arg, int(n_first_vector_arg),
			dp_vectors_dst, dp_vectors, int(n_first_elem), int(n_kernel_arg5),
			int(n_first_elem + n_pass_size));
		// set kernel parameters (the kernel takes the offsets as int, hence the checks above)

		int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_scale_kernel_v2, 1,
			NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels 6 (gs %lu, ls %lu, result %d)\n",
				static_cast<unsigned long>(n_global_work_size),
				static_cast<unsigned long>(n_local_work_size), n_result); // size_t must not be printed using %d
			if(b_stage_lengths)
				clReleaseMemObject(dp_vector_lengths_const);
			return false;
		}
		// launch kernel

		n_first_elem += n_pass_size;
	}
	// multiply all the vectors by their inverse lengths

	if(b_stage_lengths)
		clReleaseMemObject(dp_vector_lengths_const);
	// scaling step finished, the staging buffer is not needed anymore

	return true;
}

/*
 *								=== ~CCLVectorKernels ===
 */
