/*
								+--------------------------------+
								|                                |
								| *** OCL global scan reduce *** |
								|                                |
								|  Copyright  -tHE SWINe- 2016  |
								|                                |
								|         ScanReducev3.h         |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __OPENCL_GLOBAL_SCAN_REDUCTION_INCLUDED
#define __OPENCL_GLOBAL_SCAN_REDUCTION_INCLUDED

/**
 *	@file gpgpu/ScanReducev3.h
 *	@date 2016
 *	@author -tHE SWINe-
 *	@brief autotuned OpenCL global scan and reduce primitives
 */

#include "TiledScanReduce.h"
#include "TempBuffer.h"

class CCLReduce {
protected:
	/**
	 *	@brief compile-time constants
	 */
	enum {
		autotune_Variant_Num = 3 /**< @brief number of kernel variants an autotune record can hold */
	};

	TKernelAutotuneInfo<autotune_Variant_Num> m_t_reduce_autotune; // autotune record selected in Compile(); maps input size to a kernel variant via n_Tuning()
	CCLReductionConfig m_config; // reduction operators and data type, see Set_ReduceOps() and Set_DataType()
	CCLTiled_ReduceScan_Impl m_p_tile_reduce[autotune_Variant_Num]; // tiled kernel specializations (one per tuning, only the first n_tuning_num are compiled)
	CCLTempBufferStack m_temp_buffers; // temp buffers for the recursion (storage for the partial reductions)
	size_t m_n_max_workgroup_num; // maximum resident workgroups on the device (written in Compile(); in the visible code it is only read by commented-out branches)

	static const TKernelAutotuneInfo<autotune_Variant_Num> m_p_reduce_autotune_info[4]; // 680, 780, k40 and generic (generic is the last entry and serves as the fallback)

public:
	/**
	 *	@brief constructor; binds the temporary buffer stack to an OpenCL context
	 *
	 *	@param[in] h_context is OpenCL context (Compile() asserts that it is
	 *		later called with the same context)
	 *
	 *	@note This does not compile any kernels, Compile() needs to be called
	 *		before enqueueing any work.
	 */
	CCLReduce(cl_context h_context)
		:m_temp_buffers(h_context), m_n_max_workgroup_num(0) // the workgroup limit is only known once Compile() sees the device; do not leave it indeterminate until then
	{}

	/**
	 *	@brief sets the operator strings used by the reduction
	 *
	 *	All the arguments are passed unchanged to CCLReductionConfig::Set_ReduceOps().
	 *
	 *	@param[in] p_s_elem_op is element operation (default "x")
	 *	@param[in] p_s_reduce_op is reduction operation (default "x+y")
	 *	@param[in] p_s_finalize_op is finalization operation (default "x")
	 *	@param[in] p_s_identity is identity value (default "0")
	 *
	 *	@return Returns the result of CCLReductionConfig::Set_ReduceOps()
	 *		(presumably true on success, false on failure).
	 */
	bool Set_ReduceOps(const char *p_s_elem_op = "x", const char *p_s_reduce_op = "x+y",
		const char *p_s_finalize_op = "x", const char *p_s_identity = "0")
	{
		const bool b_accepted = m_config.Set_ReduceOps(p_s_elem_op,
			p_s_reduce_op, p_s_finalize_op, p_s_identity);
		return b_accepted;
	}

	/**
	 *	@brief sets the data type of the reduced elements
	 *
	 *	@param[in] p_s_data_type is data type name, passed unchanged
	 *		to CCLReductionConfig::Set_DataType()
	 *
	 *	@return Returns the result of CCLReductionConfig::Set_DataType()
	 *		(presumably true on success, false on failure).
	 */
	bool Set_DataType(const char *p_s_data_type)
	{
		const bool b_accepted = m_config.Set_DataType(p_s_data_type);
		return b_accepted;
	}

	/**
	 *	@brief gets the current reduction configuration
	 *	@return Returns const reference to the configuration held by this object.
	 */
	const CCLReductionConfig &r_Configuration() const
	{
		const CCLReductionConfig &r_config = m_config;
		return r_config;
	}

	/**
	 *	@brief gets the status of the kernel variants in use
	 *	@return Returns true if each of the first n_tuning_num tiled
	 *		kernel objects reports a good status, otherwise returns false.
	 */
	bool b_Status() const
	{
		const size_t n_used_num = m_t_reduce_autotune.n_tuning_num;
		size_t n_variant = 0;
		while(n_variant < n_used_num && m_p_tile_reduce[n_variant].b_Status())
			++ n_variant;
		// advance over the variants that are ok, stop at the first bad one

		return n_variant == n_used_num; // all of them passed
	}

	/**
	 *	@brief selects the autotune record for the device and compiles the reduction kernels
	 *
	 *	The device class id is looked up in m_p_reduce_autotune_info (matching also on the
	 *	data type size); if no record matches, the last ("generic") record is used. Each
	 *	tuning of the selected record is then applied to one tiled kernel object and compiled.
	 *
	 *	@param[in] h_context is OpenCL context (must be the same context
	 *		this object was constructed with; checked by a debug assertion)
	 *	@param[in] h_device is target device (currently only supports single device)
	 *	@param[in] b_verbose is verbosity flag (passed to the tiled kernel compilation)
	 *	@param[in] b_compiler_verbose is compiler verbosity flag (passed to the tiled kernel compilation)
	 *	@param[in] b_use_nv_shuffle is NVIDIA shuffle flag (passed to the tiled kernel compilation)
	 *	@param[in] b_use_Harris_scan is Harris scan flag (passed to the tiled kernel compilation)
	 *	@param[in] n_max_SM_resident_workgroup_num is maximum number of workgroups that run
	 *		in a streaming multiprocessor concurrently (16 on kepler, 32 on maxwell and pascal,
	 *		at the same time only 64 warps can run, reducing the possible occupancy depending
	 *		on the workgroup size)
	 *
	 *	@return Returns true on success, false on failure (the device class id
	 *		could not be determined or one of the kernels failed to compile).
	 *
	 *	@note The original documentation states that once the kernels are compiled, this
	 *		function has no effect and always returns true; that would have to be implemented
	 *		by CCLTiled_ReduceScan_Impl::Compile(), it is not visible here.
	 *	@note It is not yet handled specially if two or more tunings of a record are the same
	 *		(the idea was to encode that using SIZE_MAX in the thresholds).
	 */
	bool Compile(cl_context h_context, cl_device_id h_device, bool b_verbose = false,
		bool b_compiler_verbose = false, bool b_use_nv_shuffle = true,
		bool b_use_Harris_scan = false, size_t n_max_SM_resident_workgroup_num = 16)
	{
		_ASSERTE(h_context == m_temp_buffers.h_Context());
		// the temp buffers live in this context, the kernels need to as well

		CCLDeviceParams dev(h_device);
		m_n_max_workgroup_num = dev.n_Multiprocessor_Num() * n_max_SM_resident_workgroup_num;
		// query the device and estimate how many workgroups it can hold at once

		std::string s_device_type;
		if(!CCLDeviceClassId::Get(s_device_type, dev))
			return false;
		// device class-id is the key to the autotune table

		const size_t n_record_num = sizeof(m_p_reduce_autotune_info) /
			sizeof(m_p_reduce_autotune_info[0]);
		const TKernelAutotuneInfo<autotune_Variant_Num> *p_generic =
			m_p_reduce_autotune_info + (n_record_num - 1);
		// the last record is the generic one; it is excluded from the search
		// so that std::find() returns it when nothing else matches

		m_t_reduce_autotune = *std::find(m_p_reduce_autotune_info, p_generic,
			std::make_pair(s_device_type.c_str(), m_config.n_DataType_Size()));
		// select the record for this device / data type size

		const size_t n_variant_num = m_t_reduce_autotune.n_tuning_num;
		for(size_t n_variant = 0; n_variant < n_variant_num; ++ n_variant) {
			CCLTiled_ReduceScan_Impl &r_kernel = m_p_tile_reduce[n_variant];

			r_kernel.Set_WorkGroupSize_TileSize(m_t_reduce_autotune.p_tuning[n_variant].first,
				m_t_reduce_autotune.p_tuning[n_variant].second);
			// (workgroup size, tile size) of this tuning

			if(!r_kernel.Compile(h_context, h_device, m_config, false, true, b_verbose,
			   b_compiler_verbose, b_use_nv_shuffle, b_use_Harris_scan))
				return false;
			// "false, true" presumably means no scan kernels, only reduce kernels - see CCLTiled_ReduceScan_Impl
		}
		// configure and build all the variants in use

		return true;
	}

	/**
	 *	@brief runs a timing and correctness benchmark of the global reduction
	 *
	 *	For each of ten input sizes (10k to 20M of 32-bit unsigned elements) this uploads
	 *	a random permutation of 0 .. n - 1, times repeated calls to Enqueue_Reduce(), prints
	 *	the throughput and compares the device result with std::accumulate() on the CPU.
	 *	The benchmark stops at the first size where the comparison fails.
	 *
	 *	@param[in] h_cmd_queue is command queue to run the benchmark in (needs to
	 *		belong to the context this object was constructed with)
	 *
	 *	@return Returns false if the kernels are not compiled, if a buffer could not be
	 *		allocated or if enqueueing / finishing the work failed; otherwise returns true.
	 *
	 *	@note Incorrect results are only reported in the printed summary,
	 *		the function still returns true in that case.
	 *	@note This assumes that the reduction is configured as a sum of 32-bit
	 *		unsigned integers, as that is what the CPU reference calculates.
	 */
	bool Benchmark(cl_command_queue h_cmd_queue)
	{
		if(!b_Status())
			return false;

		CCLContextWrapper context(m_temp_buffers.h_Context());
		CCLCommandQueueWrapper cmd_queue(h_cmd_queue);

		bool b_all_correct = true; // cleared by any failed test; must not be shadowed inside the loop
		const size_t p_size[] = {1000 * 10, 1000 * 50, 1000 * 100, 1000 * 200,
			1000 * 500, 1000000, 1000000 * 2, 1000000 * 5, 1000000 * 10, 1000000 * 20};
		const size_t n_size_num = sizeof(p_size) / sizeof(p_size[0]);

		for(size_t n_test = 0; n_test < n_size_num; ++ n_test) {
			size_t n = p_size[n_test];

			printf("preparing data ...\r");

			std::vector<uint32_t> scan_data(n);
			for(size_t i = 0; i < n; ++ i)
				scan_data[i] = (uint32_t)i;
			std::random_shuffle(scan_data.begin(), scan_data.end());
			// generate some data (a random permutation of 0 .. n - 1)

			CCLUniqueMem dp_data_buffer, dp_reductions_buffer;
			size_t n_buffer_size_elems = n;
			size_t n_reductions_elems = 1;
			if(!(dp_data_buffer = context.h_CreateBuffer(n_buffer_size_elems * sizeof(uint32_t))) ||
			   !(dp_reductions_buffer = context.h_CreateBuffer(n_reductions_elems * sizeof(uint32_t)))) {
				fprintf(stderr, "error: failed to alloc device buffer\n");
				return false;
			}
			// allocate memory

			printf("running global reduce test ...  \r");

			CTimer test_timer;
			double f_time = 0;
			int n_pass_num = 0;
			for(;;) {
				cmd_queue.n_Enqueue_Memcpy_HtoD(dp_data_buffer, 0,
					&scan_data[0], scan_data.size() * sizeof(uint32_t));
				cmd_queue.n_Enqueue_Memcpy_HtoD(dp_reductions_buffer, 0,
					&scan_data[0], n_reductions_elems * sizeof(uint32_t)); // overwrite the result of the previous pass as well
				cmd_queue.n_Finish();
				// prepare data (not a part of the measured time)

				double f_start_time = test_timer.f_Time();

				{
					CLresult n_result = Enqueue_Reduce(cmd_queue, dp_reductions_buffer, dp_data_buffer, n);
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: Enqueue_Reduce() failed with: %d (%s, %d)\n",
							n_result, __FILE__, __LINE__);
						return false;
					}
				}

				CLresult n_result = cmd_queue.n_Finish();
				if(n_result) {
					fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
					return false;
				}

				double f_pass_time = test_timer.f_Time() - f_start_time;
				f_time += f_pass_time;
				++ n_pass_num;

				if((f_time > .5f && n_pass_num > 10) || f_time > 4)
					break;
				// make sure the timing is stable, don't take too long at the same time
			}
			// run the thing

			f_time /= n_pass_num;
			size_t n_data = 1 * scan_data.size() * sizeof(uint32_t); // ignores writing tile reductions
			double f_GBps = n_data / f_time * 1e-9; // mGPU also uses 1e-9 rather than 1024^3
			printf("on " PRIsizeB "B, it took %f msec, reaching %f*1e9 B/s\n",
				PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps);
			// print results

			std::vector<uint32_t> reduce_cpu(n_reductions_elems);
			CTimer tcpu;
			reduce_cpu[0] = std::accumulate(scan_data.begin(), scan_data.begin() + n, uint32_t(0));
			printf("global reduce takes %f msec on CPU\n", tcpu.f_Time() * 1000);
			// perform a global reduce on CPU (the ground truth)

			bool b_test_correct = false; // result of this test only
			if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct, reduce_cpu.begin(),
			   reduce_cpu.end(), dp_reductions_buffer, 0, "global reduce") != cl_Success) {
				fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
				b_all_correct = false; // could not verify, do not claim success
				break;
			}
			// compare the device result with the reference

			if(!b_test_correct) {
				b_all_correct = false;
				break;
			}
			printf("done. global reduce of " PRIsize " items succeeded\n", n); // n is size_t, "%d" would be a mismatch
			// make sure it is reduced correctly
		}

		if(b_all_correct)
			printf("all tests finished correctly\n");
		else
			fprintf(stderr, "error: there were some errors\n");

		return true;
	}

	/**
	 *	@brief enqueues tile reduction using the kernel variant tuned for the given input size
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer to write the reductions to
	 *	@param[in] n_reduce_size is size of the reductions buffer (passed on unchanged; the
	 *		callers in this file pass the number of tiles, i.e. it is in elements)
	 *	@param[in] dp_data is buffer with the input data
	 *	@param[in] n_elem_num is number of input elements
	 *
	 *	@return Returns the kernel call object of the selected variant.
	 */
	inline CCLKernelCall Enqueue_TileReduce(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, size_t n_reduce_size, const cl_mem dp_data, size_t n_elem_num)
	{
		const size_t n_variant = m_t_reduce_autotune.n_Tuning(n_elem_num);
		// the autotune record maps the input size to a kernel variant

		return m_p_tile_reduce[n_variant].Enqueue_TileReduce(h_cmd_queue,
			dp_reduce, n_reduce_size, dp_data, n_elem_num);
	}

	/**
	 *	@brief enqueues tile reduction with workgroup reuse, using
	 *		the kernel variant tuned for the given input size
	 *
	 *	Each workgroup reduces n_tiles_per_workgroup consecutive
	 *	tiles and produces a single result.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer to write the reductions to
	 *	@param[in] n_reduce_size is size of the reductions buffer (passed on unchanged)
	 *	@param[in] dp_data is buffer with the input data
	 *	@param[in] n_elem_num is number of input elements
	 *	@param[in] n_tiles_per_workgroup is number of consecutive tiles reduced by each workgroup
	 *
	 *	@return Returns the kernel call object of the selected variant.
	 */
	inline CCLKernelCall Enqueue_TileReduce(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, size_t n_reduce_size, const cl_mem dp_data, size_t n_elem_num, size_t n_tiles_per_workgroup)
	{
		const size_t n_variant = m_t_reduce_autotune.n_Tuning(n_elem_num);
		// the autotune record maps the input size to a kernel variant

		return m_p_tile_reduce[n_variant].Enqueue_TileReduce(h_cmd_queue,
			dp_reduce, n_reduce_size, dp_data, n_elem_num, n_tiles_per_workgroup);
	}

	/**
	 *	@brief enqueues tile reduction without the finalization operation,
	 *		using the kernel variant tuned for the given input size
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer to write the reductions to
	 *	@param[in] n_reduce_size is size of the reductions buffer (passed on unchanged; the
	 *		callers in this file pass the number of tiles, i.e. it is in elements)
	 *	@param[in] dp_data is buffer with the input data
	 *	@param[in] n_elem_num is number of input elements
	 *
	 *	@return Returns the kernel call object of the selected variant.
	 */
	inline CCLKernelCall Enqueue_TileReduce_NoFinalOp(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, size_t n_reduce_size, const cl_mem dp_data, size_t n_elem_num)
	{
		const size_t n_variant = m_t_reduce_autotune.n_Tuning(n_elem_num);
		// the autotune record maps the input size to a kernel variant

		return m_p_tile_reduce[n_variant].Enqueue_TileReduce_NoFinalOp(h_cmd_queue,
			dp_reduce, n_reduce_size, dp_data, n_elem_num);
	}

	/**
	 *	@brief enqueues tile reduction without the finalization operation and with
	 *		workgroup reuse, using the kernel variant tuned for the given input size
	 *
	 *	Each workgroup reduces n_tiles_per_workgroup consecutive
	 *	tiles and produces a single result.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer to write the reductions to
	 *	@param[in] n_reduce_size is size of the reductions buffer (passed on unchanged)
	 *	@param[in] dp_data is buffer with the input data
	 *	@param[in] n_elem_num is number of input elements
	 *	@param[in] n_tiles_per_workgroup is number of consecutive tiles reduced by each workgroup
	 *
	 *	@return Returns the kernel call object of the selected variant.
	 */
	inline CCLKernelCall Enqueue_TileReduce_NoFinalOp(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, size_t n_reduce_size, const cl_mem dp_data, size_t n_elem_num, size_t n_tiles_per_workgroup)
	{
		const size_t n_variant = m_t_reduce_autotune.n_Tuning(n_elem_num);
		// the autotune record maps the input size to a kernel variant

		return m_p_tile_reduce[n_variant].Enqueue_TileReduce_NoFinalOp(h_cmd_queue,
			dp_reduce, n_reduce_size, dp_data, n_elem_num, n_tiles_per_workgroup);
	}

	/*static bool b_CheckTileRed_u32(cl_command_queue h_cmd_queue,
		const cl_mem dp_bred, const cl_mem dp_data, size_t n_elem_num, size_t n_tile_size)
	{
		size_t n_tile_num = (n_elem_num + n_tile_size - 1) / n_tile_size;
		CLresult n_result = (CLresult)clFinish(h_cmd_queue);
		std::vector<uint32_t> data(n_elem_num);
		n_result = (CLresult)clEnqueueReadBuffer(h_cmd_queue, dp_data, true, 0, n_elem_num * sizeof(uint32_t),
			(data.empty())? 0 : &data.front(), 0, 0, 0);
		std::vector<uint32_t> bred(n_tile_num);
		n_result = (CLresult)clEnqueueReadBuffer(h_cmd_queue, dp_bred, true, 0, n_tile_num * sizeof(uint32_t),
			(bred.empty())? 0 : &bred.front(), 0, 0, 0);
		for(size_t i = 0; i < n_tile_num; ++ i) {
			data[i] = std::accumulate(data.begin() + i * n_tile_size,
				data.begin() + min((i + 1) * n_tile_size, n_elem_num), uint32_t(0));
		}
		data.resize(n_tile_num);
		bool b_result = data == bred;
		if(!b_result) {
			size_t n_error_num = 0;
			for(size_t i = 0; i < n_tile_num; ++ i) {
				if(data[i] != bred[i]) {
					if(++ n_error_num < 100) {
						fprintf(stderr, "error: reduction[" PRIsize "] = %u (should be %u)\n",
							i, bred[i], data[i]);
					}
				}
			}
			fprintf(stderr, "error: there was " PRIsize " errors\n", n_error_num);
		}
		return b_result;
	}*/

	/**
	 *	@brief enqueues a global reduction which does not apply the finalization operation
	 *
	 *	If the input fits in a single tile, it is reduced directly to the destination.
	 *	Otherwise each tile is reduced to a partial result in a temporary buffer
	 *	and this function recurses on the partials.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer to write the result to (a single element)
	 *	@param[in] dp_data is buffer with the input data
	 *	@param[in] n_elem_num is number of input elements
	 *
	 *	@return Returns the result of the first failed enqueue or the result of the last one.
	 *
	 *	@note A workgroup-reuse variant (several tiles per workgroup, bounded by
	 *		m_n_max_workgroup_num) was tried here before and is currently disabled.
	 */
	inline CLresult Enqueue_Reduce_NoFinalOp(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, const cl_mem dp_data, size_t n_elem_num)
	{
		const size_t n_variant = m_t_reduce_autotune.n_Tuning(n_elem_num);
		const size_t n_tile_size = m_p_tile_reduce[n_variant].n_Tile_Size();
		const size_t n_partial_num = (n_elem_num + n_tile_size - 1) / n_tile_size;
		// number of tiles (rounded up), each tile yields one partial result

		if(n_partial_num <= 1)
			return Enqueue_TileReduce_NoFinalOp(h_cmd_queue, dp_reduce, 1, dp_data, n_elem_num);
		// at most a single tile; reduce directly

		const size_t n_temp_bytes = n_partial_num * m_config.n_DataType_Size();
		CCLTempBufferReservation dp_temp(n_temp_bytes, m_temp_buffers);
		// alloc temp buffer for the partials (reserved until this function returns)

		CLresult n_result = Enqueue_TileReduce_NoFinalOp(h_cmd_queue, dp_temp,
			n_partial_num, dp_data, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// reduce the tiles to partials

		return Enqueue_Reduce_NoFinalOp(h_cmd_queue, dp_reduce, dp_temp, n_partial_num);
		// recurse to reduce the partials
	}

	/**
	 *	@brief enqueues a global reduction, including the finalization operation
	 *
	 *	If the input fits in a single tile, it is reduced directly to the destination
	 *	by the finalizing kernel. Otherwise each tile is reduced (without finalization)
	 *	to a partial result in a temporary buffer and this function recurses on the
	 *	partials, so that only the last level finalizes.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_reduce is buffer to write the result to (a single element)
	 *	@param[in] dp_data is buffer with the input data
	 *	@param[in] n_elem_num is number of input elements
	 *
	 *	@return Returns the result of the first failed enqueue or the result of the last one.
	 *
	 *	@note This has two known flaws:
	 *		1) the temp buffer reservation is held till the end of the recursion, although
	 *		   it is only needed till the second reduce ends. A simple ping-pong buffer would
	 *		   do, but there is no way to do that using CCLTempBufferReservation; solving it
	 *		   would involve CPU-GPU synchronization, which is not worth it.
	 *		2) larger inputs require more and more recursions. One could impose a limit on
	 *		   the number of recursions and adapt the workgroup reuse pattern (a variant
	 *		   bounded by m_n_max_workgroup_num was tried and is currently disabled). That
	 *		   can yield some extra performance (but not more than 10GB/s and not on the
	 *		   largest sizes) but it is a bit unpredictable; it would have to be enabled via
	 *		   flags and made a part of the autotuning parameters.
	 */
	inline CLresult Enqueue_Reduce(cl_command_queue h_cmd_queue,
		cl_mem dp_reduce, const cl_mem dp_data, size_t n_elem_num)
	{
		const size_t n_variant = m_t_reduce_autotune.n_Tuning(n_elem_num);
		const size_t n_tile_size = m_p_tile_reduce[n_variant].n_Tile_Size();
		const size_t n_partial_num = (n_elem_num + n_tile_size - 1) / n_tile_size;
		// number of tiles (rounded up), each tile yields one partial result

		if(n_partial_num <= 1)
			return Enqueue_TileReduce(h_cmd_queue, dp_reduce, 1, dp_data, n_elem_num); // finalize!
		// at most a single tile; reduce directly

		const size_t n_temp_bytes = n_partial_num * m_config.n_DataType_Size();
		CCLTempBufferReservation dp_temp(n_temp_bytes, m_temp_buffers);
		// alloc temp buffer for the partials (reserved until this function returns)

		CLresult n_result = Enqueue_TileReduce_NoFinalOp(h_cmd_queue, dp_temp,
			n_partial_num, dp_data, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// reduce the tiles to partials, do not finalize those

		return Enqueue_Reduce(h_cmd_queue, dp_reduce, dp_temp, n_partial_num); // finalize!
		// recurse to reduce the partials
	}
};

// Autotune table of the global reduction. Each record lists the device class id and the
// data type size it applies to, three (input size threshold, tuning index) pairs, the number
// of tunings in use and the tunings themselves. CCLReduce::Compile() passes each tuning
// to Set_WorkGroupSize_TileSize() as (workgroup size, tile size). The last record must be
// the generic one, as Compile() falls back to it if no other record matches.
// NOTE(review): this is a non-inline definition in a header; if the header is included from
// more than one translation unit, this presumably yields duplicate symbols - confirm how it is used.
const TKernelAutotuneInfo<3> CCLReduce::m_p_reduce_autotune_info[] = {
	TKernelAutotuneInfo<3>("nv_3_5_tes", 4, // device and data type size
		1024, 0, 65536, 2, SIZE_MAX, 1, // configuration threshold / tuning index pairs
		3, std::make_pair(128, 512), std::make_pair(128, 1024), std::make_pair(256, 768)), // number of used tunings and the list of tunings

	TKernelAutotuneInfo<3>("nv_3_5_gf", 4, // device and data type size
		65536, 0, 131072, 1, SIZE_MAX, 2, // configuration threshold / tuning index pairs
		3, std::make_pair(128, 128), std::make_pair(128, 256), std::make_pair(128, 1024)), // number of used tunings and the list of tunings

	TKernelAutotuneInfo<3>("nv_3_0_gf", 4, // device and data type size
		4096, 1, 32768, 0, SIZE_MAX, 2, // configuration threshold / tuning index pairs
		3, std::make_pair(128, 256), std::make_pair(256, 2048), std::make_pair(256, 4096)), // number of used tunings and the list of tunings

	// generic comes last, serves as a sentinel (a single tuning, used for all input sizes)
	TKernelAutotuneInfo<3>("generic", 4, // device and data type size
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0, // configuration threshold / tuning index pairs
		1, std::make_pair(128, 512), std::make_pair(128, 512), std::make_pair(128, 512)) // number of used tunings and the list of tunings
};

class CCLScan {
protected:
	/**
	 *	@brief compile-time constants
	 */
	enum {
		autotune_Variant_Num = 3 /**< @brief number of kernel variants an autotune record can hold */
	};

	TKernelAutotuneInfo<autotune_Variant_Num> m_t_scan_autotune; // autotune record selected in Compile(); maps input size to a kernel variant via n_Tuning()
	CCLReductionConfig m_config; // scan operators and data type, see Set_ScanOps() and Set_DataType()
	CCLTiled_ReduceScan_Impl m_p_tile_scan[autotune_Variant_Num]; // tiled kernel specializations (one per tuning, only the first n_tuning_num are compiled)
	CCLTempBufferStack m_temp_buffers; // temp buffers for the recursion (storage for the spine of tile reductions)
	size_t m_n_max_workgroup_num; // maximum resident workgroups on the device (written in Compile(), not read anywhere in the visible code)

	static const TKernelAutotuneInfo<autotune_Variant_Num> m_p_scan_autotune_info[4]; // 680, 780, k40 and generic (generic is the last entry and serves as the fallback)

public:
	/**
	 *	@brief constructor; binds the temporary buffer stack to an OpenCL context
	 *
	 *	@param[in] h_context is OpenCL context (Compile() asserts that it is
	 *		later called with the same context)
	 *
	 *	@note This does not compile any kernels, Compile() needs to be called
	 *		before enqueueing any work.
	 */
	CCLScan(cl_context h_context)
		:m_temp_buffers(h_context), m_n_max_workgroup_num(0) // the workgroup limit is only known once Compile() sees the device; do not leave it indeterminate until then
	{}

	/**
	 *	@brief sets the operator strings used by the scan
	 *
	 *	All the arguments are passed unchanged to CCLReductionConfig::Set_ReduceOps()
	 *	(the scan shares the configuration class with the reduction).
	 *
	 *	@param[in] p_s_elem_op is element operation (default "x")
	 *	@param[in] p_s_reduce_op is reduction operation (default "x+y")
	 *	@param[in] p_s_finalize_op is finalization operation (default "x")
	 *	@param[in] p_s_identity is identity value (default "0")
	 *
	 *	@return Returns the result of CCLReductionConfig::Set_ReduceOps()
	 *		(presumably true on success, false on failure).
	 */
	bool Set_ScanOps(const char *p_s_elem_op = "x", const char *p_s_reduce_op = "x+y",
		const char *p_s_finalize_op = "x", const char *p_s_identity = "0")
	{
		const bool b_accepted = m_config.Set_ReduceOps(p_s_elem_op,
			p_s_reduce_op, p_s_finalize_op, p_s_identity);
		return b_accepted;
	}

	/**
	 *	@brief sets the data type of the scanned elements
	 *
	 *	@param[in] p_s_data_type is data type name, passed unchanged
	 *		to CCLReductionConfig::Set_DataType()
	 *
	 *	@return Returns the result of CCLReductionConfig::Set_DataType()
	 *		(presumably true on success, false on failure).
	 */
	bool Set_DataType(const char *p_s_data_type)
	{
		const bool b_accepted = m_config.Set_DataType(p_s_data_type);
		return b_accepted;
	}

	/**
	 *	@brief gets the current scan configuration
	 *	@return Returns const reference to the configuration held by this object.
	 */
	const CCLReductionConfig &r_Configuration() const
	{
		const CCLReductionConfig &r_config = m_config;
		return r_config;
	}

	/**
	 *	@brief gets the status of the kernel variants in use
	 *	@return Returns true if each of the first n_tuning_num tiled
	 *		kernel objects reports a good status, otherwise returns false.
	 */
	bool b_Status() const
	{
		const size_t n_used_num = m_t_scan_autotune.n_tuning_num;
		size_t n_variant = 0;
		while(n_variant < n_used_num && m_p_tile_scan[n_variant].b_Status())
			++ n_variant;
		// advance over the variants that are ok, stop at the first bad one

		return n_variant == n_used_num; // all of them passed
	}

	/**
	 *	@brief selects the autotune record for the device and compiles the scan kernels
	 *
	 *	The device class id is looked up in m_p_scan_autotune_info (matching also on the
	 *	data type size); if no record matches, the last ("generic") record is used. Each
	 *	tuning of the selected record is then applied to one tiled kernel object and compiled.
	 *
	 *	@param[in] h_context is OpenCL context (must be the same context
	 *		this object was constructed with; checked by a debug assertion)
	 *	@param[in] h_device is target device (currently only supports single device)
	 *	@param[in] b_verbose is verbosity flag (passed to the tiled kernel compilation)
	 *	@param[in] b_compiler_verbose is compiler verbosity flag (passed to the tiled kernel compilation)
	 *	@param[in] b_use_nv_shuffle is NVIDIA shuffle flag (passed to the tiled kernel compilation)
	 *	@param[in] b_use_Harris_scan is Harris scan flag (passed to the tiled kernel compilation)
	 *	@param[in] n_max_SM_resident_workgroup_num is maximum number of workgroups that run
	 *		in a streaming multiprocessor concurrently (16 on kepler, 32 on maxwell and pascal,
	 *		at the same time only 64 warps can run, reducing the possible occupancy depending
	 *		on the workgroup size)
	 *
	 *	@return Returns true on success, false on failure (the device class id
	 *		could not be determined or one of the kernels failed to compile).
	 *
	 *	@note The original documentation states that once the kernels are compiled, this
	 *		function has no effect and always returns true; that would have to be implemented
	 *		by CCLTiled_ReduceScan_Impl::Compile(), it is not visible here.
	 *	@note It is not yet handled specially if two or more tunings of a record are the same
	 *		(the idea was to encode that using SIZE_MAX in the thresholds).
	 */
	bool Compile(cl_context h_context, cl_device_id h_device, bool b_verbose = false,
		bool b_compiler_verbose = false, bool b_use_nv_shuffle = true,
		bool b_use_Harris_scan = false, size_t n_max_SM_resident_workgroup_num = 16)
	{
		_ASSERTE(h_context == m_temp_buffers.h_Context());
		// the temp buffers live in this context, the kernels need to as well

		CCLDeviceParams dev(h_device);
		m_n_max_workgroup_num = dev.n_Multiprocessor_Num() * n_max_SM_resident_workgroup_num;
		// query the device and estimate how many workgroups it can hold at once

		std::string s_device_type;
		if(!CCLDeviceClassId::Get(s_device_type, dev))
			return false;
		// device class-id is the key to the autotune table

		const size_t n_record_num = sizeof(m_p_scan_autotune_info) /
			sizeof(m_p_scan_autotune_info[0]);
		const TKernelAutotuneInfo<autotune_Variant_Num> *p_generic =
			m_p_scan_autotune_info + (n_record_num - 1);
		// the last record is the generic one; it is excluded from the search
		// so that std::find() returns it when nothing else matches

		m_t_scan_autotune = *std::find(m_p_scan_autotune_info, p_generic,
			std::make_pair(s_device_type.c_str(), m_config.n_DataType_Size()));
		// select the record for this device / data type size

		const size_t n_variant_num = m_t_scan_autotune.n_tuning_num;
		for(size_t n_variant = 0; n_variant < n_variant_num; ++ n_variant) {
			CCLTiled_ReduceScan_Impl &r_kernel = m_p_tile_scan[n_variant];

			r_kernel.Set_WorkGroupSize_TileSize(m_t_scan_autotune.p_tuning[n_variant].first,
				m_t_scan_autotune.p_tuning[n_variant].second);
			// (workgroup size, tile size) of this tuning

			if(!r_kernel.Compile(h_context, h_device, m_config, true, true, b_verbose, // need reduce as well, to match scan tile size :( should separate into two classes then!
			   b_compiler_verbose, b_use_nv_shuffle, b_use_Harris_scan))
				return false;
		}
		// configure and build all the variants in use

		return true;
	}

	/**
	 *	@brief runs a timing and correctness benchmark of the global scan
	 *
	 *	This makes two passes, the first one for the exclusive and the second one for the
	 *	inclusive scan. In each pass, for each of ten input sizes (10k to 20M of 32-bit
	 *	unsigned elements) this uploads a random permutation of 0 .. n - 1, times repeated
	 *	calls to Enqueue_ExScan() or Enqueue_InScan(), prints the throughput and compares
	 *	the device result with a scan calculated on the CPU. A pass stops at the first
	 *	size where the comparison fails (the other pass still runs).
	 *
	 *	@param[in] h_cmd_queue is command queue to run the benchmark in (needs to
	 *		belong to the context this object was constructed with)
	 *
	 *	@return Returns false if the kernels are not compiled, if a buffer could not be
	 *		allocated or if enqueueing / finishing the work failed; otherwise returns true.
	 *
	 *	@note Incorrect results are only reported in the printed summary,
	 *		the function still returns true in that case.
	 *	@note This assumes that the scan is configured as a prefix sum of 32-bit
	 *		unsigned integers, as that is what the CPU reference calculates.
	 */
	bool Benchmark(cl_command_queue h_cmd_queue)
	{
		if(!b_Status())
			return false;

		CCLContextWrapper context(m_temp_buffers.h_Context());
		CCLCommandQueueWrapper cmd_queue(h_cmd_queue);

		bool b_all_correct = true; // cleared by any failed test; must not be shadowed inside the loops
		const size_t p_size[] = {1000 * 10, 1000 * 50, 1000 * 100, 1000 * 200,
			1000 * 500, 1000000, 1000000 * 2, 1000000 * 5, 1000000 * 10, 1000000 * 20};
		const size_t n_size_num = sizeof(p_size) / sizeof(p_size[0]);

		for(int n_pass = 0; n_pass < 2; ++ n_pass) {
			const bool b_inclusive = n_pass > 0;
			for(size_t n_test = 0; n_test < n_size_num; ++ n_test) {
				size_t n = p_size[n_test];

				printf("preparing data ...\r");

				std::vector<uint32_t> scan_data(n);
				for(size_t i = 0; i < n; ++ i)
					scan_data[i] = (uint32_t)i;
				std::random_shuffle(scan_data.begin(), scan_data.end());
				// generate some data (a random permutation of 0 .. n - 1)

				CCLUniqueMem dp_data_buffer, dp_scan_buffer;
				if(!(dp_data_buffer = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_scan_buffer = context.h_CreateBuffer(n * sizeof(uint32_t)))) {
					fprintf(stderr, "error: failed to alloc device buffer\n");
					return false;
				}
				// allocate memory

				printf("running global %s-scan test ...  \r", (b_inclusive)? "in" : "ex");

				CTimer test_timer;
				double f_time = 0;
				int n_pass_num = 0;
				for(;;) {
					cmd_queue.n_Enqueue_Memcpy_HtoD(dp_data_buffer, 0,
						&scan_data[0], scan_data.size() * sizeof(uint32_t));
					cmd_queue.n_Enqueue_Memcpy_HtoD(dp_scan_buffer, 0,
						&scan_data[0], scan_data.size() * sizeof(uint32_t)); // overwrite the result of the previous pass as well
					cmd_queue.n_Finish();
					// prepare data (not a part of the measured time)

					double f_start_time = test_timer.f_Time();

					if(b_inclusive) {
						CLresult n_result = Enqueue_InScan(cmd_queue, dp_scan_buffer, dp_data_buffer, n);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: Enqueue_InScan() failed with: %d (%s, %d)\n",
								n_result, __FILE__, __LINE__);
							return false;
						}
					} else {
						CLresult n_result = Enqueue_ExScan(cmd_queue, dp_scan_buffer, dp_data_buffer, n);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: Enqueue_ExScan() failed with: %d (%s, %d)\n",
								n_result, __FILE__, __LINE__);
							return false;
						}
					}

					CLresult n_result = cmd_queue.n_Finish();
					if(n_result) {
						fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
						return false;
					}

					double f_pass_time = test_timer.f_Time() - f_start_time;
					f_time += f_pass_time;
					++ n_pass_num;

					if((f_time > .5f && n_pass_num > 10) || f_time > 4)
						break;
					// make sure the timing is stable, don't take too long at the same time
				}
				// run the thing

				f_time /= n_pass_num;
				size_t n_data = 3 * scan_data.size() * sizeof(uint32_t); // read to tile reduce, read again to tile scan with spine scan, write the scan
				double f_GBps = n_data / f_time * 1e-9; // mGPU also uses 1e-9 rather than 1024^3
				printf("on " PRIsizeB "B, it took %f msec, reaching %f*1e9 B/s (old scan would be %f*1e9 B/s)\n",
					PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps, f_GBps / 3 * 4);
				// print results

				std::vector<uint32_t> scan_cpu(n);
				CTimer tcpu;
				if(b_inclusive)
					std::partial_sum(scan_data.begin(), scan_data.begin() + n, scan_cpu.begin());
				else
					stl_ut::ExclusiveScan(scan_data.begin(), scan_data.begin() + n, scan_cpu.begin());
				printf("global %s-scan takes %f msec on CPU\n", (b_inclusive)? "in" : "ex", tcpu.f_Time() * 1000);
				// perform a global scan on CPU (the ground truth)

				bool b_test_correct = false; // result of this test only
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct, scan_cpu.begin(),
				   scan_cpu.end(), dp_scan_buffer, 0, (b_inclusive)? "global in-scan" :
				   "global ex-scan") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_all_correct = false; // could not verify, do not claim success
					break;
				}
				// compare the device result with the reference

				if(!b_test_correct) {
					b_all_correct = false;
					break;
				}
				printf("done. global %s-scan of " PRIsize " items succeeded\n", (b_inclusive)? "in" : "ex", n);
				// make sure it is scanned correctly
			}
		}

		if(b_all_correct)
			printf("all tests finished correctly\n");
		else
			fprintf(stderr, "error: there were some errors\n");

		return true;
	}

	/**
	 *	@brief enqueues tile exclusive scan using the kernel variant tuned for the given input size
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_scan is buffer to write the scan to
	 *	@param[in] dp_data is buffer with the input data
	 *	@param[in] n_elem_num is number of input elements
	 *
	 *	@return Returns the kernel call object of the selected variant.
	 */
	inline CCLKernelCall Enqueue_TileExScan(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_data, size_t n_elem_num)
	{
		const size_t n_variant = m_t_scan_autotune.n_Tuning(n_elem_num);
		// the autotune record maps the input size to a kernel variant

		return m_p_tile_scan[n_variant].Enqueue_TileExScan(h_cmd_queue,
			dp_scan, dp_data, n_elem_num);
	}

	/**
	 *	@brief enqueues tile inclusive scan using the kernel variant tuned for the given input size
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_scan is buffer to write the scan to
	 *	@param[in] dp_data is buffer with the input data
	 *	@param[in] n_elem_num is number of input elements
	 *
	 *	@return Returns the kernel call object of the selected variant.
	 */
	inline CCLKernelCall Enqueue_TileInScan(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_data, size_t n_elem_num)
	{
		const size_t n_variant = m_t_scan_autotune.n_Tuning(n_elem_num);
		// the autotune record maps the input size to a kernel variant

		return m_p_tile_scan[n_variant].Enqueue_TileInScan(h_cmd_queue,
			dp_scan, dp_data, n_elem_num);
	}

	/*static bool b_CheckExScan_u32(cl_command_queue h_cmd_queue,
		const cl_mem dp_scan, const cl_mem dp_data, size_t n_elem_num)
	{
		CLresult n_result = (CLresult)clFinish(h_cmd_queue);
		std::vector<uint32_t> data(n_elem_num);
		n_result = (CLresult)clEnqueueReadBuffer(h_cmd_queue, dp_data, true, 0, n_elem_num * sizeof(uint32_t),
			(data.empty())? 0 : &data.front(), 0, 0, 0);
		std::vector<uint32_t> scan(n_elem_num);
		n_result = (CLresult)clEnqueueReadBuffer(h_cmd_queue, dp_scan, true, 0, n_elem_num * sizeof(uint32_t),
			(scan.empty())? 0 : &scan.front(), 0, 0, 0);
		stl_ut::ExclusiveScan(data.begin(), data.end());
		bool b_result = data == scan;
		if(!b_result) {
			size_t n_error_num = 0;
			for(size_t i = 0; i < n_elem_num; ++ i) {
				if(data[i] != scan[i]) {
					if(++ n_error_num < 100) {
						fprintf(stderr, "error: scan[" PRIsize "] = %u (should be %u)\n",
							i, scan[i], data[i]);
					}
				}
			}
			fprintf(stderr, "error: there was " PRIsize " errors\n", n_error_num);
		}
		return b_result;
	}*/

	/**
	 *	@brief enqueues a global inclusive scan
	 *
	 *	If the input fits in a single tile, it is scanned directly. Otherwise the tiles
	 *	are reduced to a spine, the spine is scanned exclusively (in place, by recursion
	 *	to Enqueue_ExScan()) and the data is scanned again in a downsweep which uses
	 *	the scanned spine as the starting values of the tiles.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_scan is buffer to write the scan to
	 *	@param[in] dp_data is buffer with the input data
	 *	@param[in] n_elem_num is number of input elements
	 *
	 *	@return Returns the result of the first failed enqueue or the result of the last one.
	 */
	inline CLresult Enqueue_InScan(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_data, size_t n_elem_num)
	{
		CCLTiled_ReduceScan_Impl &r_tile_scan = m_p_tile_scan[m_t_scan_autotune.n_Tuning(n_elem_num)];
		// the kernel variant tuned for this input size

		const size_t n_tile_size = r_tile_scan.n_Tile_Size();
		const size_t n_tile_num = (n_elem_num + n_tile_size - 1) / n_tile_size;
		// number of tiles (rounded up)

		if(n_tile_num <= 1)
			return r_tile_scan.Enqueue_TileInScan(h_cmd_queue, dp_scan, dp_data, n_elem_num);
		// scan directly as a single tile

		const size_t n_spine_bytes = n_tile_num * m_config.n_DataType_Size();
		CCLTempBufferReservation dp_spine(n_spine_bytes, m_temp_buffers);
		// alloc spine buffer (one element per tile, reserved until this function returns)

		CLresult n_result = r_tile_scan.Enqueue_TileReduce(h_cmd_queue,
			dp_spine, n_tile_num, dp_data, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// reduce the tiles to the spine

		n_result = Enqueue_ExScan(h_cmd_queue, dp_spine, dp_spine, n_tile_num); // exclusive !!
		if(n_result != cl_Success)
			return n_result;
		// recurse and scan the spine (inplace)

		return r_tile_scan.Enqueue_TileInScan_Downsweep(h_cmd_queue,
			dp_scan, dp_spine, dp_data, n_elem_num);
		// scan the data again and use the spine scan as a starting point
	}

	/**
	 *	@brief enqueues a global exclusive scan
	 *
	 *	If the input fits in a single tile, it is scanned directly. Otherwise the tiles
	 *	are reduced to a spine, the spine is scanned exclusively (in place, by recursion)
	 *	and the data is scanned again in a downsweep which uses the scanned spine
	 *	as the starting values of the tiles.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_scan is buffer to write the scan to
	 *	@param[in] dp_data is buffer with the input data (the recursion passes
	 *		the same buffer as dp_scan, so in-place operation is relied upon)
	 *	@param[in] n_elem_num is number of input elements
	 *
	 *	@return Returns the result of the first failed enqueue or the result of the last one.
	 */
	inline CLresult Enqueue_ExScan(cl_command_queue h_cmd_queue,
		cl_mem dp_scan, const cl_mem dp_data, size_t n_elem_num)
	{
		CCLTiled_ReduceScan_Impl &r_tile_scan = m_p_tile_scan[m_t_scan_autotune.n_Tuning(n_elem_num)];
		// the kernel variant tuned for this input size

		const size_t n_tile_size = r_tile_scan.n_Tile_Size();
		const size_t n_tile_num = (n_elem_num + n_tile_size - 1) / n_tile_size;
		// number of tiles (rounded up)

		if(n_tile_num <= 1)
			return r_tile_scan.Enqueue_TileExScan(h_cmd_queue, dp_scan, dp_data, n_elem_num);
		// scan directly as a single tile

		const size_t n_spine_bytes = n_tile_num * m_config.n_DataType_Size();
		CCLTempBufferReservation dp_spine(n_spine_bytes, m_temp_buffers);
		// alloc spine buffer (one element per tile, reserved until this function returns)

		CLresult n_result = r_tile_scan.Enqueue_TileReduce(h_cmd_queue,
			dp_spine, n_tile_num, dp_data, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// reduce the tiles to the spine

		n_result = Enqueue_ExScan(h_cmd_queue, dp_spine, dp_spine, n_tile_num); // exclusive !!
		if(n_result != cl_Success)
			return n_result;
		// recurse and scan the spine (inplace)

		return r_tile_scan.Enqueue_TileExScan_Downsweep(h_cmd_queue,
			dp_scan, dp_spine, dp_data, n_elem_num);
		// scan the data again and use the spine scan as a starting point
	}
};

/**
 *	@brief built-in autotune presets for CCLScan
 *
 *	Each active entry is constructed from, in positional order (as annotated on the entries):
 *	a device id string and a data type size, three configuration threshold / tuning index
 *	pairs, the number of used tunings and finally the list of the tunings themselves.
 *	The "generic" entry must remain the last one as it serves as a sentinel.
 *
 *	@note The thresholds are presumably element counts and each tuning index presumably
 *		selects one of the listed tunings; each std::make_pair likely holds a workgroup
 *		size and a tile size - none of this is visible here, confirm against
 *		TKernelAutotuneInfo.
 *	@note NOTE(review): this is a non-inline definition of a static data member in a header;
 *		including this header from more than one translation unit would likely yield
 *		duplicate symbol errors - confirm it is only included once, or move the
 *		definition to a source file.
 */
const TKernelAutotuneInfo<3> CCLScan::m_p_scan_autotune_info[] = {
	// disabled alternative tunings, kept for reference
	/*TKernelAutotuneInfo<3>("nv_3_5_tes", 4, // device and data type size
		65536, 0, 2 * 1048576, 1, SIZE_MAX, 2, // configuration threshold / tuning index pairs
		3, std::make_pair(128, 128), std::make_pair(128, 512), std::make_pair(256, 1024)), // number of used tunings and the list of tunings

	TKernelAutotuneInfo<3>("nv_3_5_gf", 4, // device and data type size
		1024, 2, 262144, 0, SIZE_MAX, 1, // configuration threshold / tuning index pairs
		3, std::make_pair(128, 256), std::make_pair(128, 512), std::make_pair(512, 4096)),*/ // number of used tunings and the list of tunings

	TKernelAutotuneInfo<3>("nv_3_0_gf", 4, // device and data type size
		8192/*32768*/, 2, 65536, 1, SIZE_MAX, 0, // configuration threshold / tuning index pairs
		3, std::make_pair(128, 512), std::make_pair(128, 256/*1024*/), std::make_pair(128/*256*/, 384/*256*/)), // number of used tunings and the list of tunings
	// tunings for the scan kernel

	TKernelAutotuneInfo<3>("nv_3_5_tes", 4, // device and data type size
		1024, 0, 1048576, 1, SIZE_MAX, 2, // configuration threshold / tuning index pairs
		3, std::make_pair(128, 128), std::make_pair(128, 512), std::make_pair(128, 1024)), // number of used tunings and the list of tunings

	TKernelAutotuneInfo<3>("nv_3_5_gf", 4, // device and data type size
		512, 2, 65536, 0, SIZE_MAX, 1, // configuration threshold / tuning index pairs
		3, std::make_pair(128, 512), std::make_pair(128, 1024), std::make_pair(512, 4096)), // number of used tunings and the list of tunings

	// disabled alternative tuning, kept for reference
	/*TKernelAutotuneInfo<3>("nv_3_0_gf", 4, // device and data type size // this is actually worse, except for 2M elements
		256, 0, 16384, 1, SIZE_MAX, 2, // configuration threshold / tuning index pairs
		3, std::make_pair(128, 128), std::make_pair(256, 256), std::make_pair(256, 1024)),*/ // number of used tunings and the list of tunings
	// tunings for tile reduce + scan downsweep together

	// generic comes last, serves as a sentinel
	TKernelAutotuneInfo<3>("generic", 4, // device and data type size
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0, // configuration threshold / tuning index pairs
		1, std::make_pair(128, 512), std::make_pair(128, 512), std::make_pair(128, 512)) // number of used tunings and the list of tunings
};

#endif // !__OPENCL_GLOBAL_SCAN_REDUCTION_INCLUDED
