/*
								+--------------------------------+
								|                                |
								| *** OCL seg. scan / reduce *** |
								|                                |
								|  Copyright  -tHE SWINe- 2016  |
								|                                |
								|        SegScanReduce.h         |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __OPENCL_GLOBAL_SEGMENTED_SCAN_REDUCTION_INCLUDED
#define __OPENCL_GLOBAL_SEGMENTED_SCAN_REDUCTION_INCLUDED

/**
 *	@file gpgpu/SegScanReduce.h
 *	@date 2016
 *	@author -tHE SWINe-
 *	@brief autotuned OpenCL global segmented scan and reduce primitives
 */

#include "ScanReducev3.h" // CCLScan needed for reduction output position computation
#include "TiledScanReduce.h"
#include "TempBuffer.h"

class CCLSegScanReduce {
protected:
	/**
	 *	@brief tuning parameters of a single kernel variant
	 *
	 *	The three values are handed over to Set_WorkGroupSize_TileSize()
	 *	of the tiled kernel implementation in Compile().
	 */
	struct TSegKernelTuning {
		int n_workgroup_size; /**< @brief workgroup size */
		int n_tile_size; /**< @brief tile size */
		bool b_strided_head_flags; /**< @brief strided head flags selector */

		/**
		 *	@brief default constructor; zero-initializes all the fields
		 *
		 *	@note The fields used to be left indeterminate here; unused slots of
		 *		a tuning table now hold well defined (zero / false) values.
		 */
		TSegKernelTuning()
			:n_workgroup_size(0), n_tile_size(0), b_strided_head_flags(false)
		{}

		/**
		 *	@brief constructor; fills the tuning
		 *
		 *	@param[in] _n_workgroup_size is workgroup size
		 *	@param[in] _n_tile_size is tile size
		 *	@param[in] _b_strided_head_flags is strided head flags selector
		 */
		TSegKernelTuning(int _n_workgroup_size, int _n_tile_size, bool _b_strided_head_flags)
			:n_workgroup_size(_n_workgroup_size), n_tile_size(_n_tile_size),
			b_strided_head_flags(_b_strided_head_flags)
		{}

		/**
		 *	@brief constructor; fills the tuning, taking the flag as an integer
		 *
		 *	@param[in] _n_workgroup_size is workgroup size
		 *	@param[in] _n_tile_size is tile size
		 *	@param[in] n_strided_head_flags is strided head flags selector
		 *		(any nonzero value means true)
		 *
		 *	@note This overload makes calls with an integer literal in
		 *		the last position unambiguous.
		 */
		TSegKernelTuning(int _n_workgroup_size, int _n_tile_size, int n_strided_head_flags)
			:n_workgroup_size(_n_workgroup_size), n_tile_size(_n_tile_size),
			b_strided_head_flags(n_strided_head_flags != 0)
		{}
	};

	/**
	 *	@brief autotuning constants
	 */
	enum {
		autotune_Variant_Num = 3 /**< @brief number of kernel variants per primitive; dimension of the kernel arrays below and the first template argument of TAutotuneInfo */
	};

	typedef TKernelAutotuneInfo<autotune_Variant_Num, TSegKernelTuning> TAutotuneInfo; /**< @brief autotune record type, parametrized by the number of variants and by the tuning structure */

protected:
	TAutotuneInfo m_t_segscan_autotune; /**< @brief segmented scan tunings chosen in Compile() (n_tuning_num is zero after construction) */
	TAutotuneInfo m_t_segreduce_autotune; /**< @brief segmented reduce tunings chosen in Compile() (n_tuning_num is zero after construction) */
	TAutotuneInfo m_t_segreduce_spineadj_autotune; /**< @brief segmented reduce spine adjust tunings chosen in Compile() (n_tuning_num is zero after construction) */
	CCLReductionConfig m_config; /**< @brief reduction operators and data type, set by Set_ReduceOps() and Set_DataType() */
	CCLTiled_SegmentedReduceScan_Impl m_p_tile_segscan[autotune_Variant_Num]; /**< @brief segmented scan kernels, one per tuning in m_t_segscan_autotune */
	CCLTiled_SegmentedReduceScan_Impl m_p_tile_segreduce[autotune_Variant_Num]; /**< @brief segmented reduce kernels, one per tuning in m_t_segreduce_autotune */
	CCLTiled_SegmentedReduceScan_Impl m_p_tile_segreduce_spineadj[autotune_Variant_Num]; /**< @brief spine adjust kernels, one per tuning in m_t_segreduce_spineadj_autotune */
	CCLTempBufferStack m_temp_buffers; /**< @brief temporary buffer stack bound to the context given to the constructor (original note: temp buffers for recursion) */

	// the tables below are defined outside of this header; Compile() searches all
	// the entries except for the last one and falls back to the last (generic) one
	static const TAutotuneInfo m_p_segscan_autotune_info[4]; /**< @brief segmented scan autotune table (680, 780, k40 and generic) */
	static const TAutotuneInfo m_p_segreduce_autotune_info[4]; /**< @brief segmented reduce autotune table (680, 780, k40 and generic) */
	static const TAutotuneInfo m_p_segreduce_spineadf_autotune_info[4]; /**< @brief segmented reduce spine adjust autotune table (680, 780, k40 and generic); note the "spineadf" spelling in the name */

public:
	/**
	 *	@brief constructor; binds the temporary buffer stack to an OpenCL context
	 *	@param[in] h_context is OpenCL context, passed on to the temporary buffer stack
	 *	@note No tunings are selected at this point, call Compile() to get the kernels.
	 */
	CCLSegScanReduce(cl_context h_context)
		:m_temp_buffers(h_context)
	{
		m_t_segreduce_spineadj_autotune.n_tuning_num = 0;
		m_t_segreduce_autotune.n_tuning_num = 0;
		m_t_segscan_autotune.n_tuning_num = 0;
		// no tunings (and hence no kernels) are available yet
	}

	/**
	 *	@brief sets the reduction operators in the kernel configuration
	 *
	 *	@param[in] p_s_elem_op is elementwise operation expression
	 *	@param[in] p_s_reduce_op is reduction operation expression
	 *	@param[in] p_s_identity is identity value expression
	 *
	 *	@return Returns the result of CCLReductionConfig::Set_ReduceOps().
	 *
	 *	@note The finalization operation is not exposed here, "x" is always used.
	 */
	bool Set_ReduceOps(const char *p_s_elem_op = "x", const char *p_s_reduce_op = "x+y",
		/*const char *p_s_finalize_op = "x",*/ const char *p_s_identity = "0")
	{
		const char *p_s_fixed_finalize_op = "x";
		// the finalize op is fixed

		return m_config.Set_ReduceOps(p_s_elem_op, p_s_reduce_op,
			p_s_fixed_finalize_op, p_s_identity);
	}

	/**
	 *	@brief sets the data type in the kernel configuration
	 *	@param[in] p_s_data_type is data type name, passed on to CCLReductionConfig::Set_DataType()
	 *	@return Returns the result of CCLReductionConfig::Set_DataType().
	 */
	bool Set_DataType(const char *p_s_data_type)
	{
		const bool b_accepted = m_config.Set_DataType(p_s_data_type);
		return b_accepted;
	}

	/**
	 *	@brief gets the current kernel configuration
	 *	@return Returns const reference to the configuration held by this object.
	 */
	const CCLReductionConfig &r_Configuration() const
	{
		const CCLReductionConfig &r_current_config = m_config;
		return r_current_config;
	}

	/**
	 *	@brief checks the status of all the kernels that have a tuning selected
	 *
	 *	@return Returns true if b_Status() of every segmented scan, segmented reduce
	 *		and spine adjust kernel with a selected tuning returns true, otherwise returns false.
	 *
	 *	@note This also returns true if no tunings are selected (e.g. before Compile()).
	 */
	bool b_Status() const
	{
		size_t n_variant = 0;
		while(n_variant < m_t_segscan_autotune.n_tuning_num) {
			if(!m_p_tile_segscan[n_variant].b_Status())
				return false;
			++ n_variant;
		}
		// segmented scan kernels

		n_variant = 0;
		while(n_variant < m_t_segreduce_autotune.n_tuning_num) {
			if(!m_p_tile_segreduce[n_variant].b_Status())
				return false;
			++ n_variant;
		}
		// segmented reduce kernels

		n_variant = 0;
		while(n_variant < m_t_segreduce_spineadj_autotune.n_tuning_num) {
			if(!m_p_tile_segreduce_spineadj[n_variant].b_Status())
				return false;
			++ n_variant;
		}
		// segmented reduce spine adjust kernels

		return true;
	}

	/**
	 *	@brief determines whether segmented scan tunings are selected
	 *	@return Returns true if there is at least one segmented scan tuning, otherwise returns false.
	 */
	bool b_Have_SegScan() const
	{
		const bool b_have_scan = m_t_segscan_autotune.n_tuning_num > 0;
		return b_have_scan;
	}

	/**
	 *	@brief determines whether segmented reduce tunings are selected
	 *	@return Returns true if there is at least one segmented reduce tuning and
	 *		at least one spine adjust tuning, otherwise returns false.
	 */
	bool b_Have_SegReduce() const
	{
		if(!(m_t_segreduce_autotune.n_tuning_num > 0))
			return false;
		// no reduce tunings

		return m_t_segreduce_spineadj_autotune.n_tuning_num > 0;
	}

	/**
	 *	@brief selects the autotune records for the device and compiles the kernels
	 *
	 *	@param[in] h_context is OpenCL context (must be the same context
	 *		as the one passed to the constructor; this is asserted)
	 *	@param[in] h_device is target device (currently only supports single device)
	 *	@param[in] b_want_scan is segmented scan flag (currently ignored, the segmented
	 *		scan kernels are always compiled; they are also used by
	 *		n_Enqueue_SegReduce_SpineAdjust())
	 *	@param[in] b_want_reduce is segmented reduce flag (set to compile the segmented
	 *		reduce and spine adjust kernels; if cleared, the corresponding tuning counts
	 *		are reset to zero)
	 *	@param[in] b_verbose is verbosity flag, passed on to the kernel Compile()
	 *	@param[in] b_compiler_verbose is compiler verbosity flag, passed on to the kernel Compile()
	 *	@param[in] b_use_nv_shuffle is NVIDIA shuffle flag, passed on to the kernel Compile()
	 *
	 *	@return Returns true on success, false on failure (the device class id could
	 *		not be obtained or one of the kernels failed to compile).
	 *
	 *	@note On failure, the autotune records that were assigned before the failing
	 *		step keep their new values.
	 *	@note NOTE(review): the earlier documentation claimed that once the kernels
	 *		are compiled, this function has no effect and always returns true. That depends
	 *		on CCLTiled_SegmentedReduceScan_Impl::Compile() and cannot be confirmed from here.
	 */
	bool Compile(cl_context h_context, cl_device_id h_device, bool b_want_scan = true,
		bool b_want_reduce = true, bool b_verbose = false,
		bool b_compiler_verbose = false, bool b_use_nv_shuffle = true)
	{
		_ASSERTE(h_context == m_temp_buffers.h_Context());
		// make sure this is the same context

		b_want_scan = true;
		// always compile scan (n_Enqueue_SegReduce_SpineAdjust() calls n_Enqueue_SegScan())

		CCLDeviceParams dev(h_device);
		// get device info

		//m_n_max_workgroup_num = dev.n_Multiprocessor_Num() * n_max_SM_resident_workgroup_num;

		std::string s_device_type;
		if(!CCLDeviceClassId::Get(s_device_type, dev))
			return false;
		// get device class-id used by autotune

		if(b_want_scan) {
			m_t_segscan_autotune = *std::find(m_p_segscan_autotune_info, m_p_segscan_autotune_info +
				(sizeof(m_p_segscan_autotune_info) / sizeof(m_p_segscan_autotune_info[0]) - 1), // the last one is generic in case nothing else matches
				std::make_pair(s_device_type.c_str(), m_config.n_DataType_Size()));
			// find autotune (the search range excludes the last entry, so that a failed
			// search returns an iterator to the last, generic entry, which is safe to dereference)

			for(size_t i = 0; i < m_t_segscan_autotune.n_tuning_num; ++ i) {
				m_p_tile_segscan[i].Set_WorkGroupSize_TileSize(
					m_t_segscan_autotune.p_tuning[i].n_workgroup_size,
					m_t_segscan_autotune.p_tuning[i].n_tile_size,
					m_t_segscan_autotune.p_tuning[i].b_strided_head_flags);
				if(!m_p_tile_segscan[i].Compile(h_context, h_device, m_config, true /*pack*/,
				   true /* scan */, false /* reduce */, false /* reduce spineadj */,
				   b_verbose, b_compiler_verbose, b_use_nv_shuffle))
					return false;
			}
			// configure and build one segmented scan kernel per tuning
		} else
			m_t_segscan_autotune.n_tuning_num = 0;
		if(b_want_reduce) {
			m_t_segreduce_autotune = *std::find(m_p_segreduce_autotune_info, m_p_segreduce_autotune_info +
				(sizeof(m_p_segreduce_autotune_info) / sizeof(m_p_segreduce_autotune_info[0]) - 1), // the last one is generic in case nothing else matches
				std::make_pair(s_device_type.c_str(), m_config.n_DataType_Size()));
			m_t_segreduce_spineadj_autotune = *std::find(m_p_segreduce_spineadf_autotune_info,
				m_p_segreduce_spineadf_autotune_info + (sizeof(m_p_segreduce_spineadf_autotune_info) /
				sizeof(m_p_segreduce_spineadf_autotune_info[0]) - 1), // the last one is generic in case nothing else matches
				std::make_pair(s_device_type.c_str(), m_config.n_DataType_Size()));
			// find autotune (same fallback to the last entry as above)

			for(size_t i = 0; i < m_t_segreduce_autotune.n_tuning_num; ++ i) {
				m_p_tile_segreduce[i].Set_WorkGroupSize_TileSize(
					m_t_segreduce_autotune.p_tuning[i].n_workgroup_size,
					m_t_segreduce_autotune.p_tuning[i].n_tile_size,
					m_t_segreduce_autotune.p_tuning[i].b_strided_head_flags);
				if(!m_p_tile_segreduce[i].Compile(h_context, h_device, m_config, true /*pack*/,
				   false /* scan */, true /* reduce */, false /* reduce spineadj */,
				   b_verbose, b_compiler_verbose, b_use_nv_shuffle))
					return false;
			}
			// configure and build one segmented reduce kernel per tuning

			for(size_t i = 0; i < m_t_segreduce_spineadj_autotune.n_tuning_num; ++ i) {
				m_p_tile_segreduce_spineadj[i].Set_WorkGroupSize_TileSize(
					m_t_segreduce_spineadj_autotune.p_tuning[i].n_workgroup_size,
					m_t_segreduce_spineadj_autotune.p_tuning[i].n_tile_size,
					m_t_segreduce_spineadj_autotune.p_tuning[i].b_strided_head_flags);
				if(!m_p_tile_segreduce_spineadj[i].Compile(h_context, h_device, m_config,
				   false /*pack*/, true /* scan */, false /* reduce */, true /* reduce spineadj */,
				   b_verbose, b_compiler_verbose, b_use_nv_shuffle))
					return false;
			}
			// configure and build one spine adjust kernel per tuning
			// (these are built with the scan and reduce spineadj options enabled)
		} else {
			m_t_segreduce_autotune.n_tuning_num = 0;
			m_t_segreduce_spineadj_autotune.n_tuning_num = 0;
		}
		// build all

		return true;
	}

	/**
	 *	@copydoc CCLTiled_SegmentedReduceScan_Impl::n_PackedHeadFlags_Size()
	 */
	size_t n_SegScan_Packed_HeadFlags_Size(size_t n_flag_num) const
	{
		return m_p_tile_segscan[m_t_segscan_autotune.n_Tuning(n_flag_num)].n_PackedHeadFlags_Size(n_flag_num);
	}

	/**
	 *	@copydoc CCLTiled_SegmentedReduceScan_Impl::n_PackedHeadFlags_Size()
	 */
	size_t n_SegReduce_Packed_HeadFlags_Size(size_t n_flag_num) const
	{
		return m_p_tile_segreduce[m_t_segreduce_autotune.n_Tuning(n_flag_num)].n_PackedHeadFlags_Size(n_flag_num);
	}

	/**
	 *	@brief enqueues head flags packing, using the segmented scan kernel variant
	 *		chosen by the autotune for n_flag_num
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_packed_head_flags is destination buffer for the packed head flags
	 *		(see n_SegScan_Packed_HeadFlags_Size() for the size)
	 *	@param[in] dp_head_flags is buffer with the head flags to pack
	 *	@param[in] n_flag_num is number of head flags
	 *
	 *	@return Returns the kernel call object of the selected kernel variant.
	 */
	CCLKernelCall Enqueue_SegScan_Pack_HeadFlags(cl_command_queue h_cmd_queue,
		cl_mem dp_packed_head_flags, const cl_mem dp_head_flags, size_t n_flag_num) const
	{
		const CCLTiled_SegmentedReduceScan_Impl &r_scan_kernel =
			m_p_tile_segscan[m_t_segscan_autotune.n_Tuning(n_flag_num)];
		// the variant tuned for this many flags

		return r_scan_kernel.Enqueue_Pack_HeadFlags(h_cmd_queue,
			dp_packed_head_flags, dp_head_flags, n_flag_num);
	}

	/**
	 *	@brief enqueues head flags packing, using the segmented reduce kernel variant
	 *		chosen by the autotune for n_flag_num
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_packed_head_flags is destination buffer for the packed head flags
	 *		(see n_SegReduce_Packed_HeadFlags_Size() for the size)
	 *	@param[in] dp_head_flags is buffer with the head flags to pack
	 *	@param[in] n_flag_num is number of head flags
	 *
	 *	@return Returns the kernel call object of the selected kernel variant.
	 */
	CCLKernelCall Enqueue_SegReduce_Pack_HeadFlags(cl_command_queue h_cmd_queue,
		cl_mem dp_packed_head_flags, const cl_mem dp_head_flags, size_t n_flag_num) const
	{
		const CCLTiled_SegmentedReduceScan_Impl &r_reduce_kernel =
			m_p_tile_segreduce[m_t_segreduce_autotune.n_Tuning(n_flag_num)];
		// the variant tuned for this many flags

		return r_reduce_kernel.Enqueue_Pack_HeadFlags(h_cmd_queue,
			dp_packed_head_flags, dp_head_flags, n_flag_num);
	}

	/**
	 *	@brief enqueues segmented scan with unpacked head flags
	 *
	 *	Inputs that fit in a single tile are handled by a single kernel. Larger inputs
	 *	are processed in three steps: per-tile carry calculation (which also bit-packs
	 *	the head flags), a recursive segmented scan of the tile carries and a downsweep.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_segscan is destination buffer for the segmented scan
	 *	@param[in] dp_data is buffer with the elements to scan
	 *	@param[in] dp_head_flags is buffer with the (unpacked) head flags
	 *	@param[in] n_elem_num is number of elements
	 *	@param[in,out] r_memory_alloc is temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success, or the error code of the first failing step.
	 */
	CLresult n_Enqueue_SegScan(cl_command_queue h_cmd_queue, cl_mem dp_segscan,
		const cl_mem dp_data, const cl_mem dp_head_flags, size_t n_elem_num,
		CCLTempBufferStack &r_memory_alloc) const
	{
		const CCLTiled_SegmentedReduceScan_Impl &r_kernel =
			m_p_tile_segscan[m_t_segscan_autotune.n_Tuning(n_elem_num)];
		const size_t n_tile_elem_num = r_kernel.n_Tile_Size();
		// the variant tuned for this many elements and its tile size

		if(!(n_elem_num > n_tile_elem_num)) {
			return r_kernel.Enqueue_TileSegScan(h_cmd_queue, dp_segscan, dp_data,
				dp_head_flags, n_elem_num);
		}
		// a single tile suffices, no temporary buffers needed

		size_t n_carry_num = (n_elem_num + n_tile_elem_num - 1) / n_tile_elem_num;
		// number of tiles, rounded up (one carry and one flag per tile)

		CCLTempBufferReservation dp_tile_flags(n_carry_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_carry_num * m_config.n_DataType_Size(), r_memory_alloc);
		CCLTempBufferReservation dp_pack_flags(r_kernel.n_PackedHeadFlags_Size(n_elem_num), r_memory_alloc);
		// bit-pack the decoded head flags, save a bunch of traffic in the downsweep pass

		CLresult n_result = r_kernel.Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, dp_tile_carry,
			dp_tile_flags, dp_pack_flags, dp_data, dp_head_flags, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// calculate tile carries and tile flags, pack the head flags

		n_result = n_Enqueue_SegScan(h_cmd_queue, dp_tile_carry, dp_tile_carry,
			dp_tile_flags, n_carry_num, r_memory_alloc);
		if(n_result != cl_Success)
			return n_result;
		// recurse to scan the segment carry (in place)

		return r_kernel.Enqueue_TileSegScan_Downsweep_Packed(h_cmd_queue, dp_segscan,
			dp_data, dp_tile_carry, dp_pack_flags, n_elem_num);
		// segmented scan downsweep
	}

	/**
	 *	@brief enqueues segmented scan with head flags that are already packed
	 *
	 *	Inputs that fit in a single tile are handled by a single kernel. Larger inputs
	 *	are processed in three steps: per-tile carry calculation, a segmented scan of
	 *	the tile carries (using n_Enqueue_SegScan() with the unpacked tile flags)
	 *	and a downsweep.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_segscan is destination buffer for the segmented scan
	 *	@param[in] dp_data is buffer with the elements to scan
	 *	@param[in] dp_packed_head_flags is buffer with the packed head flags
	 *	@param[in] n_elem_num is number of elements
	 *	@param[in,out] r_memory_alloc is temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success, or the error code of the first failing step.
	 */
	CLresult n_Enqueue_SegScan_PackedHeadFlags(cl_command_queue h_cmd_queue, cl_mem dp_segscan,
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num,
		CCLTempBufferStack &r_memory_alloc) const
	{
		const CCLTiled_SegmentedReduceScan_Impl &r_kernel =
			m_p_tile_segscan[m_t_segscan_autotune.n_Tuning(n_elem_num)];
		const size_t n_tile_elem_num = r_kernel.n_Tile_Size();
		// the variant tuned for this many elements and its tile size

		if(!(n_elem_num > n_tile_elem_num)) {
			return r_kernel.Enqueue_TileSegScan_Packed(h_cmd_queue, dp_segscan,
				dp_data, dp_packed_head_flags, n_elem_num);
		}
		// a single tile suffices, no temporary buffers needed

		size_t n_carry_num = (n_elem_num + n_tile_elem_num - 1) / n_tile_elem_num;
		// number of tiles, rounded up (one carry and one flag per tile)

		CCLTempBufferReservation dp_tile_flags(n_carry_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_carry_num * m_config.n_DataType_Size(), r_memory_alloc);

		CLresult n_result = r_kernel.Enqueue_TileSegScan_Carry_Packed(h_cmd_queue, dp_tile_carry,
			dp_tile_flags, dp_data, dp_packed_head_flags, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// calculate tile carries and tile flags

		n_result = n_Enqueue_SegScan(h_cmd_queue, dp_tile_carry, dp_tile_carry,
			dp_tile_flags, n_carry_num, r_memory_alloc);
		if(n_result != cl_Success)
			return n_result;
		// recurse to scan the segment carry (in place; the tile flags are not packed)

		return r_kernel.Enqueue_TileSegScan_Downsweep_Packed(h_cmd_queue, dp_segscan,
			dp_data, dp_tile_carry, dp_packed_head_flags, n_elem_num);
		// segmented scan downsweep
	}

	/**
	 *	@brief enqueues segmented scan with head flags packed in natural order
	 *
	 *	Inputs that fit in a single tile are handled by a single kernel. Larger inputs
	 *	are processed in three steps: per-tile carry calculation, a segmented scan of
	 *	the tile carries (using n_Enqueue_SegScan() with the unpacked tile flags)
	 *	and a downsweep.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[out] dp_segscan is destination buffer for the segmented scan
	 *	@param[in] dp_data is buffer with the elements to scan
	 *	@param[in] dp_packed_head_flags is buffer with the head flags, packed in natural order
	 *	@param[in] n_elem_num is number of elements
	 *	@param[in,out] r_memory_alloc is temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success, or the error code of the first failing step.
	 */
	CLresult n_Enqueue_SegScan_PackedHeadFlags_NaturalOrder(cl_command_queue h_cmd_queue, cl_mem dp_segscan,
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num,
		CCLTempBufferStack &r_memory_alloc) const
	{
		const CCLTiled_SegmentedReduceScan_Impl &r_kernel =
			m_p_tile_segscan[m_t_segscan_autotune.n_Tuning(n_elem_num)];
		const size_t n_tile_elem_num = r_kernel.n_Tile_Size();
		// the variant tuned for this many elements and its tile size

		if(!(n_elem_num > n_tile_elem_num)) {
			return r_kernel.Enqueue_TileSegScan_PackedNaturalOrder(h_cmd_queue,
				dp_segscan, dp_data, dp_packed_head_flags, n_elem_num);
		}
		// a single tile suffices, no temporary buffers needed

		size_t n_carry_num = (n_elem_num + n_tile_elem_num - 1) / n_tile_elem_num;
		// number of tiles, rounded up (one carry and one flag per tile)

		CCLTempBufferReservation dp_tile_flags(n_carry_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_carry_num * m_config.n_DataType_Size(), r_memory_alloc);

		CLresult n_result = r_kernel.Enqueue_TileSegScan_Carry_PackedNaturalOrder(h_cmd_queue, dp_tile_carry,
			dp_tile_flags, dp_data, dp_packed_head_flags, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// calculate tile carries and tile flags

		n_result = n_Enqueue_SegScan(h_cmd_queue, dp_tile_carry, dp_tile_carry,
			dp_tile_flags, n_carry_num, r_memory_alloc);
		if(n_result != cl_Success)
			return n_result;
		// recurse to scan the segment carry (in place; the tile flags are not packed)

		return r_kernel.Enqueue_TileSegScan_Downsweep_PackedNaturalOrder(h_cmd_queue, dp_segscan,
			dp_data, dp_tile_carry, dp_packed_head_flags, n_elem_num);
		// segmented scan downsweep
	}

	/**
	 *	@brief enqueues the spine adjust pass of the segmented reduction
	 *
	 *	Uses the spine adjust kernel variant chosen by the autotune for n_tile_num.
	 *	If all the tile carries fit in a single tile of that kernel, a single spine
	 *	adjust kernel is enqueued. Otherwise the tile carries are first reduced to
	 *	second-level carries (packing the tile flags along the way), the second-level
	 *	carries are segment-scanned using n_Enqueue_SegScan() and the spine adjust
	 *	is performed by the downsweep kernel.
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in,out] dp_segreduce is buffer with the reductions to be adjusted
	 *	@param[in] dp_tile_tcounts is buffer with the (scanned) tile tail counts
	 *	@param[in] dp_packed_head_flags is buffer with the packed head flags of the elements
	 *	@param[in] n_tile_size is tile size of the segmented reduce kernel that produced
	 *		the tile carries (passed on to the kernels as n_tile_size / 32;
	 *		presumably the number of packed head flag words per tile - TODO confirm)
	 *	@param[in] dp_tile_carry is buffer with the tile carries
	 *	@param[in] dp_tile_flags is buffer with the (unpacked) tile flags
	 *	@param[in] n_tile_num is number of tiles (number of tile carries and tile flags)
	 *	@param[in,out] r_memory_alloc is temporary buffer allocator
	 *
	 *	@return Returns cl_Success on success, or the error code of the first failing step.
	 */
	CLresult n_Enqueue_SegReduce_SpineAdjust(cl_command_queue h_cmd_queue, cl_mem dp_segreduce,
		const cl_mem dp_tile_tcounts, const cl_mem dp_packed_head_flags, size_t n_tile_size,
		const cl_mem dp_tile_carry, const cl_mem dp_tile_flags, size_t n_tile_num,
		CCLTempBufferStack &r_memory_alloc) const
	{
		const CCLTiled_SegmentedReduceScan_Impl &r_scan =
			m_p_tile_segreduce_spineadj[m_t_segreduce_spineadj_autotune.n_Tuning(n_tile_num)];

		const size_t n_tile_size2 = r_scan.n_Tile_Size();
		if(n_tile_num > n_tile_size2) {
			size_t n_tile_num2 = (n_tile_num + n_tile_size2 - 1) / n_tile_size2;
			CCLTempBufferReservation dp_tile_flags2(n_tile_num2 * sizeof(uint32_t), r_memory_alloc);
			CCLTempBufferReservation dp_tile_carry_scan(n_tile_num2 * m_config.n_DataType_Size(), r_memory_alloc);
			// one flag and one carry per second-level tile

			CCLTempBufferReservation dp_packed_tile_head_flags(r_scan.n_PackedHeadFlags_Size(n_tile_num), r_memory_alloc);
			// bit-pack the decoded head flags, save a bunch of traffic in the downsweep pass
			// (this holds the packed form of all n_tile_num tile flags, which is what
			// the kernel below writes; sizing it by n_tile_num2 made the buffer too small)

			CLresult n_result = r_scan.Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, dp_tile_carry_scan,
				dp_tile_flags2, dp_packed_tile_head_flags, dp_tile_carry, dp_tile_flags, n_tile_num);
			if(n_result != cl_Success)
				return n_result;
			// calculate second-level carries and flags, pack the tile flags

			n_result = n_Enqueue_SegScan(h_cmd_queue, dp_tile_carry_scan, dp_tile_carry_scan,
				dp_tile_flags2, n_tile_num2, r_memory_alloc);
			if(n_result != cl_Success)
				return n_result;
			// scan the segment carry

			return r_scan.Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue, dp_segreduce,
				dp_tile_tcounts, dp_packed_head_flags, n_tile_size / 32, 1, dp_tile_carry, dp_tile_carry_scan,
				dp_packed_tile_head_flags, n_tile_num);
			// segmented reduction downsweep
		} else {
			return r_scan.Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue, dp_segreduce, dp_tile_tcounts,
				dp_packed_head_flags, n_tile_size / 32, 1, dp_tile_carry, dp_tile_flags, n_tile_num);
			// all the tile carries fit in a single tile, a single kernel does the job
		}
	}

	/**
	 *	@brief enqueues segmented reduction with unpacked head flags
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in,out] dp_segreduce is destination buffer for the reductions, or 0 to
	 *		have it allocated by this function (a supplied buffer which turns out to be
	 *		too small is released and replaced by a newly created one; the caller is
	 *		responsible for releasing the buffer this function leaves here)
	 *	@param[in,out] r_n_reduction_num is number of reductions, or 0 if not known
	 *		(then the command queue is synchronized and the real number is written here;
	 *		in _DEBUG builds this always happens)
	 *	@param[in] n_max_reduction_num is maximum number of reductions
	 *		that dp_segreduce can hold, or 0 if dp_segreduce is 0
	 *	@param[in] dp_data is buffer with the elements to reduce
	 *	@param[in] dp_head_flags is buffer with the (unpacked) head flags
	 *	@param[in] n_elem_num is number of elements
	 *	@param[in,out] r_int_scan is scan primitive, used to scan the tile tail counts
	 *	@param[in,out] r_memory_alloc is temporary buffer allocator (its context is also
	 *		used to create dp_segreduce, if needed)
	 *
	 *	@return Returns cl_Success on success, or the error code of the first failing
	 *		step (-123456 if the number of reductions was read back as zero).
	 *
	 *	@note NOTE(review): n_elem_num is assumed to be nonzero; with zero
	 *		elements the tile count is zero and the readback index below would wrap.
	 */
	CLresult n_Enqueue_SegReduce(cl_command_queue h_cmd_queue,
		cl_mem &dp_segreduce, // a buffer or 0 for allocate it for me
		size_t &r_n_reduction_num, // number of reductions or 0 for dont know
		size_t n_max_reduction_num, // max number of reductions that dp_segreduce can hold or 0 if dp_segreduce == 0
		const cl_mem dp_data, const cl_mem dp_head_flags, size_t n_elem_num, CCLScan &r_int_scan,
		CCLTempBufferStack &r_memory_alloc) const
	{
		_ASSERTE(!n_max_reduction_num == !dp_segreduce); // either both zero or both nonzero
		_ASSERTE(!n_max_reduction_num || n_max_reduction_num >= r_n_reduction_num); // if you guess, make a sane guess (if this fails then the dp_segreduce buffer has fewer elements than is guessed to be the number of reductions)
		_ASSERTE(r_n_reduction_num <= n_elem_num); // cannot possibly be more than that

		const CCLTiled_SegmentedReduceScan_Impl &r_reduce =
			m_p_tile_segreduce[m_t_segreduce_autotune.n_Tuning(n_elem_num)];

		const size_t n_tile_size = r_reduce.n_Tile_Size();

		size_t n_tile_num = (n_elem_num + n_tile_size - 1) / n_tile_size;
		CCLTempBufferReservation dp_tile_flags(n_tile_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_tile_num * m_config.n_DataType_Size(), r_memory_alloc);

		bool b_need_reduction_total = !r_n_reduction_num;
#ifdef _DEBUG
		b_need_reduction_total = true;
#endif // _DEBUG

		CCLTempBufferReservation dp_tile_tcounts((n_tile_num +
			((b_need_reduction_total)? 1 : 0)) * sizeof(uint32_t), r_memory_alloc); // +1!
		// one extra element receives the total when exclusive-scanned

		CCLTempBufferReservation dp_pack_flags(r_reduce.n_PackedHeadFlags_Size(n_elem_num), r_memory_alloc);
		// bit-pack the decoded head flags, save a bunch of traffic in the downsweep pass

		CLresult n_result = r_reduce.Enqueue_TileSegReduce_Bootstrap_Pack(h_cmd_queue, dp_tile_tcounts,
			dp_tile_flags, dp_pack_flags, dp_head_flags, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// bootstrap the segmented reduction by calculating tail counts per tile

		if(n_tile_num > 1) {
			n_result = r_int_scan.Enqueue_ExScan(h_cmd_queue, dp_tile_tcounts,
				dp_tile_tcounts, n_tile_num + ((b_need_reduction_total)? 1 : 0)); // +1!
			if(n_result != cl_Success)
				return n_result;
			// todo - this needs to be an int scan! (works now but won't work if the data type changes e.g. to float)
			// todo - make and use the reentrant version with a temp buffer allocator
		}
		// scan the tile counts to get tail counts

		uint32_t n_reds_num = r_n_reduction_num;
		if(b_need_reduction_total) { // the caller does not know how many there are, need to synchronize and check
			CCLCommandQueueWrapper q(h_cmd_queue);
			if((n_result = q.n_Finish()) != cl_Success) // ouch :(
				return n_result;
			size_t n_sum_index = (n_tile_num > 1)? n_tile_num : n_tile_num - 1; // if we just exscanned it then +1!
			if((n_result = q.n_Enqueue_Memcpy_DtoH(&n_reds_num, dp_tile_tcounts,
			   n_sum_index * sizeof(uint32_t), sizeof(uint32_t))) != cl_Success) // +1!
				return n_result;
			// NOTE(review): n_reds_num is used right away, this assumes that the copy is blocking - confirm
			if(n_reds_num < 1)
				return (CLresult)-123456;//n_reds_num = 1; // would likely indicate a failure in counting tail flags (there always is at least one)
#ifdef _DEBUG
			_ASSERTE(!r_n_reduction_num || r_n_reduction_num == n_reds_num); // make sure that the caller indeed knows
#endif // _DEBUG
			r_n_reduction_num = n_reds_num; // write out the real number
		}
		if(n_max_reduction_num < n_reds_num) {
			if(dp_segreduce) {
				clReleaseMemObject(dp_segreduce);
				dp_segreduce = cl_mem(0);
			}
			// the supplied buffer (if any) is too small, replace it

			size_t n_reduction_size = m_config.n_DataType_Size();
			if(n_reduction_size < sizeof(uint32_t))
				n_reduction_size = sizeof(uint32_t);
			// the reductions are of the configured data type, size the buffer accordingly
			// (this used to be sizeof(uint32_t) regardless of the data type which is too
			// small for wider types; never allocate less than that to keep the old capacity)

			n_result = CCLContextWrapper(r_memory_alloc.h_Context()).n_CreateBuffer(dp_segreduce,
				n_reds_num * n_reduction_size);
			if(n_result != cl_Success) {
				dp_segreduce = cl_mem(0); // just to make sure
				return n_result;
			}
		}
		// in case there is a lot of data, read the number of reductions, otherwise just use a large temp buffer

		if(n_tile_num > 1) {
			n_result = r_reduce.Enqueue_TileSegReduce_Packed(h_cmd_queue, dp_segreduce,
				dp_tile_carry, dp_tile_tcounts, dp_data, /*dp_head_flags*/dp_pack_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// apply the segmented reduction to tile partials

			return n_Enqueue_SegReduce_SpineAdjust(h_cmd_queue, dp_segreduce, dp_tile_tcounts,
				dp_pack_flags, n_tile_size, dp_tile_carry, dp_tile_flags, n_tile_num, r_memory_alloc);
			// a nice function to take care of autotuning of the spine adjust kernel in addition
			// to the autotuning of the required segmented scan kernels (it replaces the former
			// inline implementation of the spine adjust which was disabled here)

			// original design note on the spine adjust: need a new kernel which does
			// seg scan (downsweep) and reduction fixup in one
			// dp_reductions[dp_tile_tcounts[i]] += scan[i] for each i where either !i or
			// dp_tile_tcounts[i] > dp_tile_tcounts[i - 1] ... TileSegReduce_SpineAdjust()
		} else {
			return r_reduce.Enqueue_TileSegReduceSingle_Packed(h_cmd_queue, dp_segreduce,
				dp_data, dp_pack_flags, n_elem_num);
			// a single tile, the reduction is done by a single kernel
		}
	}

	/**
	 *	@brief enqueues segmented reduction with head flags that are already packed
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in,out] dp_segreduce is destination buffer for the reductions, or 0 to
	 *		have it allocated by this function (a supplied buffer which turns out to be
	 *		too small is released and replaced by a newly created one; the caller is
	 *		responsible for releasing the buffer this function leaves here)
	 *	@param[in,out] r_n_reduction_num is number of reductions, or 0 if not known
	 *		(then the command queue is synchronized and the real number is written here;
	 *		in _DEBUG builds this always happens)
	 *	@param[in] n_max_reduction_num is maximum number of reductions
	 *		that dp_segreduce can hold, or 0 if dp_segreduce is 0
	 *	@param[in] dp_data is buffer with the elements to reduce
	 *	@param[in] dp_packed_head_flags is buffer with the packed head flags
	 *	@param[in] n_elem_num is number of elements
	 *	@param[in,out] r_int_scan is scan primitive, used to scan the tile tail counts
	 *	@param[in,out] r_memory_alloc is temporary buffer allocator (its context is also
	 *		used to create dp_segreduce, if needed)
	 *
	 *	@return Returns cl_Success on success, or the error code of the first failing
	 *		step (-123456 if the number of reductions was read back as zero).
	 *
	 *	@note NOTE(review): n_elem_num is assumed to be nonzero; with zero
	 *		elements the tile count is zero and the readback index below would wrap.
	 */
	CLresult n_Enqueue_SegReduce_PackedHeadFlags(cl_command_queue h_cmd_queue,
		cl_mem &dp_segreduce, // a buffer or 0 for allocate it for me
		size_t &r_n_reduction_num, // number of reductions or 0 for dont know
		size_t n_max_reduction_num, // max number of reductions that dp_segreduce can hold or 0 if dp_segreduce == 0
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num, CCLScan &r_int_scan,
		CCLTempBufferStack &r_memory_alloc) const
	{
		_ASSERTE(!n_max_reduction_num == !dp_segreduce); // either both zero or both nonzero
		_ASSERTE(!n_max_reduction_num || n_max_reduction_num >= r_n_reduction_num); // if you guess, make a sane guess (if this fails then the dp_segreduce buffer has fewer elements than is guessed to be the number of reductions)
		_ASSERTE(r_n_reduction_num <= n_elem_num); // cannot possibly be more than that

		const CCLTiled_SegmentedReduceScan_Impl &r_reduce =
			m_p_tile_segreduce[m_t_segreduce_autotune.n_Tuning(n_elem_num)];

		const size_t n_tile_size = r_reduce.n_Tile_Size();

		size_t n_tile_num = (n_elem_num + n_tile_size - 1) / n_tile_size;
		CCLTempBufferReservation dp_tile_flags(n_tile_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_tile_num * m_config.n_DataType_Size(), r_memory_alloc);

		bool b_need_reduction_total = !r_n_reduction_num;
#ifdef _DEBUG
		b_need_reduction_total = true;
#endif // _DEBUG

		CCLTempBufferReservation dp_tile_tcounts((n_tile_num +
			((b_need_reduction_total)? 1 : 0)) * sizeof(uint32_t), r_memory_alloc); // +1!
		// one extra element receives the total when exclusive-scanned

		CLresult n_result = r_reduce.Enqueue_TileSegReduce_Bootstrap_Packed(h_cmd_queue,
			dp_tile_tcounts, dp_tile_flags, dp_packed_head_flags, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// bootstrap the segmented reduction by calculating tail counts per tile

		if(n_tile_num > 1) {
			n_result = r_int_scan.Enqueue_ExScan(h_cmd_queue, dp_tile_tcounts,
				dp_tile_tcounts, n_tile_num + ((b_need_reduction_total)? 1 : 0)); // +1!
			if(n_result != cl_Success)
				return n_result;
			// todo - this needs to be an int scan! (works now but won't work if the data type changes e.g. to float)
			// todo - make and use the reentrant version with a temp buffer allocator
		}
		// scan the tile counts to get tail counts

		uint32_t n_reds_num = r_n_reduction_num;
		if(b_need_reduction_total) { // the caller does not know how many there are, need to synchronize and check
			CCLCommandQueueWrapper q(h_cmd_queue);
			if((n_result = q.n_Finish()) != cl_Success) // ouch :(
				return n_result;
			size_t n_sum_index = (n_tile_num > 1)? n_tile_num : n_tile_num - 1; // if we just exscanned it then +1!
			if((n_result = q.n_Enqueue_Memcpy_DtoH(&n_reds_num, dp_tile_tcounts,
			   n_sum_index * sizeof(uint32_t), sizeof(uint32_t))) != cl_Success) // +1!
				return n_result;
			// NOTE(review): n_reds_num is used right away, this assumes that the copy is blocking - confirm
			if(n_reds_num < 1)
				return (CLresult)-123456;//n_reds_num = 1; // would likely indicate a failure in counting tail flags (there always is at least one)
#ifdef _DEBUG
			_ASSERTE(!r_n_reduction_num || r_n_reduction_num == n_reds_num); // make sure that the caller indeed knows
#endif // _DEBUG
			r_n_reduction_num = n_reds_num; // write out the real number
		}
		if(n_max_reduction_num < n_reds_num) {
			if(dp_segreduce) {
				clReleaseMemObject(dp_segreduce);
				dp_segreduce = cl_mem(0);
			}
			// the supplied buffer (if any) is too small, replace it

			size_t n_reduction_size = m_config.n_DataType_Size();
			if(n_reduction_size < sizeof(uint32_t))
				n_reduction_size = sizeof(uint32_t);
			// the reductions are of the configured data type, size the buffer accordingly
			// (this used to be sizeof(uint32_t) regardless of the data type which is too
			// small for wider types; never allocate less than that to keep the old capacity)

			n_result = CCLContextWrapper(r_memory_alloc.h_Context()).n_CreateBuffer(dp_segreduce,
				n_reds_num * n_reduction_size);
			if(n_result != cl_Success) {
				dp_segreduce = cl_mem(0); // just to make sure
				return n_result;
			}
		}
		// in case there is a lot of data, read the number of reductions, otherwise just use a large temp buffer

		if(n_tile_num > 1) {
			n_result = r_reduce.Enqueue_TileSegReduce_Packed(h_cmd_queue, dp_segreduce,
				dp_tile_carry, dp_tile_tcounts, dp_data, dp_packed_head_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// apply the segmented reduction to tile partials

			return n_Enqueue_SegReduce_SpineAdjust(h_cmd_queue, dp_segreduce, dp_tile_tcounts,
				dp_packed_head_flags, n_tile_size, dp_tile_carry, dp_tile_flags, n_tile_num, r_memory_alloc);
			// a nice function to take care of autotuning of the spine adjust kernel in addition
			// to the autotuning of the required segmented scan kernels
		} else {
			return r_reduce.Enqueue_TileSegReduceSingle_Packed(h_cmd_queue, dp_segreduce,
				dp_data, dp_packed_head_flags, n_elem_num);
			// a single tile, the reduction is done by a single kernel
		}
	}

	/**
	 *	@brief enqueues segmented reduction with head flags packed in natural order
	 *
	 *	@param[in] h_cmd_queue is command queue
	 *	@param[in,out] dp_segreduce is destination buffer for the reductions, or 0 to
	 *		have it allocated by this function (a supplied buffer which turns out to be
	 *		too small is released and replaced by a newly created one; the caller is
	 *		responsible for releasing the buffer this function leaves here)
	 *	@param[in,out] r_n_reduction_num is number of reductions, or 0 if not known
	 *		(then the command queue is synchronized and the real number is written here;
	 *		in _DEBUG builds this always happens)
	 *	@param[in] n_max_reduction_num is maximum number of reductions
	 *		that dp_segreduce can hold, or 0 if dp_segreduce is 0
	 *	@param[in] dp_data is buffer with the elements to reduce
	 *	@param[in] dp_packed_head_flags is buffer with the head flags, packed in natural order
	 *	@param[in] n_elem_num is number of elements
	 *	@param[in,out] r_int_scan is scan primitive, used to scan the tile tail counts
	 *	@param[in,out] r_memory_alloc is temporary buffer allocator (its context is also
	 *		used to create dp_segreduce, if needed)
	 *
	 *	@return Returns cl_Success on success, or the error code of the first failing
	 *		step (-123456 if the number of reductions was read back as zero).
	 *
	 *	@note NOTE(review): n_elem_num is assumed to be nonzero; with zero
	 *		elements the tile count is zero and the readback index below would wrap.
	 */
	CLresult n_Enqueue_SegReduce_PackedHeadFlags_NaturalOrder(cl_command_queue h_cmd_queue,
		cl_mem &dp_segreduce, // a buffer or 0 for allocate it for me
		size_t &r_n_reduction_num, // number of reductions or 0 for dont know
		size_t n_max_reduction_num, // max number of reductions that dp_segreduce can hold or 0 if dp_segreduce == 0
		const cl_mem dp_data, const cl_mem dp_packed_head_flags, size_t n_elem_num, CCLScan &r_int_scan,
		CCLTempBufferStack &r_memory_alloc) const
	{
		_ASSERTE(!n_max_reduction_num == !dp_segreduce); // either both zero or both nonzero
		_ASSERTE(!n_max_reduction_num || n_max_reduction_num >= r_n_reduction_num); // if you guess, make a sane guess (if this fails then the dp_segreduce buffer has fewer elements than is guessed to be the number of reductions)
		_ASSERTE(r_n_reduction_num <= n_elem_num); // cannot possibly be more than that

		const CCLTiled_SegmentedReduceScan_Impl &r_reduce =
			m_p_tile_segreduce[m_t_segreduce_autotune.n_Tuning(n_elem_num)];

		const size_t n_tile_size = r_reduce.n_Tile_Size();

		size_t n_tile_num = (n_elem_num + n_tile_size - 1) / n_tile_size;
		CCLTempBufferReservation dp_tile_flags(n_tile_num * sizeof(uint32_t), r_memory_alloc);
		CCLTempBufferReservation dp_tile_carry(n_tile_num * m_config.n_DataType_Size(), r_memory_alloc);

		bool b_need_reduction_total = !r_n_reduction_num;
#ifdef _DEBUG
		b_need_reduction_total = true;
#endif // _DEBUG

		CCLTempBufferReservation dp_tile_tcounts((n_tile_num +
			((b_need_reduction_total)? 1 : 0)) * sizeof(uint32_t), r_memory_alloc); // +1!
		// one extra element receives the total when exclusive-scanned

		CLresult n_result = r_reduce.Enqueue_TileSegReduce_Bootstrap_PackedNaturalOrder(h_cmd_queue,
			dp_tile_tcounts, dp_tile_flags, dp_packed_head_flags, n_elem_num);
		if(n_result != cl_Success)
			return n_result;
		// bootstrap the segmented reduction by calculating tail counts per tile

		if(n_tile_num > 1) {
			n_result = r_int_scan.Enqueue_ExScan(h_cmd_queue, dp_tile_tcounts,
				dp_tile_tcounts, n_tile_num + ((b_need_reduction_total)? 1 : 0)); // +1!
			if(n_result != cl_Success)
				return n_result;
			// todo - this needs to be an int scan! (works now but won't work if the data type changes e.g. to float)
			// todo - make and use the reentrant version with a temp buffer allocator
		}
		// scan the tile counts to get tail counts

		uint32_t n_reds_num = r_n_reduction_num;
		if(b_need_reduction_total) { // the caller does not know how many there are, need to synchronize and check
			CCLCommandQueueWrapper q(h_cmd_queue);
			if((n_result = q.n_Finish()) != cl_Success) // ouch :(
				return n_result;
			size_t n_sum_index = (n_tile_num > 1)? n_tile_num : n_tile_num - 1; // if we just exscanned it then +1!
			if((n_result = q.n_Enqueue_Memcpy_DtoH(&n_reds_num, dp_tile_tcounts,
			   n_sum_index * sizeof(uint32_t), sizeof(uint32_t))) != cl_Success) // +1!
				return n_result;
			// NOTE(review): n_reds_num is used right away, this assumes that the copy is blocking - confirm
			if(n_reds_num < 1)
				return (CLresult)-123456;//n_reds_num = 1; // would likely indicate a failure in counting tail flags (there always is at least one)
#ifdef _DEBUG
			_ASSERTE(!r_n_reduction_num || r_n_reduction_num == n_reds_num); // make sure that the caller indeed knows
#endif // _DEBUG
			r_n_reduction_num = n_reds_num; // write out the real number
		}
		if(n_max_reduction_num < n_reds_num) {
			if(dp_segreduce) {
				clReleaseMemObject(dp_segreduce);
				dp_segreduce = cl_mem(0);
			}
			// the supplied buffer (if any) is too small, replace it

			size_t n_reduction_size = m_config.n_DataType_Size();
			if(n_reduction_size < sizeof(uint32_t))
				n_reduction_size = sizeof(uint32_t);
			// the reductions are of the configured data type, size the buffer accordingly
			// (this used to be sizeof(uint32_t) regardless of the data type which is too
			// small for wider types; never allocate less than that to keep the old capacity)

			n_result = CCLContextWrapper(r_memory_alloc.h_Context()).n_CreateBuffer(dp_segreduce,
				n_reds_num * n_reduction_size);
			if(n_result != cl_Success) {
				dp_segreduce = cl_mem(0); // just to make sure
				return n_result;
			}
		}
		// in case there is a lot of data, read the number of reductions, otherwise just use a large temp buffer

		if(n_tile_num > 1) {
			n_result = r_reduce.Enqueue_TileSegReduce_PackedNaturalOrder(h_cmd_queue, dp_segreduce,
				dp_tile_carry, dp_tile_tcounts, dp_data, dp_packed_head_flags, n_elem_num);
			if(n_result != cl_Success)
				return n_result;
			// apply the segmented reduction to tile partials

			return n_Enqueue_SegReduce_SpineAdjust(h_cmd_queue, dp_segreduce, dp_tile_tcounts,
				dp_packed_head_flags, n_tile_size, dp_tile_carry, dp_tile_flags, n_tile_num, r_memory_alloc);
			// ignores natural order in the head flags, the first flag is in the same position
			// in both orders and that's all it needs to read

			// a nice function to take care of autotuning of the spine adjust kernel in addition
			// to the autotuning of the required segmented scan kernels
		} else {
			return r_reduce.Enqueue_TileSegReduceSingle_PackedNaturalOrder(h_cmd_queue, dp_segreduce,
				dp_data, dp_packed_head_flags, n_elem_num);
			// a single tile, the reduction is done by a single kernel
		}
	}

	/**
	 *	@brief runs a combined correctness test and throughput benchmark of the global
	 *		segmented reduce and global segmented scan primitives
	 *
	 *	For each size in a fixed list (10k to 20M elements) random data and random head flags
	 *	are generated, the head flags are packed, and each available variant (unpacked head flags,
	 *	packed head flags and - if the first tuning uses strided head flags - packed head flags
	 *	in natural order) is timed. The results are then compared to a CPU reference.
	 *
	 *	@param[in] h_cmd_queue is the command queue all transfers and kernels are enqueued to
	 *	@param[in] r_int_scan is the scan object forwarded to the segmented reduce calls
	 *
	 *	@return Returns false if the object is not ready (b_Status() fails), if a device buffer
	 *		cannot be allocated or if an enqueue / finish call fails. Returns true otherwise.
	 *
	 *	@note Incorrect results are reported to stderr but the function still returns true
	 *		(this is kept unchanged for the existing callers).
	 *	@note The temporary buffers are taken from m_temp_buffers.
	 */
	bool Benchmark(cl_command_queue h_cmd_queue, CCLScan &r_int_scan)
	{
		if(!b_Status())
			return false;

		CCLContextWrapper context(m_temp_buffers.h_Context());
		CCLCommandQueueWrapper cmd_queue(h_cmd_queue);
		// thin wrappers, do not delete the handles

		CCLTempBufferStack &mem_alloc = m_temp_buffers;
		// temp buffers, reuse between passes and benchmarks (otherwise prints a lot of verbose)

		bool b_results_correct = true;
		const size_t p_size[] = {1000 * 10, 1000 * 50, 1000 * 100, 1000 * 200,
			1000 * 500, 1000000, 1000000 * 2, 1000000 * 5, 1000000 * 10, 1000000 * 20};
		//const size_t p_size[] = {10, 100, 1000, 10000, 100000, 1000000, 10000000};
		const size_t n_size_num = sizeof(p_size) / sizeof(p_size[0]);
		// the problem sizes to test (unsigned count, avoids a signed / unsigned comparison below)

		if(b_Have_SegReduce()) {
			for(size_t n_test = 0; n_test < n_size_num; ++ n_test) {
				size_t n = p_size[n_test];

				printf("preparing data ...\r");

				seg_debug::THeadFlag_DebugInfo hf;
				//
				std::vector<uint32_t> scan_data(n);
				for(size_t i = 0; i < n; ++ i)
					scan_data[i] = (uint32_t)(i + 1);
				std::random_shuffle(scan_data.begin(), scan_data.end());
				// generate some data
				// NOTE(review): std::random_shuffle is deprecated since C++14 and removed in C++17,
				// replacing it with std::shuffle would require <random>

				const size_t n_avg_seg_size = 500;
				// given as an input

				//size_t n_tile_num = (n + m_n_tile_size - 1) / m_n_tile_size;
				// number of tiles

				size_t n_head_flags_size_bytes = n_SegReduce_Packed_HeadFlags_Size(n);
				// size of the packed head flags

				const size_t n_seg_num = n / n_avg_seg_size + 1; // there must be at least one
				// number of segments (and so also of reductions)

				CCLUniqueMem dp_data, dp_reduce, dp_reduce_pphf, dp_reduce_pphfno,
					dp_head_flags, dp_packed_natural_hf, dp_packed_interleaved_hf;
				if(!(dp_data = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_reduce = context.h_CreateBuffer(n_seg_num * sizeof(uint32_t))) ||
				   !(dp_reduce_pphf = context.h_CreateBuffer(n_seg_num * sizeof(uint32_t))) ||
				   (m_p_tile_segreduce->b_Strided_HeadFlags() && // only if m_b_strided_head_flags is set
				   !(dp_reduce_pphfno = context.h_CreateBuffer(n_seg_num * sizeof(uint32_t)))) ||
				   !(dp_head_flags = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_packed_natural_hf = context.h_CreateBuffer(n_head_flags_size_bytes)) ||
				   !(dp_packed_interleaved_hf = context.h_CreateBuffer(n_head_flags_size_bytes))) {
					fprintf(stderr, "error: failed to alloc device buffer\n");
					return false;
				}
				// allocate memory (m_p_tile_segreduce-> refers to the first tuning variant in the array)

				_ASSERTE(!(n_head_flags_size_bytes % sizeof(uint32_t)));
				std::vector<uint32_t> packed_head_flags((m_p_tile_segreduce->b_Strided_HeadFlags())?
					n_head_flags_size_bytes / sizeof(uint32_t) : 0, 0);
				// allocate this ahead of time too

				printf("running global segmented reduce test ...  \r");

				CTimer test_timer;
				double f_time = 0;
				double f_time_pack = 0;
				double f_time_pphf = 0;
				double f_time_pphfno = 0;
				int n_pass_num = 0;
				for(;;) {
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_data, 0, &scan_data[0], n * sizeof(uint32_t));
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_reduce, 0, &scan_data[0], n_seg_num * sizeof(uint32_t)); // clear this buffer as well
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_reduce_pphf, 0, &scan_data[0], n_seg_num * sizeof(uint32_t)); // clear this buffer as well
					if(m_p_tile_segreduce->b_Strided_HeadFlags())
						cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_reduce_pphfno, 0, &scan_data[0], n_seg_num * sizeof(uint32_t)); // clear this buffer as well
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_packed_interleaved_hf, 0, &scan_data[0], n_head_flags_size_bytes); // and this buffer as well
					// upload the input and overwrite all the outputs with unrelated data
					// so that the results of a previous pass cannot pass the check

					seg_debug::CSegmentedOp_Random_Benchmark().n_Generate_HeadFlags(hf, n, n_seg_num - 1);
					// generate head flags

					if(m_p_tile_segreduce->b_Strided_HeadFlags()) {
						packed_head_flags.assign(n_head_flags_size_bytes / sizeof(uint32_t), 0);
						for(size_t i = 0; i < n; ++ i)
							packed_head_flags[i / 32] |= uint32_t(hf.head_flags[i] != 0) << (i & 31); // shift as unsigned; shifting a signed int 1 by 31 overflows
						// pack the head flags on the CPU
					}

					cmd_queue.n_Enqueue_Memcpy_HtoD(dp_head_flags, 0, &hf.head_flags[0], n * sizeof(uint32_t));
					if(m_p_tile_segreduce->b_Strided_HeadFlags()) {
						cmd_queue.n_Enqueue_Memcpy_HtoD(dp_packed_natural_hf, 0,
							&packed_head_flags[0], n_head_flags_size_bytes); // !!
					}
					CLresult n_result0 = cmd_queue.n_Finish();
					if(n_result0) {
						fprintf(stderr, "error: pre-finish result: %d (%s, %d)\n", n_result0, __FILE__, __LINE__);
						return false;
					}
					// prepare data ...

					double f_pack_start_time = test_timer.f_Time();

					{
						CLresult n_result = Enqueue_SegReduce_Pack_HeadFlags(cmd_queue,
							dp_packed_interleaved_hf, dp_head_flags, n);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}
					// time the packing of the head flags on the device

					f_time_pack += test_timer.f_Time() - f_pack_start_time;

					double f_pphf_start_time = test_timer.f_Time();

					{
						size_t n_seg_num1 = n_seg_num;
						CLresult n_result = n_Enqueue_SegReduce_PackedHeadFlags(cmd_queue,
							*const_cast<cl_mem*>(&dp_reduce_pphf), n_seg_num1, n_seg_num1, dp_data, //dp_head_flags, // debug
							dp_packed_interleaved_hf, n, r_int_scan, mem_alloc/*, f_bootstrap_time,
							f_exscan_time, f_segreduce_time, f_segscan_time, f_spine_time, f_total_time*/);
						_ASSERTE(n_seg_num1 == n_seg_num);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}
					// time the segmented reduce with the device-packed head flags

					f_time_pphf += test_timer.f_Time() - f_pphf_start_time;

					if(m_p_tile_segreduce->b_Strided_HeadFlags()) {
						double f_pphfno_start_time = test_timer.f_Time();

						{
							CCLUniqueEvent ev_start, ev_end;

							//cmd_queue.n_Enqueue_Marker(ev_start);

							size_t n_seg_num1 = n_seg_num;
							CLresult n_result = n_Enqueue_SegReduce_PackedHeadFlags_NaturalOrder(cmd_queue,
								*const_cast<cl_mem*>(&dp_reduce_pphfno), n_seg_num1, n_seg_num1, dp_data, //dp_head_flags, // debug
								dp_packed_natural_hf, n, r_int_scan, mem_alloc);
							_ASSERTE(n_seg_num1 == n_seg_num);
							if(n_result != CL_SUCCESS) {
								fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
								return false;
							}

							/*cmd_queue.n_Enqueue_Marker(ev_end);

							ev_start.n_Wait();
							ev_end.n_Wait();
							uint64_t n_start, n_end;
							n_result = ev_start.n_GetProfilingCounter(n_start, CL_PROFILING_COMMAND_START);
							n_result = ev_end.n_GetProfilingCounter(n_end, CL_PROFILING_COMMAND_END);
							f_time_pphfno += CCLUniqueEvent::f_ProfilingCounter_Difference(n_start, n_end);*/

							n_result = cmd_queue.n_Finish();
							if(n_result != CL_SUCCESS) {
								fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
								return false;
							}
						}

						f_time_pphfno += test_timer.f_Time() - f_pphfno_start_time;
					}
					// time the segmented reduce with the CPU-packed natural order head flags

					double f_start_time = test_timer.f_Time();

					{
						size_t n_seg_num1 = n_seg_num;
						CLresult n_result = n_Enqueue_SegReduce(cmd_queue,
							*const_cast<cl_mem*>(&dp_reduce), n_seg_num1, n_seg_num1, dp_data, dp_head_flags,
							n, r_int_scan, mem_alloc);
						_ASSERTE(n_seg_num1 == n_seg_num);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}
					// time the segmented reduce with the unpacked head flags

					double f_pass_time = test_timer.f_Time() - f_start_time;
					f_time += f_pass_time;
					++ n_pass_num;

					if((f_time > .5f && n_pass_num > 10) || f_time > 4)
						break;
					// make sure the timing is stable, don't take too long at the same time
					// (only the time of the unpacked variant is considered here)
				}
				//-- n_pass_num; // the first pass did not count
				// run the thing

				f_time /= n_pass_num;
				f_time_pack /= n_pass_num;
				f_time_pphf /= n_pass_num;
				f_time_pphfno /= n_pass_num;
				size_t n_data = 1 * scan_data.size() * sizeof(uint32_t); // only read the data, written reductions much smaller
				double f_GBps = n_data / f_time * 1e-9; // mGPU also uses 1e-9 rather than 1024^3
				if(m_p_tile_segreduce->b_Strided_HeadFlags()) {
					printf("" PRIsizeB "B: %f ms -> %f*1e9 B/s (%.4f w/hf, %.4f ilv, %.4f nat, %.4f pck)\n",
						PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps, f_GBps * 2, // also read head flags (as big as the data if unpacked)
						n_data / f_time_pphf * 1e-9, n_data / f_time_pphfno * 1e-9, n_data / f_time_pack * 1e-9); // only read head flags
				} else {
					printf("" PRIsizeB "B: %f ms -> %f*1e9 B/s (%.4f w/hf, %.4f nat, %.4f pck)\n",
						PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps, f_GBps * 2, // also read head flags (as big as the data if unpacked)
						n_data / f_time_pphf * 1e-9, n_data / f_time_pack * 1e-9); // only read head flags
				}
				// print results

				std::vector<uint32_t> seg_reduction_cpu(n); // could work inplace but we still need a buffer for getting the GPU result(s)
				CTimer tcpu;
				seg_debug::CReference::Segmented_Reduce(seg_reduction_cpu, scan_data, hf.head_flags);
				printf("global segmented reduction takes %f msec on CPU\n", tcpu.f_Time() * 1000);
				// compute the reference segmented reduction on the CPU (the goal)

				bool b_test_correct;
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct, seg_reduction_cpu.begin(),
				   seg_reduction_cpu.end(), dp_reduce, 0, "global segmented reduction") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_results_correct = false; // could not verify, must not report success below
					break;
				}
				bool b_test_correct_pphf;
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct_pphf, seg_reduction_cpu.begin(),
				   seg_reduction_cpu.end(), dp_reduce_pphf, 0, "global segmented reduction pphf") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_results_correct = false; // could not verify, must not report success below
					break;
				}
				bool b_test_correct_pphfno = true;
				if(m_p_tile_segreduce->b_Strided_HeadFlags()) {
					if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct_pphfno, seg_reduction_cpu.begin(),
					   seg_reduction_cpu.end(), dp_reduce_pphfno, 0, "global segmented reduction pphfno") != cl_Success) {
						fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
						b_results_correct = false; // could not verify, must not report success below
						break;
					}
				}
				// reusable function, uses clnqueueMapBuffer()

				if(b_test_correct && b_test_correct_pphf && b_test_correct_pphfno)
					printf("done. global segmented reduction of %d items succeeded\n", int(n)); // n is size_t, must be converted to match %d (the sizes tested fit in int)
				else {
					b_results_correct = false;
					break;
				}
				// make sure it is reduced correctly
			}
		}
		if(b_Have_SegScan()) {
			for(size_t n_test = 0; n_test < n_size_num; ++ n_test) {
				size_t n = p_size[n_test];

				printf("preparing data ...\r");

				seg_debug::THeadFlag_DebugInfo hf;
				//
				std::vector<uint32_t> scan_data(n);
				for(size_t i = 0; i < n; ++ i)
					scan_data[i] = (uint32_t)(i + 1);
				std::random_shuffle(scan_data.begin(), scan_data.end());
				// generate some data

				const size_t n_avg_seg_size = 500;
				// given as an input

				//size_t n_tile_num = (n + m_n_tile_size - 1) / m_n_tile_size;
				// number of tiles

				size_t n_head_flags_size_bytes = n_SegScan_Packed_HeadFlags_Size(n);
				// size of the packed head flags

				CCLUniqueMem dp_data, dp_scan, dp_scan_pphf, dp_scan_pphfno,
					dp_head_flags, dp_packed_natural_hf, dp_packed_interleaved_hf;
				if(!(dp_data = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_scan = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_scan_pphf = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   (m_p_tile_segscan->b_Strided_HeadFlags() && // only if m_b_strided_head_flags is set
				   !(dp_scan_pphfno = context.h_CreateBuffer(n * sizeof(uint32_t)))) ||
				   !(dp_head_flags = context.h_CreateBuffer(n * sizeof(uint32_t))) ||
				   !(dp_packed_natural_hf = context.h_CreateBuffer(n_head_flags_size_bytes)) ||
				   !(dp_packed_interleaved_hf = context.h_CreateBuffer(n_head_flags_size_bytes))) {
					fprintf(stderr, "error: failed to alloc device buffer\n");
					return false;
				}
				// allocate memory (m_p_tile_segscan-> refers to the first tuning variant in the array)

				_ASSERTE(!(n_head_flags_size_bytes % sizeof(uint32_t)));
				std::vector<uint32_t> packed_head_flags((m_p_tile_segscan->b_Strided_HeadFlags())?
					n_head_flags_size_bytes / sizeof(uint32_t) : 0, 0);
				// allocate this ahead of time too

				printf("running global segmented scan test ...  \r");

				CTimer test_timer;
				double f_time = 0;
				double f_time_pack = 0;
				double f_time_pphf = 0;
				double f_time_pphfno = 0;
				int n_pass_num = 0;
				for(;;) {
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_data, 0, &scan_data[0], n * sizeof(uint32_t));
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_scan, 0, &scan_data[0], n * sizeof(uint32_t)); // clear this buffer as well
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_scan_pphf, 0, &scan_data[0], n * sizeof(uint32_t)); // clear this buffer as well
					if(m_p_tile_segscan->b_Strided_HeadFlags())
						cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_scan_pphfno, 0, &scan_data[0], n * sizeof(uint32_t)); // clear this buffer as well
					cmd_queue.n_Enqueue_Memcpy_HtoD_Async(dp_packed_interleaved_hf, 0, &scan_data[0], n_head_flags_size_bytes); // and this buffer as well
					// upload the input and overwrite all the outputs with unrelated data
					// so that the results of a previous pass cannot pass the check

					size_t n_seg_num = n / n_avg_seg_size + 1; // there must be at least one
					seg_debug::CSegmentedOp_Random_Benchmark().n_Generate_HeadFlags(hf, n, n_seg_num - 1);
					// generate head flags

					if(m_p_tile_segscan->b_Strided_HeadFlags()) {
						packed_head_flags.assign(n_head_flags_size_bytes / sizeof(uint32_t), 0);
						for(size_t i = 0; i < n; ++ i)
							packed_head_flags[i / 32] |= uint32_t(hf.head_flags[i] != 0) << (i & 31); // shift as unsigned; shifting a signed int 1 by 31 overflows
						// pack the head flags on the CPU
					}

					cmd_queue.n_Enqueue_Memcpy_HtoD(dp_head_flags, 0, &hf.head_flags[0], n * sizeof(uint32_t));
					if(m_p_tile_segscan->b_Strided_HeadFlags()) {
						cmd_queue.n_Enqueue_Memcpy_HtoD(dp_packed_natural_hf, 0,
							&packed_head_flags[0], n_head_flags_size_bytes); // !!
					}
					CLresult n_result0 = cmd_queue.n_Finish();
					if(n_result0) {
						fprintf(stderr, "error: pre-finish result: %d (%s, %d)\n", n_result0, __FILE__, __LINE__);
						return false;
					}
					// prepare data ...

					double f_pack_start_time = test_timer.f_Time();

					{
						CLresult n_result = Enqueue_SegScan_Pack_HeadFlags(cmd_queue,
							dp_packed_interleaved_hf, dp_head_flags, n);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}
					// time the packing of the head flags on the device

					f_time_pack += test_timer.f_Time() - f_pack_start_time;

					double f_pphf_start_time = test_timer.f_Time();

					{
						CLresult n_result = n_Enqueue_SegScan_PackedHeadFlags(cmd_queue,
							dp_scan_pphf, dp_data, dp_packed_interleaved_hf, n, mem_alloc);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}
					// time the segmented scan with the device-packed head flags

					f_time_pphf += test_timer.f_Time() - f_pphf_start_time;

					if(m_p_tile_segscan->b_Strided_HeadFlags()) {
						double f_pphfno_start_time = test_timer.f_Time();

						{
							CLresult n_result = n_Enqueue_SegScan_PackedHeadFlags_NaturalOrder(cmd_queue,
								dp_scan_pphfno, dp_data, dp_packed_natural_hf, n, mem_alloc);
							if(n_result != CL_SUCCESS) {
								fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
								return false;
							}
							n_result = cmd_queue.n_Finish();
							if(n_result != CL_SUCCESS) {
								fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
								return false;
							}
						}

						f_time_pphfno += test_timer.f_Time() - f_pphfno_start_time;
					}
					// time the segmented scan with the CPU-packed natural order head flags

					double f_start_time = test_timer.f_Time();

					{
						CLresult n_result = n_Enqueue_SegScan(cmd_queue,
							dp_scan, dp_data, dp_head_flags, n, mem_alloc);
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: clCall1D() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
						n_result = cmd_queue.n_Finish();
						if(n_result != CL_SUCCESS) {
							fprintf(stderr, "error: finish result: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
							return false;
						}
					}
					// time the segmented scan with the unpacked head flags

					double f_pass_time = test_timer.f_Time() - f_start_time;
					f_time += f_pass_time;
					++ n_pass_num;

					if((f_time > .5f && n_pass_num > 10) || f_time > 4)
						break;
					// make sure the timing is stable, don't take too long at the same time
					// (only the time of the unpacked variant is considered here)
				}
				//-- n_pass_num; // the first pass did not count
				// run the thing

				f_time /= n_pass_num;
				f_time_pack /= n_pass_num;
				f_time_pphf /= n_pass_num;
				f_time_pphfno /= n_pass_num;
				size_t n_data = 3 * scan_data.size() * sizeof(uint32_t); // read data, write tile carry and compressed head flags, read data, write scans
				double f_GBps = n_data / f_time * 1e-9; // mGPU also uses 1e-9 rather than 1024^3
				if(m_p_tile_segscan->b_Strided_HeadFlags()) {
					printf("on " PRIsizeB "B, it took %f msec, reaching %f*1e9 B/s (%.4f w/hf, %.4f ilv, %.4f nat, %.4f pck)\n",
						PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps, f_GBps * 4 / 3, // also read head flags (as big as the data if unpacked)
						n_data / f_time_pphf * 1e-9, n_data / f_time_pphfno * 1e-9, n_data / f_time_pack * 1e-9 * 1 / 3); // only read head flags
				} else {
					printf("on " PRIsizeB "B, it took %f msec, reaching %f*1e9 B/s (%.4f w/hf, %.4f nat, %.4f pck)\n",
						PRIsizeBparams(scan_data.size() * sizeof(uint32_t)), f_time * 1000, f_GBps, f_GBps * 4 / 3, // also read head flags (as big as the data if unpacked)
						n_data / f_time_pphf * 1e-9, n_data / f_time_pack * 1e-9 * 1 / 3); // only read head flags
				}
				// print results

				std::vector<uint32_t> seg_scan_cpu(n); // could work inplace but we still need a buffer for getting the GPU result(s)
				CTimer tcpu;
				seg_debug::CReference::Segmented_Scan(seg_scan_cpu, scan_data, hf.head_flags);
				printf("global segmented scan takes %f msec on CPU\n", tcpu.f_Time() * 1000);
				// compute the reference segmented scan on the CPU (the goal)

				bool b_test_correct;
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct, seg_scan_cpu.begin(),
				   seg_scan_cpu.end(), dp_scan, 0, "global segmented scan") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_results_correct = false; // could not verify, must not report success below
					break;
				}
				bool b_test_correct_pphf;
				if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct_pphf, seg_scan_cpu.begin(),
				   seg_scan_cpu.end(), dp_scan_pphf, 0, "global segmented scan pphf") != cl_Success) {
					fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
					b_results_correct = false; // could not verify, must not report success below
					break;
				}
				bool b_test_correct_pphfno = true;
				if(m_p_tile_segscan->b_Strided_HeadFlags()) {
					if(cmd_queue.n_CompareBuffer_DebugVerbose(b_test_correct_pphfno, seg_scan_cpu.begin(),
					   seg_scan_cpu.end(), dp_scan_pphfno, 0, "global segmented scan pphfno") != cl_Success) {
						fprintf(stderr, "error: cmd_queue.n_CompareBuffer_DebugVerbose() failed to compare the results\n");
						b_results_correct = false; // could not verify, must not report success below
						break;
					}
				}
				// reusable function, uses clnqueueMapBuffer()

				if(b_test_correct && b_test_correct_pphf && b_test_correct_pphfno)
					printf("done. global segmented scan of %d items succeeded\n", int(n)); // n is size_t, must be converted to match %d (the sizes tested fit in int)
				else {
					b_results_correct = false;
					break;
				}
				// make sure it is scanned correctly
			}
		}
		if(b_results_correct)
			printf("all tests finished correctly\n");
		else
			fprintf(stderr, "error: there were some errors\n");
		// summary; also reports an error if a comparison could not be carried out

		return true; // note that wrong results do not change the return value, only device errors do
	}
};

// Autotune table for the segmented scan kernels (the declaration lists the slots as
// 680, 780, k40 and generic; all four entries currently hold the same generic tuning).
// Each entry consists of:
//	- device name and data type size
//	- three configuration threshold / tuning index pairs
//	- number of used tunings, followed by the list of tunings
//	  (each tuning is workgroup size, tile size, strided head flags)
const CCLSegScanReduce::TAutotuneInfo CCLSegScanReduce::m_p_segscan_autotune_info[4] = {
	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false)),

	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false)),

	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false)),

	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false))
};

// Autotune table for the segmented reduce kernels; all four entries currently
// hold the same generic tuning.
// Each entry consists of:
//	- device name and data type size
//	- three configuration threshold / tuning index pairs
//	- number of used tunings, followed by the list of tunings
//	  (each tuning is workgroup size, tile size, strided head flags)
const CCLSegScanReduce::TAutotuneInfo CCLSegScanReduce::m_p_segreduce_autotune_info[4] = {
	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false)),

	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false)),

	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false)),

	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false))
};

// Autotune table for the segmented reduce spine adjust kernel; the first entry
// is a device-specific tuning ("nv_3_5_tes"), the remaining three are generic.
// Each entry consists of:
//	- device name and data type size
//	- three configuration threshold / tuning index pairs
//	- number of used tunings, followed by the list of tunings
//	  (each tuning is workgroup size, tile size, strided head flags)
const CCLSegScanReduce::TAutotuneInfo CCLSegScanReduce::m_p_segreduce_spineadf_autotune_info[4] = {
	TAutotuneInfo(
		"nv_3_5_tes", 4, // device and data type size
		512, 1, 4096, 2, SIZE_MAX, 0, // configuration threshold / tuning index pairs
		3, // number of used tunings, the list of tunings follows
		TSegKernelTuning(128, 384, false),
		TSegKernelTuning(512, 512, true),
		TSegKernelTuning(256, 1024, true)),

	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false)),

	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false)),

	TAutotuneInfo(
		"generic", 4,
		SIZE_MAX, 0, SIZE_MAX, 0, SIZE_MAX, 0,
		1,
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false),
		TSegKernelTuning(256, 1024, false))
};

#endif // !__OPENCL_GLOBAL_SEGMENTED_SCAN_REDUCTION_INCLUDED
