/*
								+--------------------------------+
								|                                |
								| *** Tiled scan/reduce a.t. *** |
								|                                |
								|  Copyright  -tHE SWINe- 2016  |
								|                                |
								|      TiledScanReduce_AT.h      |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __OPENCL_TILED_SCAN_REDUCTION_AUTOTUNING_INCLUDED
#define __OPENCL_TILED_SCAN_REDUCTION_AUTOTUNING_INCLUDED

/**
 *	@file gpgpu/TiledScanReduce_AT.h
 *	@date 2016
 *	@author -tHE SWINe-
 *	@brief OpenCL tiled scan and reduce autotune wrapper
 */

/*
// use e.g. this:

int main(int n_arg_num, const char **p_arg_list)
{
	CCLUniqueInstance opencl(CL_DEVICE_TYPE_GPU);
	// init OpenCL

	const char *p_s_report_file = "cum_report_in-scan_transposed_cput_rich.html";
	const char *p_s_results_file = "autotune_in-scan_transposed_cput_rich.txt";

	int n_verbosity = 1, n = 10;
	{
		CTiled_ReduceScan_AutoTune exscan_kernel(0);
		for(int i = 0; i < n; ++ i) { // loop contains CCLSimpleAutotuneDriver to reparse the results so far
			CCLSimpleAutotuneDriver autotune(&exscan_kernel, p_s_results_file,
				opencl.h_Device(0), false, n, true, n_verbosity); // allow repeated
			autotune.Run(opencl);
		}
	}
	// autotune reduce / downsweep scan

	CAutotuneCuts_ReportGenerator().Report(p_s_report_file, p_s_results_file,
		2, 3, CAutotuneCuts_ReportGenerator::result_Median);
#if defined(_WIN32) || defined(_WIN64)
	ShellExecute(0, "open", p_s_report_file, 0, 0, SW_SHOW); // open in the default browser
#endif // _WIN32 || _WIN64

	return 0;
}
*/

#include "TiledScanReduce.h"
#include "Autotune.h"

class CTiled_ReduceScan_AutoTune : public CCLAutotuneInterface,
	public CAutotuneModel_DataSize_WorkGroupSize_TileSize {
protected:
	std::vector<uint32_t> m_data;
	std::vector<uint32_t> m_spine_scan;
	std::vector<uint32_t> m_scan_ground_truth;
	std::vector<uint32_t> m_reduce_ground_truth;
	CCLTiled_ReduceScan m_scan;
	CCLUniqueMem m_dp_scan, m_dp_tile_reduce, m_dp_scanned_spine;
	size_t m_n_scan_buffer_elems;
	size_t m_n_reduce_buffer_elems;
	CTimer m_timer;
	int m_n_bench_type;

public:
	CTiled_ReduceScan_AutoTune(int n_bench_type = 0) // in, ex, reduce
		:m_n_bench_type(n_bench_type), m_n_scan_buffer_elems(0), m_n_reduce_buffer_elems(0)
	{
		_ASSERTE(n_bench_type >= 0 && n_bench_type < 3);
	}

	virtual bool Prepare_Benchmark(cl_context h_context, cl_device_id h_device,
		cl_command_queue h_cmd_queue, const std::vector<size_t> &r_tuning)
	{
		_ASSERTE(r_tuning.size() == 2);

		if(m_scan.n_Tile_Size() != r_tuning[1]) {
			m_scan_ground_truth.clear();
			m_reduce_ground_truth.clear();
		}
		// if tile size changed then need to redo the ground truth

		m_scan = CCLTiled_ReduceScan(r_tuning[0], r_tuning[1]);
		if(!m_scan.Compile(h_context, h_device, true, true))
			return false;

		// ground truth computed lazily

		return true;
	}

	virtual bool Run_Benchmark(cl_context h_context,
		cl_command_queue h_cmd_queue, double &r_f_time, bool b_verify)
	{
		CCLContextWrapper context(h_context);
		CCLCommandQueueWrapper cmd_queue(h_cmd_queue);
		// thin wrappers, do not delete the handles

		const size_t n = m_data.size(), n_tile_size = m_scan.n_Tile_Size();
		const size_t n_tile_num = (n + n_tile_size - 1) / n_tile_size;

		if(m_n_bench_type < 2 && m_spine_scan.size() != n_tile_num) {
			m_spine_scan.clear();
			m_spine_scan.reserve(n_tile_num);
			size_t n_tile_size = m_scan.n_Tile_Size();
			size_t n_tile_num = (n + n_tile_size - 1) / n_tile_size;
			for(size_t i = 0; i < n;) {
				uint32_t n_reduce = 0;
				for(size_t e = std::min(i + n_tile_size, n); i < e; ++ i)
					n_reduce += m_data[i];
				m_spine_scan.push_back(n_reduce);
			}
			stl_ut::ExclusiveScan(m_spine_scan.begin(), m_spine_scan.end());
			// generate spine scan
		}

		size_t n_scan_buffer_elems = n;
		if(n_scan_buffer_elems != m_n_scan_buffer_elems) {
			if(!(m_dp_scan = context.h_CreateBuffer(sizeof(uint32_t) * n_scan_buffer_elems)))
				return false;
			m_n_scan_buffer_elems = n_scan_buffer_elems;
		}
		size_t n_reduce_buffer_elems = n_tile_num;
		if(n_reduce_buffer_elems != m_n_reduce_buffer_elems) {
			if(m_n_bench_type == 2 || (m_n_bench_type < 2 && n_tile_num > 1)) {
				if(!(m_dp_tile_reduce = context.h_CreateBuffer(sizeof(uint32_t) * n_reduce_buffer_elems)))
					return false;
			}
			if(m_n_bench_type < 2) {
				if(!(m_dp_scanned_spine = context.h_CreateBuffer(sizeof(uint32_t) * n_reduce_buffer_elems)))
					return false;
			}
			m_n_reduce_buffer_elems = n_reduce_buffer_elems;
		}
		// (re)allocate the buffers as needed

		_ASSERTE(m_data.size() == n_scan_buffer_elems);
		CLresult n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_scan, 0,
			&m_data[0], n_scan_buffer_elems * sizeof(uint32_t));
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if(m_n_bench_type == 2 || (m_n_bench_type < 2 && n_tile_num > 1)) {
			_ASSERTE(m_data.size() >= n_tile_num);
			n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_tile_reduce, 0, &m_data[0],
				n_tile_num * sizeof(uint32_t)); // put something different in here as well
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
		}
		if(m_n_bench_type < 2 && n_tile_num > 1) {
			_ASSERTE(m_spine_scan.size() == n_tile_num);
			n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_scanned_spine, 0, &m_spine_scan[0],
				n_tile_num * sizeof(uint32_t)); // put something different in here as well
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
		}
		n_result = cmd_queue.n_Finish();
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		// wait

		CCLUniqueEvent ev, ev2; // for profiling

		r_f_time = m_timer.f_Time();

		CLresult n_result2 = cl_Success;
		if(n_tile_num > 1) {
			if(m_n_bench_type == 0) {
				n_result = m_scan.Enqueue_TileReduce(h_cmd_queue,
					m_dp_tile_reduce, n_tile_num, m_dp_scan, n_scan_buffer_elems).GetEvent(ev);
				n_result2 = m_scan.Enqueue_TileInScan_Downsweep(h_cmd_queue, m_dp_scan, m_dp_scanned_spine,
					m_dp_scan, n_scan_buffer_elems).WithEvents(1, &ev, ev2); // explicitly serialize, wouldn't wait otherwise
			} else if(m_n_bench_type == 1) {
				n_result = m_scan.Enqueue_TileReduce(h_cmd_queue,
					m_dp_tile_reduce, n_tile_num, m_dp_scan, n_scan_buffer_elems).GetEvent(ev);
				n_result2 = m_scan.Enqueue_TileExScan_Downsweep(h_cmd_queue, m_dp_scan, m_dp_scanned_spine,
					m_dp_scan, n_scan_buffer_elems).WithEvents(1, &ev, ev2); // explicitly serialize, wouldn't wait otherwise
			} else /*if(m_n_bench_type == 2)*/
				n_result = m_scan.Enqueue_TileReduce(h_cmd_queue, m_dp_tile_reduce, n_tile_num, m_dp_scan, n_scan_buffer_elems).GetEvent(ev);
		} else {
			if(m_n_bench_type == 0)
				n_result = m_scan.Enqueue_TileInScan(h_cmd_queue, m_dp_scan, m_dp_scan, n_scan_buffer_elems).GetEvent(ev);
			else if(m_n_bench_type == 1)
				n_result = m_scan.Enqueue_TileExScan(h_cmd_queue, m_dp_scan, m_dp_scan, n_scan_buffer_elems).GetEvent(ev);
			else /*if(m_n_bench_type == 2)*/
				n_result = m_scan.Enqueue_TileReduce(h_cmd_queue, m_dp_tile_reduce, n_tile_num, m_dp_scan, n_scan_buffer_elems).GetEvent(ev);
		}
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: scan.n_Enqueue_Tile%s() failed with: %d (%s, %d)\n",
				(m_n_bench_type == 0)? "InScan" : ((m_n_bench_type == 1)? "ExScan" : "Reduce"),
				n_result, __FILE__, __LINE__);
			return false;
		}
		if(n_result2 != CL_SUCCESS) {
			fprintf(stderr, "error: scan.Enqueue_TileExScan_Downsweep() failed with: %d (%s, %d)\n",
				n_result2, __FILE__, __LINE__);
			return false;
		}
		// run

		n_result = cmd_queue.n_Finish();
		r_f_time = m_timer.f_Time() - r_f_time;
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}

		if((n_result = ev.n_Wait()) != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if(m_n_bench_type < 2 && n_tile_num > 1 && (n_result2 = ev2.n_Wait()) != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result2, __FILE__, __LINE__);
			return false;
		}
		// wait

		/*if(ev.n_GetProfilingCounter_Difference(r_f_time, CL_PROFILING_COMMAND_START,
		   CL_PROFILING_COMMAND_END) != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if(m_n_bench_type < 2 && n_tile_num > 1) {
			double f_more_time;
			if(ev2.n_GetProfilingCounter_Difference(f_more_time, CL_PROFILING_COMMAND_START,
			   CL_PROFILING_COMMAND_END) != CL_SUCCESS) {
				fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
			r_f_time += f_more_time;
		}*/

		if(b_verify && m_n_bench_type != 3) {
			if(m_scan_ground_truth.empty()) {
				m_scan_ground_truth.resize(n);
					uint32_t n_partial = 0; // global!
				for(size_t i = 0; i < n; ++ i) { // global!
					/*for(size_t e = std::min(n, i + n_tile_size); i < e; ++ i)*/ { // global!
						if(m_n_bench_type == 1)
							m_scan_ground_truth[i] = n_partial; // exclusive
						n_partial += m_data[i];
						if(m_n_bench_type == 0)
							m_scan_ground_truth[i] = n_partial; // inclusive
					}
				}
				m_reduce_ground_truth.resize(n_tile_num);
				for(size_t i = 0, b = 0; i < n; ++ b) {
					uint32_t n_partial = 0;
					for(size_t e = std::min(n, i + n_tile_size); i < e; ++ i)
						n_partial += m_data[i];
					m_reduce_ground_truth[b] = n_partial;
				}
			}
			// compute ground truth on CPU

			if(m_n_bench_type < 2) {
				bool b_verify_result;
				n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_scan_ground_truth.begin(),
					m_scan_ground_truth.end(), m_dp_scan, 0, (!m_n_bench_type)? "in-scan" : "ex-scan");
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
						n_result, __FILE__, __LINE__);
					return false;
				}
				if(!b_verify_result) {
					fprintf(stderr, "error: %s-scan results are incorrect\n", (!m_n_bench_type)? "in" : "ex");
					return false;
				}
			}
			if(m_n_bench_type == 2 || n_tile_num > 1) {
				bool b_verify_result;
				n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_reduce_ground_truth.begin(),
					m_reduce_ground_truth.end(), m_dp_tile_reduce, 0, "reduce");
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
						n_result, __FILE__, __LINE__);
					return false;
				}
				if(!b_verify_result) {
					fprintf(stderr, "error: reduce results are incorrect\n");
					return false;
				}
			}
		}
		// verify

		return true;
	}

	virtual bool Prepare_Input(const std::vector<size_t> &r_input)
	{
		_ASSERTE(r_input.size() == 1);
		const size_t n = r_input.front();
		// read config

		m_data.resize(n);
		m_scan_ground_truth.clear();
		m_reduce_ground_truth.clear();
		m_spine_scan.clear();
		// alloc

		for(size_t i = 0; i < n; ++ i)
			m_data[i] = (uint32_t)(i + 1);
		std::random_shuffle(m_data.begin(), m_data.end());
		// generate some data

		return true;
	}

	virtual const char *p_s_Benchmark_Id() const
	{
		return (m_n_bench_type == 0)? "tiled_in-scan" :
			((m_n_bench_type == 1)? "tiled_ex-scan" :
			((m_n_bench_type == 2)? "tiled_reduce" : "(null)"));
	}

	virtual const size_t n_InputSize_Lookup(size_t n_index) const
	{
		const size_t p_size[] = {10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
			1000 * 10, 1000 * 20, 1000 * 50, 1000 * 100, 1000 * 200, 1000 * 500,
			1000000, 1000000 * 2, 1000000 * 5, 1000000 * 10, 1000000 * 20};
		if(sizeof(p_size) / sizeof(p_size[0]) <= n_index)
			return 0;
		return p_size[n_index];
	}

	virtual const size_t n_WorkGroupSize_Lookup(size_t n_index) const
	{
		const size_t p_size[] = {128, 192, 256, 512, 1024};
		if(sizeof(p_size) / sizeof(p_size[0]) <= n_index)
			return 0;
		return p_size[n_index];
	}

	virtual const size_t n_ItemsPerThread_Lookup(size_t n_index) const
	{
		const size_t p_size[] = {1, 2, 3, 4, 5, 6, 7, 8/*, 16, 32*/};
		if(sizeof(p_size) / sizeof(p_size[0]) <= n_index)
			return 0;
		return p_size[n_index];
	}
};

class CTiled_SegmentedScan_AutoTune : public CCLAutotuneInterface,
	public CAutotuneModel_DataSize_WorkGroupSize_TileSize {
protected:
	std::vector<uint32_t> m_data;
	std::vector<uint32_t> *m_p_head_flags;
	std::vector<size_t> m_packed_head_flags_natural_cpu;
	std::vector<uint32_t> m_tile_tail_num_cpu;
	std::vector<uint32_t> m_tile_flags_cpu;
	std::vector<uint32_t> m_tile_carry_cpu;
	std::vector<uint32_t> m_scanned_tile_carry_cpu;
	std::vector<uint32_t> m_seg_scan_cpu;

	std::vector<uint32_t> m_super_head_flags;
	std::vector<uint32_t> m_super_segreduce_cpu;
	std::vector<uint32_t> m_segscanned_tile_carry_cpu;
	std::vector<uint32_t> m_super_tile_tcounts_exscan_cpu;
	std::vector<uint32_t> m_super_packed_head_flags_cpu;

	seg_debug::THeadFlag_DebugInfo m_seginfo; // keep allocated

	CCLTiled_SegmentedReduceScan m_scan;
	CCLUniqueMem m_dp_scan, m_dp_head_flags, m_dp_packed_head_flags,
		m_dp_tile_flags, m_dp_tile_carry, m_dp_scanned_tile_carry,

		m_dp_super_segreduce, // bigger than m_dp_scan, contains fictitious segmented reduce
		m_dp_segscanned_tile_carry, // the same as m_dp_scan but already contains the segmented scan as if we returned from the recursed segscan
		m_dp_super_tile_tcounts_exscan, // tail counts
		m_dp_super_packed_head_flags; // packed head flags

	size_t m_n_scan_buffer_elems;
	size_t m_n_tile_buffer_elems;
	size_t m_n_hf_buffer_elems;
	size_t m_n_reduction_buffer_elems;
	CTimer m_timer;
	int m_n_bench_type;

public:
	/**
	 *	@brief default constructor; selects the benchmark variant
	 *	@param[in] n_bench_type is benchmark type, in the 0 to 3 range
	 *		(Run_Benchmark() labels 0 as "decode", 1 as "packed interleaved"
	 *		and 2 as "packed natural"; 3 runs the segmented reduce spine adjust kernels)
	 */
	CTiled_SegmentedScan_AutoTune(int n_bench_type = 0) // only in scan so far
		:m_n_scan_buffer_elems(0), m_n_tile_buffer_elems(0), m_n_hf_buffer_elems(0),
		m_n_reduction_buffer_elems(0), m_n_bench_type(n_bench_type) // listed in the declaration order
	{
		_ASSERTE(n_bench_type >= 0 && n_bench_type < 4); // decode, pre-packed, pre-packed natural

		m_p_head_flags = &m_seginfo.head_flags;
		// set in the body rather than in the initializer list; m_seginfo is declared
		// after m_p_head_flags and is therefore only constructed after it
	}

	/**
	 *	@brief compiles the segmented scan kernels for the given tuning
	 *
	 *	@param[in] h_context is OpenCL context
	 *	@param[in] h_device is OpenCL device to compile for
	 *	@param[in] h_cmd_queue is command queue (unused here)
	 *	@param[in] r_tuning is tuning vector with three elements; the first two are
	 *		passed to the CCLTiled_SegmentedReduceScan constructor as they are, the
	 *		third one is passed as a boolean flag (nonzero means true)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	virtual bool Prepare_Benchmark(cl_context h_context, cl_device_id h_device,
		cl_command_queue h_cmd_queue, const std::vector<size_t> &r_tuning)
	{
		_ASSERTE(r_tuning.size() == 3);

		const bool b_third_param = r_tuning[2] != 0;
		m_scan = CCLTiled_SegmentedReduceScan(r_tuning[0], r_tuning[1], b_third_param);
		// rebuild the scan object with the new tuning

		if(m_scan.Compile(h_context, h_device, true, true))
			return true; // ground truth computed always (in Run_Benchmark())

		return false;
	}

	/**
	 *	@brief initializes the tuning vector to the first configuration
	 *
	 *	@param[out] r_tuning is filled with three elements: the first workgroup size,
	 *		the first workgroup size times the first items per thread value, and zero
	 *	@param[out] r_hidden is filled with three zero lookup indices
	 *
	 *	@note This may throw std::bad_alloc.
	 */
	virtual void Init_TuningVector(std::vector<size_t> &r_tuning, std::vector<size_t> &r_hidden) const // throw(std::bad_alloc)
	{
		r_tuning.resize(3);
		r_hidden.assign(3, size_t(0));
		// all the lookup indices start at zero

		const size_t n_first_workgroup_size = n_WorkGroupSize_Lookup(r_hidden[0]);
		const size_t n_first_items_per_thread = n_ItemsPerThread_Lookup(r_hidden[1]);

		r_tuning[0] = n_first_workgroup_size;
		r_tuning[1] = n_first_workgroup_size * n_first_items_per_thread;
		r_tuning[2] = 0;
	}

	/**
	 *	@brief advances the tuning vector to the next configuration
	 *
	 *	The workgroup size index is incremented first. Once the workgroup size
	 *	lookup returns zero, the index wraps around and the items per thread index
	 *	is incremented. Once that one wraps around as well, the third tuning value
	 *	is switched from 0 to 1 (except for benchmark type 2, which ends there).
	 *
	 *	@param[in,out] r_tuning is tuning vector (three elements)
	 *	@param[in,out] r_hidden is vector of lookup indices (three elements;
	 *		only the first two are used here)
	 *
	 *	@return Returns true if r_tuning holds the next configuration,
	 *		false if all the configurations were already visited.
	 */
	virtual bool Next_TuningVector(std::vector<size_t> &r_tuning, std::vector<size_t> &r_hidden) const
	{
		_ASSERTE(r_tuning.size() == 3 && r_hidden.size() == 3);

		size_t &r_n_wg_index = r_hidden[0];
		size_t &r_n_ipt_index = r_hidden[1];

		r_tuning[0] = n_WorkGroupSize_Lookup(++ r_n_wg_index);
		if(r_tuning[0]) {
			r_tuning[1] = r_tuning[0] * n_ItemsPerThread_Lookup(r_n_ipt_index); // update this as well
			return true;
		}
		// the common case: there is another workgroup size

		r_n_wg_index = 0;
		r_tuning[0] = n_WorkGroupSize_Lookup(r_n_wg_index); // reset
		r_tuning[1] = r_tuning[0] * n_ItemsPerThread_Lookup(++ r_n_ipt_index); // carry
		if(r_tuning[1])
			return true;
		// ran out of workgroup sizes, try the next items per thread value

		r_n_ipt_index = 0;
		r_tuning[1] = r_tuning[0] * n_ItemsPerThread_Lookup(r_n_ipt_index); // reset
		// ran out of items per thread values as well

		if(m_n_bench_type == 2)
			return false; // in case we're doing the natural order benchmark, no need to try the interleaved variants
		if(r_tuning[2])
			return false; // are there more sizes?
		r_tuning[2] = 1;

		return true;
	}

	virtual bool Run_Benchmark(cl_context h_context,
		cl_command_queue h_cmd_queue, double &r_f_time, bool b_verify)
	{
		CCLContextWrapper context(h_context);
		CCLCommandQueueWrapper cmd_queue(h_cmd_queue);
		// thin wrappers, do not delete the handles

		const size_t n = m_data.size(), n_tile_size = m_scan.n_Tile_Size();
		const size_t n_tile_num = (n + n_tile_size - 1) / n_tile_size;

		size_t n_scan_buffer_elems = n;
		size_t n_packed_hf_buffer_bytes = m_scan.n_PackedHeadFlags_Size(n);
		if(n_scan_buffer_elems > m_n_scan_buffer_elems) { // only alloc when needed
			if(!(m_dp_scan = context.h_CreateBuffer(sizeof(uint32_t) * n_scan_buffer_elems)) ||
			   !(m_dp_head_flags = context.h_CreateBuffer(sizeof(uint32_t) * n_scan_buffer_elems)))
				return false;
			if(m_n_bench_type == 3) {
				if(!(m_dp_super_tile_tcounts_exscan = context.h_CreateBuffer(n_scan_buffer_elems * sizeof(uint32_t))) || // tail counts
				   !(m_dp_super_packed_head_flags = context.h_CreateBuffer(n_scan_buffer_elems * sizeof(uint32_t)))) // packed head flags
					return false;
			}
			m_n_scan_buffer_elems = n_scan_buffer_elems;
		}
		size_t n_tile_buffer_elems = n_tile_num;
		if(n_tile_buffer_elems > m_n_tile_buffer_elems) { // only alloc when needed
			if(!(m_dp_tile_flags = context.h_CreateBuffer(sizeof(uint32_t) * n_tile_buffer_elems)) ||
			   !(m_dp_tile_carry = context.h_CreateBuffer(sizeof(uint32_t) * n_tile_buffer_elems)) ||
			   !(m_dp_scanned_tile_carry = context.h_CreateBuffer(sizeof(uint32_t) * n_tile_buffer_elems)))
				return false;
			if(m_n_bench_type == 3) {
				if(!(m_dp_segscanned_tile_carry = context.h_CreateBuffer(n_tile_buffer_elems * sizeof(uint32_t))))
					return false;
			}
			m_n_tile_buffer_elems = n_tile_buffer_elems;
		}
		size_t n_hf_buffer_elems = n_packed_hf_buffer_bytes / sizeof(uint32_t);
		if(n_hf_buffer_elems > m_n_hf_buffer_elems) { // only alloc when needed
			if(!(m_dp_packed_head_flags = context.h_CreateBuffer(n_packed_hf_buffer_bytes)))
				return false;
			m_n_hf_buffer_elems = n_hf_buffer_elems;
		}
		// (re)allocate the buffers as needed

		/*seg_debug::CSegmentedOp_UniformSize_Benchmark segment_pattern_generator;
		const size_t n_pass_num = segment_pattern_generator.n_Benchmark_Num(n_scan_buffer_elems);
		for(size_t n_pass = 0; n_pass < n_pass_num; ++ n_pass) {*/

		seg_debug::CSegmentedOp_Random_Benchmark segment_pattern_generator;

		const size_t n_super_tile_size = 32;

		size_t n_reduction_num = 0;
		if(b_verify) { // must be set the first time around
			size_t n_avg_seg_size = 500;
			size_t n_seg_num = min(n_scan_buffer_elems, n_scan_buffer_elems / n_avg_seg_size + 3);
			segment_pattern_generator.n_Generate_HeadFlags(m_seginfo, n_scan_buffer_elems, n_seg_num - 1);
			// fill m_p_head_flags

			seg_debug::CReference::Segmented_TileScan(m_seg_scan_cpu, m_tile_tail_num_cpu,
				m_tile_carry_cpu, m_tile_flags_cpu, m_data, *m_p_head_flags,
				m_scan.n_Tile_Size(), m_scan.n_WorkGroup_Size());
			/*std::vector<uint32_t> tile_seg_scan_cpu, dummy_tile_tail_num_cpu,
				dummy_tile_carry_cpu, dummy_tile_flags_cpu;
			seg_debug::CReference::Segmented_TileScan(tile_seg_scan_cpu, dummy_tile_tail_num_cpu,
				dummy_tile_carry_cpu, dummy_tile_flags_cpu, m_tile_carry_cpu, m_tile_flags_cpu,
				m_scan.n_Tile_Size(), m_scan.n_WorkGroup_Size());*/
			seg_debug::CReference::Segmented_Scan/*Segmented_SpineScan*/(m_scanned_tile_carry_cpu,
				m_tile_carry_cpu, m_tile_flags_cpu); // the data is read skewed on the GPU, don't insert the 0 at the beginning
			// partials ground truth

			seg_debug::CReference::Segmented_Scan(m_seg_scan_cpu, m_data, *m_p_head_flags);
			// the result of the scan

			if(m_n_bench_type == 3) {
				m_super_head_flags.assign(n_super_tile_size * n_scan_buffer_elems, uint32_t(0));
				m_super_tile_tcounts_exscan_cpu.assign(n_scan_buffer_elems, uint32_t(0));

				size_t n_reduction_size = 1;
				for(size_t i = 0; i < n_scan_buffer_elems; ++ i) {
					if((*m_p_head_flags)[i]) { // there is a head flag, the super array must have had some nnzs there
						size_t n_head_num = 1 + rand() % 5;
						std::vector<size_t> heads;
						CUniqueRandomPermutation::Generate(heads, n_head_num, n_super_tile_size, CCLibGenerator<false>());
						for(size_t j = 0; j < n_head_num; ++ j) {
							m_super_head_flags[i * n_super_tile_size + heads[j]] = 1; // raise a head flag
							if(i * n_super_tile_size + heads[j]) // the first one is implied
								++ n_reduction_size;
						}
					}
				}
				// pseudorandomly generate super head flags

#ifdef _DEBUG
				{
					std::vector<uint32_t> dummy_tile_flags;
					{
						std::vector<uint32_t> dummy_scan(n_scan_buffer_elems * n_super_tile_size, uint32_t(0));
						std::vector<uint32_t> dummy_tile_tails;
						std::vector<uint32_t> dummy_tile_carry;
						seg_debug::CReference::Segmented_TileScan(dummy_scan, dummy_tile_tails,
							dummy_tile_carry, dummy_tile_flags, dummy_scan, m_super_head_flags,
							n_super_tile_size, n_super_tile_size);
					}
					_ASSERTE(dummy_tile_flags.size() == (*m_p_head_flags).size());
					_ASSERTE(dummy_tile_flags.size() == n_scan_buffer_elems);
					for(size_t i = 0; i < n_scan_buffer_elems; ++ i)
						_ASSERTE(!(*m_p_head_flags)[i] == !dummy_tile_flags[i]);
				}
				// make sure that these head flags would reduce to the same tile flags
#endif // _DEBUG

				seg_debug::CReference::Pack_HeadFlags(m_super_packed_head_flags_cpu, m_super_head_flags,
					n_super_tile_size, n_super_tile_size, false);
				// pack these head flags

				std::vector<size_t> first_tile_tail_positions;
				std::vector<size_t> first_tile_tail_positions_reduction_array;
				n_reduction_num = 0;
				for(size_t i = 0; i < n_super_tile_size * n_scan_buffer_elems; ++ i) {
					if(i && m_super_head_flags[i]) {
						++ n_reduction_num;
						if(++ m_super_tile_tcounts_exscan_cpu[(i - 1) / n_super_tile_size] == 1) {
							first_tile_tail_positions_reduction_array.push_back(n_reduction_num - 1); // minus the increment just above
							first_tile_tail_positions.push_back(i - 1);
						}
					}
				}
				{
					size_t i = n_super_tile_size * n_scan_buffer_elems;
					if(i /*&& m_super_head_flags[i]*/) { // implied
						++ n_reduction_num;
						if(++ m_super_tile_tcounts_exscan_cpu[(i - 1) / n_super_tile_size] == 1) {
							first_tile_tail_positions_reduction_array.push_back(n_reduction_num - 1); // minus the increment just above
							first_tile_tail_positions.push_back(i - 1);
						}
					}
				}
				stl_ut::ExclusiveScan(m_super_tile_tcounts_exscan_cpu.begin(),
					m_super_tile_tcounts_exscan_cpu.end());
				// calculate the number of super reductions and tile tail counts scan

				seg_debug::CReference::Segmented_Scan(m_segscanned_tile_carry_cpu, m_tile_carry_cpu, m_tile_flags_cpu);
				// need global segmented scan of the tile carry as well

				/*std::vector<uint32_t> scanned_carry_cpu;
				seg_debug::CReference::Segmented_Scan(scanned_carry_cpu, m_tile_carry_cpu, m_tile_flags_cpu);
				// the result of the scan*/ // nope, this is working on one higher level (super-level)

				m_super_segreduce_cpu.assign(n_reduction_num, uint32_t(0)); // just zero (could also assign other value and then make sure it was added to rather than replaced)
				for(size_t i = 0, n = first_tile_tail_positions.size(); i < n; ++ i) {
					size_t t = first_tile_tail_positions[i];
					size_t tt = (t / n_super_tile_size) * n_super_tile_size; // align down
					m_super_segreduce_cpu[first_tile_tail_positions_reduction_array[i]] =
						(tt != m_super_head_flags.size() && m_super_head_flags[tt] != 0)? 0 : // if there is a head flag on the first element, need to write an explicit zero!
						(t / n_super_tile_size)? m_seg_scan_cpu[t / n_super_tile_size - 1] : 0; // i guess
				}
				// generate ground truth
			}
			// prepare the super reduction data for benchmark type 3
		}
		// calculate ground truth

		_ASSERTE(m_data.size() == n_scan_buffer_elems);
		CLresult n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_scan, 0,
			&m_data[0], n_scan_buffer_elems * sizeof(uint32_t));
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(m_p_head_flags->size() == n_scan_buffer_elems);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_head_flags, 0, &((*m_p_head_flags)[0]),
			n_scan_buffer_elems * sizeof(uint32_t)); // put something different in here as well
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(m_data.size() >= n_tile_buffer_elems);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_tile_flags, 0, &m_data[0],
			n_tile_buffer_elems * sizeof(uint32_t)); // put something different in here as well
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(m_data.size() >= n_tile_buffer_elems);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_tile_carry, 0, &m_data[0],
			n_tile_buffer_elems * sizeof(uint32_t)); // put something different in here as well
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(((b_verify)? m_scanned_tile_carry_cpu : m_data).size() >= n_tile_buffer_elems);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_scanned_tile_carry, 0,
			(b_verify)? &m_scanned_tile_carry_cpu[0] : &m_data[0], // use the precomputed correct carry (from the higher recursions which use different tuning, don't want to calculate it here)
			n_tile_buffer_elems * sizeof(uint32_t)); // put something different in here as well
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if(m_n_bench_type == 2) { // natural head flags
			_ASSERTE(!(n_packed_hf_buffer_bytes % sizeof(uint32_t)));
			m_packed_head_flags_natural_cpu.assign(n_packed_hf_buffer_bytes / sizeof(uint32_t), 0);
			for(size_t i = 0, m = m_p_head_flags->size(); i < m; ++ i)
				m_packed_head_flags_natural_cpu[i / 32] |= ((*m_p_head_flags)[i]) << (i & 31);
			_ASSERTE(m_packed_head_flags_natural_cpu.size() * sizeof(uint32_t) == n_packed_hf_buffer_bytes);
			n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_packed_head_flags, 0, &m_packed_head_flags_natural_cpu[0],
				n_packed_hf_buffer_bytes); // put something different in here as well
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
		} else {
			// don't really need to clear the packed head flags, the contents change randomly anyways
		}
		if(m_n_bench_type == 3 && m_n_reduction_buffer_elems < n_reduction_num) {
			if(!(m_dp_super_segreduce = context.h_CreateBuffer(n_reduction_num * sizeof(uint32_t))))
				return false;
			m_n_reduction_buffer_elems = n_reduction_num;
		}
		if(m_n_bench_type == 3) {
			std::vector<uint32_t> zero(m_n_reduction_buffer_elems, uint32_t(0));
			n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_super_segreduce, 0, &zero[0],
				m_n_reduction_buffer_elems * sizeof(uint32_t));
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
			_ASSERTE(m_segscanned_tile_carry_cpu.size() == n_tile_buffer_elems);
			n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_segscanned_tile_carry, 0,
				&m_segscanned_tile_carry_cpu[0], n_tile_buffer_elems * sizeof(uint32_t));
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
			_ASSERTE(m_super_tile_tcounts_exscan_cpu.size() == n_scan_buffer_elems);
			n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_super_tile_tcounts_exscan, 0,
				&m_super_tile_tcounts_exscan_cpu[0], n_scan_buffer_elems * sizeof(uint32_t));
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
			_ASSERTE(m_super_packed_head_flags_cpu.size() == n_scan_buffer_elems);
			n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_super_packed_head_flags, 0,
				&m_super_packed_head_flags_cpu[0], n_scan_buffer_elems * sizeof(uint32_t));
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
		}

		n_result = cmd_queue.n_Finish();
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		// wait

		if(m_n_bench_type == 1) { // packed or natural head flags
			n_result = m_scan.Enqueue_Pack_HeadFlags(cmd_queue, m_dp_packed_head_flags, m_dp_head_flags, n);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: scan.Enqueue_Pack_HeadFlags() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
			n_result = cmd_queue.n_Finish();
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
		}

		CCLUniqueEvent ev, ev2; // for profiling

		r_f_time = m_timer.f_Time();

		CLresult n_result2 = cl_Success;
		if(n_scan_buffer_elems > m_scan.n_Tile_Size()) {
			switch(m_n_bench_type) {
			case 0: // decode
				n_result = m_scan.Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, m_dp_tile_carry,
					m_dp_tile_flags, m_dp_packed_head_flags, m_dp_scan, m_dp_head_flags, n_scan_buffer_elems).GetEvent(ev);
				n_result2 = m_scan.Enqueue_TileSegScan_Downsweep_Packed(h_cmd_queue, m_dp_scan, m_dp_scan,
					m_dp_scanned_tile_carry, m_dp_packed_head_flags, n_scan_buffer_elems).GetEvent(ev2); // use m_dp_tile_carry precalculated by the CPU so that it is possible to verify the result
				break;
			case 1: // packed interleaved
				n_result = m_scan.Enqueue_TileSegScan_Carry_Packed(h_cmd_queue, m_dp_tile_carry,
					m_dp_tile_flags, m_dp_scan, m_dp_packed_head_flags, n_scan_buffer_elems).GetEvent(ev);
				n_result2 = m_scan.Enqueue_TileSegScan_Downsweep_Packed(h_cmd_queue, m_dp_scan, m_dp_scan,
					m_dp_scanned_tile_carry, m_dp_packed_head_flags, n_scan_buffer_elems).GetEvent(ev2); // use m_dp_tile_carry precalculated by the CPU so that it is possible to verify the result
				break;
			case 2: // packed natural
				n_result = m_scan.Enqueue_TileSegScan_Carry_PackedNaturalOrder(h_cmd_queue, m_dp_tile_carry,
					m_dp_tile_flags, m_dp_scan, m_dp_packed_head_flags, n_scan_buffer_elems).GetEvent(ev);
				n_result2 = m_scan.Enqueue_TileSegScan_Downsweep_PackedNaturalOrder(h_cmd_queue, m_dp_scan, m_dp_scan,
					m_dp_scanned_tile_carry, m_dp_packed_head_flags, n_scan_buffer_elems).GetEvent(ev2); // use m_dp_tile_carry precalculated by the CPU so that it is possible to verify the result
				break;
			case 3:
				//n_result = m_scan.Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, m_dp_scanned_tile_carry_spine,
				//	dp_tile_flags2, m_dp_packed_tile_head_flags, m_dp_tile_carry, m_dp_tile_flags, n_tile_num);
				n_result = m_scan.Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, m_dp_tile_carry,
					m_dp_tile_flags, m_dp_packed_head_flags, m_dp_scan, m_dp_head_flags, n_scan_buffer_elems).GetEvent(ev);

				/*n_result2 = m_scan.Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue,
					m_dp_segreduce, // in-out
					m_dp_tile_tcounts_exscan, m_dp_packed_head_flags, m_scan.n_Tile_Size() / 32, 1, m_dp_tile_carry, // ins
					m_dp_scanned_tile_carry_spine, m_dp_packed_tile_head_flags, n_tile_num).GetEvent(ev2);*/ // ins, need to upload m_dp_scanned_tile_carry_spine, m_dp_packed_tile_head_flags
				n_result2 = m_scan.Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue,
					m_dp_super_segreduce, // in-out
					m_dp_super_tile_tcounts_exscan, m_dp_super_packed_head_flags, n_super_tile_size / 32, 1, m_dp_scan,
					m_dp_segscanned_tile_carry, m_dp_packed_head_flags, n_scan_buffer_elems).GetEvent(ev2);
				break;
			};
		} else {
			switch(m_n_bench_type) {
			case 0: // decode
				n_result = m_scan.Enqueue_TileSegScan(h_cmd_queue,
					m_dp_scan, m_dp_scan, m_dp_head_flags, n_scan_buffer_elems).GetEvent(ev);
				break;
			case 1: // packed interleaved
				n_result = m_scan.Enqueue_TileSegScan_Packed(h_cmd_queue,
					m_dp_scan, m_dp_scan, m_dp_packed_head_flags, n_scan_buffer_elems).GetEvent(ev);
				break;
			case 2: // packed natural
				n_result = m_scan.Enqueue_TileSegScan_PackedNaturalOrder(h_cmd_queue,
					m_dp_scan, m_dp_scan, m_dp_packed_head_flags, n_scan_buffer_elems).GetEvent(ev);
				break;
			case 3:
				/*n_result3 = m_scan.Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue,
					m_dp_segreduce, // in-out
					m_dp_tile_tcounts_exscan, m_dp_packed_head_flags, m_scan.n_Tile_Size() / 32, 1, m_dp_tile_carry, // ins
					m_dp_tile_flags, n_tile_num).GetEvent(ev);*/
				if(n_scan_buffer_elems > 1) {
					n_result = m_scan.Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue,
						m_dp_super_segreduce, // in-out
						m_dp_super_tile_tcounts_exscan, m_dp_super_packed_head_flags, n_super_tile_size / 32, 1, m_dp_scan, // ins
						m_dp_head_flags, n_scan_buffer_elems).GetEvent(ev);
					// not packed head flags
				} else
					n_result = cmd_queue.n_Enqueue_Marker(ev); // need the event
				break;
			}
		}
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: scan.Enqueue_TileSegScan%s() failed with: %d (%s, %d)\n",
				(n_scan_buffer_elems > m_scan.n_Tile_Size())? "_Carry" : "",
				n_result, __FILE__, __LINE__);
			return false;
		}
		if(n_result2 != CL_SUCCESS) {
			fprintf(stderr, "error: scan.Enqueue_TileSegScan_Downsweep() failed with: %d (%s, %d)\n",
				n_result2, __FILE__, __LINE__);
			return false;
		}
		// run

		n_result = cmd_queue.n_Finish();
		r_f_time = m_timer.f_Time() - r_f_time;
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}

		if((n_result = ev.n_Wait()) != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if(n_scan_buffer_elems > m_scan.n_Tile_Size() && (n_result2 = ev2.n_Wait()) != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result2, __FILE__, __LINE__);
			return false;
		}
		// wait

		/*if(ev.n_GetProfilingCounter_Difference(r_f_time, CL_PROFILING_COMMAND_START,
		   CL_PROFILING_COMMAND_END) != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if(n_scan_buffer_elems > m_scan.n_Tile_Size()) {
			double f_more_time;
			if(ev2.n_GetProfilingCounter_Difference(f_more_time, CL_PROFILING_COMMAND_START,
			   CL_PROFILING_COMMAND_END) != CL_SUCCESS) {
				fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
			r_f_time += f_more_time;
		}*/

		if(b_verify) {
			if(m_n_bench_type == 3) {
				bool b_good_tails;
				n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_good_tails, m_super_segreduce_cpu.begin(),
					m_super_segreduce_cpu.end(), m_dp_super_segreduce, 0, "m_dp_super_segreduce");
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
						n_result, __FILE__, __LINE__);
					return false;
				}
				if(!b_good_tails) {
					fprintf(stderr, "error: m_dp_super_segreduce results are incorrect\n");
					//return false;
				}
			} else {
				bool b_verify_result;

				if(n_scan_buffer_elems > m_scan.n_Tile_Size()) {
					n_result = cmd_queue.n_CompareBuffer(b_verify_result, m_tile_carry_cpu.begin(),
						m_tile_carry_cpu.end(), m_dp_tile_carry);
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
							n_result, __FILE__, __LINE__);
						return false;
					}
					if(!b_verify_result) {
						fprintf(stderr, "error: tile carry results are incorrect\n");
						return false;
					}

					n_result = cmd_queue.n_CompareBuffer(b_verify_result, m_tile_flags_cpu.begin(),
						m_tile_flags_cpu.end(), m_dp_tile_flags);
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
							n_result, __FILE__, __LINE__);
						return false;
					}
					if(!b_verify_result) {
						fprintf(stderr, "error: tile flags results are incorrect\n");
						return false;
					}
				}
				// only verify those if using the recursive path

				n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_seg_scan_cpu.begin(),
					m_seg_scan_cpu.end(), m_dp_scan, 0, "segmented scan");
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
						n_result, __FILE__, __LINE__);
					return false;
				}
				if(!b_verify_result) {
					/*_ASSERTE(m_data.size() == n_scan_buffer_elems);
					n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_scan, 0,
						&m_data[0], n_scan_buffer_elems * sizeof(uint32_t));
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
						return false;
					}
					// reload

					n_result = m_scan.n_Enqueue_SegmentedScan_NoAutoTune(cmd_queue, m_dp_scan, m_dp_scan, m_dp_head_flags,
						n_scan_buffer_elems, CCLTempBufferStack(h_context));
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
						return false;
					}
					// GPU global segmented scan

					n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_seg_scan_cpu.begin(),
						m_seg_scan_cpu.end(), m_dp_scan, 0, "segmented scan");
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
							n_result, __FILE__, __LINE__);
						return false;
					}
					if(b_verify_result)
						fprintf(stderr, "fatal error: calling scan.Segmented_Scan() yields correct results\n");
					else */
						fprintf(stderr, "error: seg-scan results are incorrect\n");
					// just a debug check

					return false;
				}
				// verify the global scan result
			}
		}
		// verify

		return true;
	}

	/**
	 *	@brief generates the input data for the benchmark
	 *
	 *	Fills m_data with a random permutation of the values 1 to n,
	 *	where n is the only element of the input configuration.
	 *
	 *	@param[in] r_input is input configuration (must contain exactly one element, the data size)
	 *
	 *	@return Always returns true.
	 */
	virtual bool Prepare_Input(const std::vector<size_t> &r_input)
	{
		_ASSERTE(r_input.size() == 1);
		const size_t n_elem_num = r_input.front();
		// the only input parameter is the number of elements

		m_data.resize(n_elem_num);
		for(size_t n_value = n_elem_num; n_value > 0; -- n_value)
			m_data[n_value - 1] = (uint32_t)n_value;
		// the element at (zero-based) position i holds i + 1; filled back to front

		std::random_shuffle(m_data.begin(), m_data.end());
		// permute the sequence

		return true;
	}

	/**
	 *	@brief gets the string id of the benchmark selected by m_n_bench_type
	 *	@return Returns a null-terminated string with the benchmark name,
	 *		or "(null)" if m_n_bench_type is not one of 0, 1, 2 or 3.
	 */
	virtual const char *p_s_Benchmark_Id() const
	{
		switch(m_n_bench_type) {
		case 0:
			return "tiled_seg_in_scan_pack";
		case 1:
			return "tiled_seg_in_scan_interleaved";
		case 2:
			return "tiled_seg_in_scan_natural";
		case 3:
			return "tiled_seg_reduce_spine_adjust";
		default:
			return "(null)"; // unknown benchmark type
		}
	}

	/**
	 *	@brief looks up the input size to benchmark with
	 *
	 *	@param[in] n_index is zero-based index of the input size
	 *
	 *	@return Returns the input size (in elements) at the given index,
	 *		or 0 if the index is past the end of the table.
	 *
	 *	@note Benchmark type 3 (tiled_seg_reduce_spine_adjust) uses
	 *		a different table which also covers sizes below 10 and stops at 500000.
	 */
	virtual const size_t n_InputSize_Lookup(size_t n_index) const
	{
		static const size_t p_spine_adjust_size[] = {1, 2, 5, 10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
			1000 * 10, 1000 * 20, 1000 * 50, 1000 * 100, 1000 * 200, 1000 * 500};
		// use smaller sizes for tiled_seg_reduce_spine_adjust, want tuning focused on much smaller sizes

		static const size_t p_general_size[] = {10, 20, 50, 100, 200, 500, 1000, 2000, 5000,
			1000 * 10, 1000 * 20, 1000 * 50, 1000 * 100, 1000 * 200, 1000 * 500,
			1000000, 1000000 * 2, 1000000 * 5, 1000000 * 10, 1000000 * 20};
		// sizes for all the other benchmark types

		const bool b_spine_adjust = m_n_bench_type == 3;
		const size_t *p_table = (b_spine_adjust)? p_spine_adjust_size : p_general_size;
		const size_t n_table_size = (b_spine_adjust)?
			sizeof(p_spine_adjust_size) / sizeof(p_spine_adjust_size[0]) :
			sizeof(p_general_size) / sizeof(p_general_size[0]);
		// choose the table based on the benchmark type

		return (n_index < n_table_size)? p_table[n_index] : 0; // zero marks the end of the table
	}

	/**
	 *	@brief looks up the work-group size to benchmark with
	 *	@param[in] n_index is zero-based index of the work-group size
	 *	@return Returns the work-group size at the given index,
	 *		or 0 if the index is past the end of the table.
	 */
	virtual const size_t n_WorkGroupSize_Lookup(size_t n_index) const
	{
		static const size_t p_workgroup_size[] = {128, 192, 256, 512, 1024};
		const size_t n_workgroup_size_num = sizeof(p_workgroup_size) / sizeof(p_workgroup_size[0]);
		return (n_index < n_workgroup_size_num)? p_workgroup_size[n_index] : 0; // zero marks the end of the table
	}

	/**
	 *	@brief looks up the number of items per thread to benchmark with
	 *	@param[in] n_index is zero-based index of the items per thread count
	 *	@return Returns the number of items per thread at the given index,
	 *		or 0 if the index is past the end of the table.
	 */
	virtual const size_t n_ItemsPerThread_Lookup(size_t n_index) const
	{
		static const size_t p_item_num[] = {1, 2, 3, 4, 5, 6, 7, 8}; // 16 and 32 are disabled
		const size_t n_item_num_num = sizeof(p_item_num) / sizeof(p_item_num[0]);
		return (n_index < n_item_num_num)? p_item_num[n_index] : 0; // zero marks the end of the table
	}
};

class CCLScan;

/**
 *	@brief autotune benchmark wrapper for the tiled segmented reduction kernels
 *
 *	The benchmark type (m_n_bench_type) selects how the head flags are supplied
 *	to the kernels: 0 is decode (unpacked head flags get packed on the fly),
 *	1 is pre-packed interleaved and 2 is pre-packed natural order.
 */
class CTiled_SegmentedReduce_AutoTune : public CCLAutotuneInterface,
	public CAutotuneModel_DataSize_WorkGroupSize_TileSize {
protected:
	std::vector<uint32_t> m_data; // the input data (host side), uploaded to m_dp_data
	std::vector<uint32_t> *m_p_head_flags; // points to m_seginfo.head_flags (set in the constructor)

	// host-side reference results and precalculated inputs, filled in Run_Benchmark() when verifying
	std::vector<uint32_t> m_segreduce_cpu;
	std::vector<uint32_t> m_packed_head_flags_cpu;
	std::vector<uint32_t> m_tile_tcounts_cpu;
	std::vector<uint32_t> m_tile_tcounts_exscan_cpu;
	std::vector<uint32_t> m_tile_flags_cpu;
	std::vector<uint32_t> m_tile_carry_cpu;
	std::vector<uint32_t> m_tile_carry2_cpu, m_tile_flags2_cpu; // second level (tiles of tiles) carry and flags
	std::vector<uint32_t> m_scanned_tile_carry_spine_cpu;
	std::vector<uint32_t> m_packed_tile_head_flags_cpu;
	//std::vector<uint32_t> m_seg_scan_cpu; // not needed
	//std::vector<size_t> m_packed_head_flags_natural_cpu;

	seg_debug::THeadFlag_DebugInfo m_seginfo; // keep allocated

	CCLScan &m_r_glob_scan; // reference passed to the constructor; NOTE(review): not used in the visible part of this class - confirm where it is needed
	CCLTiled_SegmentedReduceScan m_scan; // the kernels being tuned, rebuilt in Prepare_Benchmark() for each tuning vector
	CCLUniqueMem m_dp_data, m_dp_head_flags, // size n = m_n_scan_buffer_elems
		m_dp_segreduce, // size r = m_n_reduce_buffer_elems (allocated with one extra element for a checked terminator)
		m_dp_packed_head_flags, // size m_scan.n_PackedHeadFlags_Size(n) = m_n_hf_buffer_elems
		m_dp_tile_tcounts, m_dp_tile_tcounts_exscan, m_dp_tile_flags, m_dp_tile_carry, // size n_tile_num = m_n_tile_buffer_elems
		m_dp_scanned_tile_carry_spine, // size n_tile2_num = m_n_tile2_buffer_elems
		m_dp_packed_tile_head_flags; // size m_scan.n_PackedHeadFlags_Size(n_tile_num) = m_n_tile_hf_buffer_elems

	// sizes of the currently allocated device buffers (in elements); the buffers only grow, see Run_Benchmark()
	size_t m_n_scan_buffer_elems;
	size_t m_n_reduce_buffer_elems;
	size_t m_n_tile_buffer_elems;
	size_t m_n_tile2_buffer_elems;
	size_t m_n_hf_buffer_elems;
	size_t m_n_tile_hf_buffer_elems;
	CTimer m_timer; // host timer used to measure the benchmark time
	int m_n_bench_type; // 0 = decode, 1 = packed interleaved, 2 = packed natural

public:
	/**
	 *	@brief default constructor; stores the configuration, allocates no device buffers
	 *
	 *	@param[in] r_glob_scan is reference to a global scan object (only the reference is stored)
	 *	@param[in] n_bench_type is benchmark type (0 is decode, 1 is pre-packed
	 *		interleaved head flags, 2 is pre-packed natural order head flags)
	 *
	 *	@note The initializers are listed in the order the members are declared in,
	 *		which is also the order they are always initialized in.
	 */
	CTiled_SegmentedReduce_AutoTune(CCLScan &r_glob_scan, int n_bench_type = 0) // only in scan so far
		:m_r_glob_scan(r_glob_scan),
		m_n_scan_buffer_elems(0), m_n_reduce_buffer_elems(0),
		m_n_tile_buffer_elems(0), m_n_tile2_buffer_elems(0),
		m_n_hf_buffer_elems(0), m_n_tile_hf_buffer_elems(0),
		m_n_bench_type(n_bench_type)
	{
		_ASSERTE(n_bench_type >= 0 && n_bench_type < 3); // decode, pre-packed, pre-packed natural

		m_p_head_flags = &m_seginfo.head_flags;
		// the head flags live inside m_seginfo (set here, in the body, once m_seginfo is constructed)
	}

	/**
	 *	@brief configures and compiles the kernels for the given tuning vector
	 *
	 *	@param[in] h_context is OpenCL context to compile in
	 *	@param[in] h_device is OpenCL device to compile for
	 *	@param[in] h_cmd_queue is OpenCL command queue (unused here)
	 *	@param[in] r_tuning is the tuning vector (must have three elements; as produced
	 *		by Init_TuningVector(), the first is the work-group size, the second is the
	 *		work-group size times items per thread and the third is a zero / nonzero flag)
	 *
	 *	@return Returns true on success, false if the kernels failed to compile.
	 */
	virtual bool Prepare_Benchmark(cl_context h_context, cl_device_id h_device,
		cl_command_queue h_cmd_queue, const std::vector<size_t> &r_tuning)
	{
		_ASSERTE(r_tuning.size() == 3);

		const size_t n_first_param = r_tuning[0], n_second_param = r_tuning[1];
		const bool b_third_param = r_tuning[2] != 0; // presumably selects the head flag layout variant - TODO confirm
		// unpack the tuning vector

		m_scan = CCLTiled_SegmentedReduceScan(n_first_param, n_second_param, b_third_param);
		// replace the scan object by one with the new configuration

		if(!m_scan.Compile(h_context, h_device, true, true))
			return false;
		// build the kernels

		// ground truth computed always

		return true;
	}

	/**
	 *	@brief initializes the tuning vector to the first configuration
	 *
	 *	@param[out] r_tuning is filled with three tuning parameters: the first work-group size,
	 *		the first work-group size times the first items per thread count, and zero
	 *	@param[out] r_hidden is filled with three zeros (lookup table indices used by Next_TuningVector())
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	virtual void Init_TuningVector(std::vector<size_t> &r_tuning, std::vector<size_t> &r_hidden) const // throw(std::bad_alloc)
	{
		r_tuning.resize(3);
		r_hidden.assign(3, size_t(0));
		// three tuning parameters, three lookup indices, all of them starting at zero

		const size_t n_first_workgroup_size = n_WorkGroupSize_Lookup(r_hidden[0]);
		const size_t n_first_tile_size = n_WorkGroupSize_Lookup(r_hidden[0]) *
			n_ItemsPerThread_Lookup(r_hidden[1]);
		// look up the first entries of the tables

		r_tuning[0] = n_first_workgroup_size;
		r_tuning[1] = n_first_tile_size;
		r_tuning[2] = 0;
	}

	/**
	 *	@brief advances the tuning vector to the next configuration
	 *
	 *	Works like an odometer: the work-group size index changes the fastest, the items
	 *	per thread index changes when the work-group sizes run out, and the third tuning
	 *	parameter is flipped from zero to one when the items per thread run out as well.
	 *
	 *	@param[in,out] r_tuning is the tuning vector (must have three elements)
	 *	@param[in,out] r_hidden is the vector of lookup table indices (must have
	 *		three elements; only the first two are used)
	 *
	 *	@return Returns true if r_tuning holds a new configuration,
	 *		false if all the configurations were already enumerated.
	 */
	virtual bool Next_TuningVector(std::vector<size_t> &r_tuning, std::vector<size_t> &r_hidden) const
	{
		_ASSERTE(r_tuning.size() == 3 && r_hidden.size() == 3);

		size_t &r_n_wgs_index = r_hidden[0], &r_n_ipt_index = r_hidden[1];

		++ r_n_wgs_index;
		r_tuning[0] = n_WorkGroupSize_Lookup(r_n_wgs_index);
		if(r_tuning[0] != 0) {
			r_tuning[1] = r_tuning[0] * n_ItemsPerThread_Lookup(r_n_ipt_index); // update this as well
			return true;
		}
		// try the next work-group size first

		r_n_wgs_index = 0;
		r_tuning[0] = n_WorkGroupSize_Lookup(r_n_wgs_index);
		// ran out of work-group sizes; reset

		++ r_n_ipt_index;
		r_tuning[1] = r_tuning[0] * n_ItemsPerThread_Lookup(r_n_ipt_index);
		if(r_tuning[1] != 0)
			return true;
		// carry to the items per thread (a zero lookup yields a zero product)

		r_n_ipt_index = 0;
		r_tuning[1] = r_tuning[0] * n_ItemsPerThread_Lookup(r_n_ipt_index);
		// ran out of items per thread as well; reset

		if(m_n_bench_type == 2 || r_tuning[2])
			return false;
		// in case we're doing the natural order benchmark, no need to try the interleaved variants;
		// otherwise finished if the third parameter was flipped already

		r_tuning[2] = 1;
		// carry to the third parameter, enumerate everything once more

		return true;
	}

	virtual bool Run_Benchmark(cl_context h_context,
		cl_command_queue h_cmd_queue, double &r_f_time, bool b_verify)
	{
		CCLContextWrapper context(h_context);
		CCLCommandQueueWrapper cmd_queue(h_cmd_queue);
		// thin wrappers, do not delete the handles

		const size_t n = m_data.size(), n_tile_size = m_scan.n_Tile_Size();
		const size_t n_tile_num = (n + n_tile_size - 1) / n_tile_size;

		size_t n_scan_buffer_elems = n;
		const size_t n_avg_seg_size = 500; // given as an input
		size_t n_seg_num = n_scan_buffer_elems / n_avg_seg_size + 3;
		size_t n_packed_hf_buffer_bytes = m_scan.n_PackedHeadFlags_Size(n);
		size_t n_packed_tile_hf_buffer_bytes = m_scan.n_PackedHeadFlags_Size(n_tile_num);

		if(n_scan_buffer_elems > m_n_scan_buffer_elems) { // only alloc when needed
			if(!(m_dp_data = context.h_CreateBuffer(sizeof(uint32_t) * n_scan_buffer_elems)) ||
			   !(m_dp_head_flags = context.h_CreateBuffer(sizeof(uint32_t) * n_scan_buffer_elems)))
				return false;
			m_n_scan_buffer_elems = n_scan_buffer_elems;
		}
		if(m_n_reduce_buffer_elems < n_seg_num) {
			if(!(m_dp_segreduce = context.h_CreateBuffer(sizeof(uint32_t) * (n_seg_num + 1))))
				return false;
			m_n_reduce_buffer_elems = n_seg_num;
		}
		size_t n_tile_buffer_elems = n_tile_num;//(n_scan_buffer_elems + m_scan.n_Tile_Size() - 1) / m_scan.n_Tile_Size();
		if(n_tile_buffer_elems > m_n_tile_buffer_elems) { // only alloc when needed
			if(!(m_dp_tile_flags = context.h_CreateBuffer(sizeof(uint32_t) * n_tile_buffer_elems)) ||
			   !(m_dp_tile_carry = context.h_CreateBuffer(sizeof(uint32_t) * n_tile_buffer_elems)) ||
			   !(m_dp_tile_tcounts = context.h_CreateBuffer(sizeof(uint32_t) * n_tile_buffer_elems)) ||
			   !(m_dp_tile_tcounts_exscan = context.h_CreateBuffer(sizeof(uint32_t) * n_tile_buffer_elems)))
				return false;
			m_n_tile_buffer_elems = n_tile_buffer_elems;
		}
		size_t n_tile2_num = (n_tile_num + n_tile_size - 1) / n_tile_size;
		if(n_tile2_num > m_n_tile2_buffer_elems) { // only alloc when needed
			if(!(m_dp_scanned_tile_carry_spine = context.h_CreateBuffer(sizeof(uint32_t) * n_tile2_num)))
				return false;
			m_n_tile2_buffer_elems = n_tile2_num;
		}
		size_t n_hf_buffer_elems = n_packed_hf_buffer_bytes / sizeof(uint32_t);
		if(n_hf_buffer_elems > m_n_hf_buffer_elems) { // only alloc when needed
			if(!(m_dp_packed_head_flags = context.h_CreateBuffer(n_packed_hf_buffer_bytes)))
				return false;
			m_n_hf_buffer_elems = n_hf_buffer_elems;
		}
		size_t n_tile_hf_buffer_elems = n_packed_tile_hf_buffer_bytes / sizeof(uint32_t);
		if(n_tile_hf_buffer_elems > m_n_tile_hf_buffer_elems) { // only alloc when needed
			if(!(m_dp_packed_tile_head_flags = context.h_CreateBuffer(n_packed_tile_hf_buffer_bytes)))
				return false;
			m_n_tile_hf_buffer_elems = n_tile_hf_buffer_elems;
		}
		// (re)allocate the buffers as needed

		/*seg_debug::CSegmentedOp_UniformSize_Benchmark segment_pattern_generator;
		const size_t n_pass_num = segment_pattern_generator.n_Benchmark_Num(n_scan_buffer_elems);
		for(size_t n_pass = 0; n_pass < n_pass_num; ++ n_pass) {*/

		seg_debug::CSegmentedOp_Random_Benchmark segment_pattern_generator;

		if(b_verify) { // won't work without data
			segment_pattern_generator.n_Generate_HeadFlags(m_seginfo, n_scan_buffer_elems, n_seg_num - 1);
			// fill m_p_head_flags

			seg_debug::CReference::Pack_HeadFlags(m_packed_head_flags_cpu, *m_p_head_flags,
				m_scan.n_Tile_Size(), m_scan.n_WorkGroup_Size(),
				m_scan.b_Strided_HeadFlags() && m_n_bench_type != 2); // not strided for natural benchmark
			_ASSERTE(m_packed_head_flags_cpu.size() == n_hf_buffer_elems);

			std::vector<uint32_t> temp, temp2;
			seg_debug::CReference::Segmented_TileScan(temp, m_tile_tcounts_cpu,
				m_tile_carry_cpu, m_tile_flags_cpu, m_data, *m_p_head_flags,
				m_scan.n_Tile_Size(), m_scan.n_WorkGroup_Size());
			m_tile_tcounts_exscan_cpu.resize(m_tile_tcounts_cpu.size());
			stl_ut::ExclusiveScan(m_tile_tcounts_cpu.begin(), m_tile_tcounts_cpu.end(), m_tile_tcounts_exscan_cpu.begin());
			{
				seg_debug::CReference::Segmented_TileScan(temp, temp2, m_tile_carry2_cpu, m_tile_flags2_cpu,
					m_tile_carry_cpu, m_tile_flags_cpu, m_scan.n_Tile_Size(), m_scan.n_WorkGroup_Size());

				seg_debug::CReference::Segmented_Scan(m_scanned_tile_carry_spine_cpu,
					m_tile_carry2_cpu, m_tile_flags2_cpu);
				// this is the second level scan and it is a global one
			}
			// partials ground truth

			seg_debug::CReference::Pack_HeadFlags(m_packed_tile_head_flags_cpu, m_tile_flags_cpu,
				m_scan.n_Tile_Size(), m_scan.n_WorkGroup_Size(), m_scan.b_Strided_HeadFlags()); // this is interleaved even with natural!
			_ASSERTE(m_packed_tile_head_flags_cpu.size() == n_tile_hf_buffer_elems);

			seg_debug::CReference::Segmented_Reduce(m_segreduce_cpu, m_data, *m_p_head_flags);
			// the result of the reduction
		}
		// calculate ground truth

		_ASSERTE(m_data.size() == n);
		CLresult n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_data, 0, &m_data[0], n * sizeof(uint32_t));
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(m_p_head_flags->size() == n);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_head_flags, 0, &((*m_p_head_flags)[0]), n * sizeof(uint32_t));
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(m_tile_tcounts_exscan_cpu.size() == n_tile_num);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_tile_tcounts_exscan, 0,
			&m_tile_tcounts_exscan_cpu[0], n_tile_num * sizeof(uint32_t));
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(m_scanned_tile_carry_spine_cpu.size() == n_tile2_num);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_scanned_tile_carry_spine, 0,
			&m_scanned_tile_carry_spine_cpu[0], n_tile2_num * sizeof(uint32_t));
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(m_packed_tile_head_flags_cpu.size() == n_tile_hf_buffer_elems);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_packed_tile_head_flags, 0,
			&m_packed_tile_head_flags_cpu[0], n_tile_hf_buffer_elems * sizeof(uint32_t));
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if(m_n_bench_type > 0) {
			_ASSERTE(m_packed_head_flags_cpu.size() == n_hf_buffer_elems);
			n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_packed_head_flags, 0, // need packed head flags, except for benchmark 0
				&m_packed_head_flags_cpu[0], n_hf_buffer_elems * sizeof(uint32_t));
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
		}
		// inputs

		{
			std::vector<uint32_t> ff(n_seg_num, uint32_t(-1));
			m_segreduce_cpu.resize(n_seg_num + 1, 0xbaadf00d);
			_ASSERTE(m_segreduce_cpu.back() == 0xbaadf00d);
			ff.push_back(0xbaadf00d); // add a checked terminator in there as well ;)
			n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_segreduce, 0,
				&ff[0], (n_seg_num + 1) * sizeof(uint32_t));
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
		}
		if(m_n_bench_type == 0) {
			std::vector<uint32_t> zero(n_hf_buffer_elems);
			CLresult n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_packed_head_flags, 0,
				&zero[0], n_hf_buffer_elems * sizeof(uint32_t));
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
		}
		_ASSERTE(m_data.size() >= n_tile_buffer_elems);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_tile_flags, 0, &m_data[0],
			n_tile_buffer_elems * sizeof(uint32_t)); // put something different in here as well
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(m_data.size() >= n_tile_buffer_elems);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_tile_carry, 0, &m_data[0],
			n_tile_buffer_elems * sizeof(uint32_t)); // put something different in here as well
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		_ASSERTE(m_data.size() >= n_tile_buffer_elems);
		n_result = cmd_queue.n_Enqueue_Memcpy_HtoD(m_dp_tile_tcounts, 0, &m_data[0],
			n_tile_buffer_elems * sizeof(uint32_t)); // put something different in here as well
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clEnqueueWriteBuffer() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		// scramble the outputs

		n_result = cmd_queue.n_Finish();
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		// wait

		/*if(m_n_bench_type == 1) { // packed or natural head flags
			n_result = m_scan.Enqueue_Pack_HeadFlags(cmd_queue, m_dp_packed_head_flags, m_dp_head_flags, n);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: scan.Enqueue_Pack_HeadFlags() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
			n_result = cmd_queue.n_Finish();
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
		}*/

		CCLUniqueEvent ev, ev2, ev3; // for profiling

		r_f_time = m_timer.f_Time();

		CLresult n_result2 = cl_Success;
		CLresult n_result3 = cl_Success;
		if(n_tile_num > 1) {
			switch(m_n_bench_type) {
			case 0: // decode
				n_result = m_scan.Enqueue_TileSegReduce_Bootstrap_Pack(h_cmd_queue,

					m_dp_tile_tcounts, m_dp_tile_flags, m_dp_packed_head_flags, // outs

					m_dp_head_flags, n).GetEvent(ev); // ins, satisfied

				// ex scan of m_dp_tile_tcounts for tile num > 1

				n_result2 = m_scan.Enqueue_TileSegReduce_Packed(h_cmd_queue,

					m_dp_segreduce, m_dp_tile_carry, // outs

					m_dp_tile_tcounts_exscan, m_dp_data, m_dp_packed_head_flags, n).GetEvent(ev2); // ins, need to upload m_dp_tile_tcounts_exscan

				// tile carry scan + tile flags pack

				if(n_tile_num > m_scan.n_Tile_Size()) {
#if 0
					CCLTempBufferStack memory_alloc(h_context);
					size_t n_tile_num2 = (n_tile_num + m_scan.n_Tile_Size() - 1) / m_scan.n_Tile_Size();
					CCLTempBufferReservation dp_tile_flags2(n_tile_num2 * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, memory_alloc);

					CLresult n_result = m_scan.Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, m_dp_scanned_tile_carry_spine,
						dp_tile_flags2, m_dp_packed_tile_head_flags, m_dp_tile_carry, m_dp_tile_flags, n_tile_num);
					if(n_result != cl_Success)
						return false;

					n_result = m_scan.n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
						m_dp_scanned_tile_carry_spine, m_dp_scanned_tile_carry_spine, dp_tile_flags2, n_tile_num2, memory_alloc);
					if(n_result != cl_Success)
						return false;
					// scan the segment carry

					bool b_compare;
					cmd_queue.n_CompareBuffer_DebugVerbose(b_compare, m_tile_flags_cpu.begin(),
						m_tile_flags_cpu.end(), m_dp_tile_flags, 0, "tile head flags");
					cmd_queue.n_CompareBuffer_DebugVerbose(b_compare, m_scanned_tile_carry_spine_cpu.begin(),
						m_scanned_tile_carry_spine_cpu.end(), m_dp_scanned_tile_carry_spine, 0, "scanned tile carry spine");
					cmd_queue.n_CompareBuffer_DebugVerbose(b_compare, m_packed_tile_head_flags_cpu.begin(),
						m_packed_tile_head_flags_cpu.end(), m_dp_packed_tile_head_flags, 0, "packed tile head flags");
#endif // 0

					n_result3 = m_scan.Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue,

						m_dp_segreduce, // outs

						m_dp_tile_tcounts_exscan, m_dp_packed_head_flags, m_scan.n_Tile_Size() / 32, 1, m_dp_tile_carry, // ins
						m_dp_scanned_tile_carry_spine, m_dp_packed_tile_head_flags, n_tile_num).GetEvent(ev3); // ins, need to upload m_dp_scanned_tile_carry_spine, m_dp_packed_tile_head_flags

					// todo - tune this separately, along with the segmented scans
				} else {
					n_result3 = m_scan.Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue,

						m_dp_segreduce, // outs

						m_dp_tile_tcounts_exscan, m_dp_packed_head_flags, m_scan.n_Tile_Size() / 32, 1, m_dp_tile_carry, // ins
						m_dp_tile_flags, n_tile_num).GetEvent(ev3);
					// not packed head flags!
				}
				break;
			case 1: // packed interleaved
				n_result = m_scan.Enqueue_TileSegReduce_Bootstrap_Packed(h_cmd_queue,

					m_dp_tile_tcounts, m_dp_tile_flags, // outs

					m_dp_packed_head_flags, n).GetEvent(ev); // ins, satisfied

				// ex scan of m_dp_tile_tcounts for tile num > 1

				n_result2 = m_scan.Enqueue_TileSegReduce_Packed(h_cmd_queue,

					m_dp_segreduce, m_dp_tile_carry, // outs

					m_dp_tile_tcounts_exscan, m_dp_data, m_dp_packed_head_flags, n).WithEvents(1, &ev, ev2); // ins, need to upload m_dp_tile_tcounts_exscan, need explicit synchronization otherwise wouldn't wait

				// tile carry scan + tile flags pack

				if(n_tile_num > m_scan.n_Tile_Size()) {
#if 0
					CCLTempBufferStack memory_alloc(h_context);
					size_t n_tile_num2 = (n_tile_num + m_scan.n_Tile_Size() - 1) / m_scan.n_Tile_Size();
					CCLTempBufferReservation dp_tile_flags2(n_tile_num2 * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, memory_alloc);

					CLresult n_result = m_scan.Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, m_dp_scanned_tile_carry_spine,
						dp_tile_flags2, m_dp_packed_tile_head_flags, m_dp_tile_carry, m_dp_tile_flags, n_tile_num);
					if(n_result != cl_Success)
						return false;

					n_result = m_scan.n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
						m_dp_scanned_tile_carry_spine, m_dp_scanned_tile_carry_spine, dp_tile_flags2, n_tile_num2, memory_alloc);
					if(n_result != cl_Success)
						return false;
					// scan the segment carry

					bool b_compare;
					cmd_queue.n_CompareBuffer_DebugVerbose(b_compare, m_tile_flags_cpu.begin(),
						m_tile_flags_cpu.end(), m_dp_tile_flags, 0, "tile head flags");
					cmd_queue.n_CompareBuffer_DebugVerbose(b_compare, m_scanned_tile_carry_spine_cpu.begin(),
						m_scanned_tile_carry_spine_cpu.end(), m_dp_scanned_tile_carry_spine, 0, "scanned tile carry spine");
					cmd_queue.n_CompareBuffer_DebugVerbose(b_compare, m_packed_tile_head_flags_cpu.begin(),
						m_packed_tile_head_flags_cpu.end(), m_dp_packed_tile_head_flags, 0, "packed tile head flags");
#endif // 0

					n_result3 = m_scan.Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue,

						m_dp_segreduce, // outs

						m_dp_tile_tcounts_exscan, m_dp_packed_head_flags, m_scan.n_Tile_Size() / 32, 1, m_dp_tile_carry, // ins
						m_dp_scanned_tile_carry_spine, m_dp_packed_tile_head_flags, n_tile_num).GetEvent(ev3); // ins, need to upload m_dp_scanned_tile_carry_spine, m_dp_packed_tile_head_flags

					// todo - tune this separately, along with the segmented scans
				} else {
					n_result3 = m_scan.Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue,

						m_dp_segreduce, // outs

						m_dp_tile_tcounts_exscan, m_dp_packed_head_flags, m_scan.n_Tile_Size() / 32, 1, m_dp_tile_carry, // ins
						m_dp_tile_flags, n_tile_num).GetEvent(ev3);
					// not packed head flags!
				}
				break;
			case 2: // packed natural
				n_result = m_scan.Enqueue_TileSegReduce_Bootstrap_PackedNaturalOrder(h_cmd_queue,

					m_dp_tile_tcounts, m_dp_tile_flags, // outs

					m_dp_packed_head_flags, n).GetEvent(ev); // ins, satisfied

				// ex scan of m_dp_tile_tcounts for tile num > 1

				n_result2 = m_scan.Enqueue_TileSegReduce_PackedNaturalOrder(h_cmd_queue,

					m_dp_segreduce, m_dp_tile_carry, // outs

					m_dp_tile_tcounts_exscan, m_dp_data, m_dp_packed_head_flags, n).WithEvents(1, &ev, ev2); // ins, need to upload m_dp_tile_tcounts_exscan, need explicit synchronization otherwise wouldn't wait

				// tile carry scan + tile flags pack

				if(n_tile_num > m_scan.n_Tile_Size()) {
#if 0
					CCLTempBufferStack memory_alloc(h_context);
					size_t n_tile_num2 = (n_tile_num + m_scan.n_Tile_Size() - 1) / m_scan.n_Tile_Size();
					CCLTempBufferReservation dp_tile_flags2(n_tile_num2 * sizeof(uint32_t)/*m_config.n_DataType_Size()*/, memory_alloc);

					CLresult n_result = m_scan.Enqueue_TileSegScan_Carry_Pack(h_cmd_queue, m_dp_scanned_tile_carry_spine,
						dp_tile_flags2, m_dp_packed_tile_head_flags, m_dp_tile_carry, m_dp_tile_flags, n_tile_num);
					if(n_result != cl_Success)
						return false;

					n_result = m_scan.n_Enqueue_SegmentedScan_NoAutoTune(h_cmd_queue,
						m_dp_scanned_tile_carry_spine, m_dp_scanned_tile_carry_spine, dp_tile_flags2, n_tile_num2, memory_alloc);
					if(n_result != cl_Success)
						return false;
					// scan the segment carry

					bool b_compare;
					cmd_queue.n_CompareBuffer_DebugVerbose(b_compare, m_tile_flags_cpu.begin(),
						m_tile_flags_cpu.end(), m_dp_tile_flags, 0, "tile head flags");
					cmd_queue.n_CompareBuffer_DebugVerbose(b_compare, m_scanned_tile_carry_spine_cpu.begin(),
						m_scanned_tile_carry_spine_cpu.end(), m_dp_scanned_tile_carry_spine, 0, "scanned tile carry spine");
					cmd_queue.n_CompareBuffer_DebugVerbose(b_compare, m_packed_tile_head_flags_cpu.begin(),
						m_packed_tile_head_flags_cpu.end(), m_dp_packed_tile_head_flags, 0, "packed tile head flags");
#endif // 0

					n_result3 = m_scan.Enqueue_TileSegReduce_SpineAdjust_Downsweep_Packed(h_cmd_queue,

						m_dp_segreduce, // outs

						m_dp_tile_tcounts_exscan, m_dp_packed_head_flags, m_scan.n_Tile_Size() / 32, 1, m_dp_tile_carry, // ins
						m_dp_scanned_tile_carry_spine, m_dp_packed_tile_head_flags, n_tile_num).GetEvent(ev3); // ins, need to upload m_dp_scanned_tile_carry_spine, m_dp_packed_tile_head_flags

					// todo - tune this separately, along with the segmented scans
				} else {
					n_result3 = m_scan.Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue,

						m_dp_segreduce, // outs

						m_dp_tile_tcounts_exscan, m_dp_packed_head_flags, m_scan.n_Tile_Size() / 32, 1, m_dp_tile_carry, // ins
						m_dp_tile_flags, n_tile_num).GetEvent(ev3);
					// not packed head flags!
				}
				break;
			}
		} else {
			switch(m_n_bench_type) {
			case 0: // decode
				n_result = m_scan.Enqueue_TileSegReduce_Bootstrap_Pack(h_cmd_queue,

					m_dp_tile_tcounts, m_dp_tile_flags, m_dp_packed_head_flags, // outs

					m_dp_head_flags, n).GetEvent(ev); // ins
				// m_dp_tile_tcounts remains unscanned

				n_result2 = m_scan.Enqueue_TileSegReduceSingle_Packed(h_cmd_queue,

					m_dp_segreduce, //m_dp_tile_carry, // outs

					/*m_dp_tile_tcounts_exscan,*/ m_dp_data, m_dp_packed_head_flags, n).GetEvent(ev2); // ins, satisfied
				// no tile carry scan

				//n_result3 = m_scan.Enqueue_TileSegReduce_SpineAdjust(h_cmd_queue,

				//	m_dp_segreduce, // outs

				//	m_dp_tile_tcounts/*_exscan*/, m_dp_packed_head_flags, m_scan.n_Tile_Size() / 32, 1,
				//	m_dp_tile_carry, m_dp_tile_flags, n_tile_num).GetEvent(ev3); // ins
				// not here, m_dp_segreduce is already global
				break;
			case 1: // packed interleaved
				n_result = m_scan.Enqueue_TileSegReduce_Bootstrap_Packed(h_cmd_queue,

					m_dp_tile_tcounts, m_dp_tile_flags, // outs

					m_dp_packed_head_flags, n).GetEvent(ev); // ins
				// m_dp_tile_tcounts remains unscanned

				n_result2 = m_scan.Enqueue_TileSegReduceSingle_Packed(h_cmd_queue,

					m_dp_segreduce, //m_dp_tile_carry, // outs

					/*m_dp_tile_tcounts_exscan,*/ m_dp_data, m_dp_packed_head_flags, n).WithEvents(1, &ev, ev2); // ins, satisfied, need explicit synchronization otherwise wouldn't wait
				// no tile carry scan
				break;
			case 2: // packed natural
				n_result = m_scan.Enqueue_TileSegReduce_Bootstrap_PackedNaturalOrder(h_cmd_queue,

					m_dp_tile_tcounts, m_dp_tile_flags, // outs

					m_dp_packed_head_flags, n).GetEvent(ev); // ins
				// m_dp_tile_tcounts remains unscanned

				n_result2 = m_scan.Enqueue_TileSegReduceSingle_PackedNaturalOrder(h_cmd_queue,

					m_dp_segreduce, //m_dp_tile_carry, // outs

					/*m_dp_tile_tcounts_exscan,*/ m_dp_data, m_dp_packed_head_flags, n).WithEvents(1, &ev, ev2); // ins, satisfied, need explicit synchronization otherwise wouldn't wait
				// no tile carry scan
				break;
			}
		}
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: scan.Enqueue_TileSegReduce_Bootstrap_Pack() failed with: %d (%s, %d)\n",
				n_result, __FILE__, __LINE__);
			return false;
		}
		if(n_result2 != CL_SUCCESS) {
			fprintf(stderr, "error: scan.Enqueue_TileSegReduce_Packed() failed with: %d (%s, %d)\n",
				n_result2, __FILE__, __LINE__);
			return false;
		}
		if(n_result3 != CL_SUCCESS) {
			fprintf(stderr, "error: scan.Enqueue_TileSegReduce_SpineAdjust%s() failed with: %d (%s, %d)\n",
				(n_scan_buffer_elems <= m_scan.n_Tile_Size())? "" : "_Downsweep_Packed", n_result3, __FILE__, __LINE__);
			return false;
		}
		// run

		n_result = cmd_queue.n_Finish();
		r_f_time = m_timer.f_Time() - r_f_time;
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if((n_result = ev.n_Wait()) != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if((n_result2 = ev2.n_Wait()) != CL_SUCCESS ||
		   (n_scan_buffer_elems > m_scan.n_Tile_Size() && (n_result3 = ev3.n_Wait()) != CL_SUCCESS)) {
			fprintf(stderr, "error: clFinish() failed with: %d or %d (%s, %d)\n", n_result2, n_result3, __FILE__, __LINE__);
			return false;
		}
		// wait

		/*if(ev.n_GetProfilingCounter_Difference(r_f_time, CL_PROFILING_COMMAND_START,
		   CL_PROFILING_COMMAND_END) != CL_SUCCESS) {
			fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
			return false;
		}
		if(n_scan_buffer_elems > m_scan.n_Tile_Size()) {
			double f_more_time;
			if(ev2.n_GetProfilingCounter_Difference(f_more_time, CL_PROFILING_COMMAND_START,
			   CL_PROFILING_COMMAND_END) != CL_SUCCESS) {
				fprintf(stderr, "error: clFinish() failed with: %d (%s, %d)\n", n_result, __FILE__, __LINE__);
				return false;
			}
			r_f_time += f_more_time;
		}*/

		if(b_verify) {
			bool b_verify_result;

			n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_packed_head_flags_cpu.begin(),
				m_packed_head_flags_cpu.end(), m_dp_packed_head_flags, 0, "packed head flags");
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
					n_result, __FILE__, __LINE__);
				return false;
			}
			if(!b_verify_result) {
				fprintf(stderr, "error: packed head flags results are incorrect\n");
				return false;
			}

			/*n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_p_head_flags->begin(),
				m_p_head_flags->end(), m_dp_head_flags, 0, "head flags");
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
					n_result, __FILE__, __LINE__);
				return false;
			}
			if(!b_verify_result) {
				fprintf(stderr, "error: packed head flags results are incorrect\n");
				return false;
			}*/ // also paranoia

			/*n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_data.begin(),
				m_data.end(), m_dp_data, 0, "data");
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
					n_result, __FILE__, __LINE__);
				return false;
			}
			if(!b_verify_result) {
				fprintf(stderr, "error: packed head flags results are incorrect\n");
				return false;
			}*/ // paranoia

			n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_tile_tcounts_cpu.begin(),
				m_tile_tcounts_cpu.end(), m_dp_tile_tcounts, 0, "tile tail counts");
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
					n_result, __FILE__, __LINE__);
				return false;
			}
			if(!b_verify_result) {
				fprintf(stderr, "error: tile tail counts results are incorrect\n");
				return false;
			}

			if(n_tile_num > 1) {
				n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_tile_carry_cpu.begin(),
					m_tile_carry_cpu.end(), m_dp_tile_carry, 0, "tile carry");
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
						n_result, __FILE__, __LINE__);
					return false;
				}
				if(!b_verify_result) {
					fprintf(stderr, "error: tile carry results are incorrect\n");
					return false;
				}
				// otherwise not computed
			}

			n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_tile_flags_cpu.begin(),
				m_tile_flags_cpu.end(), m_dp_tile_flags, 0, "tile flags");
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
					n_result, __FILE__, __LINE__);
				return false;
			}
			if(!b_verify_result) {
				fprintf(stderr, "error: tile flags results are incorrect\n");
				return false;
			}

			n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_segreduce_cpu.begin(),
				m_segreduce_cpu.end(), m_dp_segreduce, 0, "segmented reduction");
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
					n_result, __FILE__, __LINE__);
				return false;
			}
			if(!b_verify_result) {
				/*size_t n_red_num = n_seg_num;
				m_scan.n_Enqueue_SegmentedReduce_Packed_NoAutoTune(cmd_queue, *const_cast<cl_mem*>(&m_dp_segreduce), n_red_num,
					n_red_num, m_dp_data, m_dp_packed_head_flags, n, m_r_glob_scan, CCLTempBufferStack(h_context));
				cmd_queue.n_Finish();

				n_result = cmd_queue.n_CompareBuffer_DebugVerbose(b_verify_result, m_segreduce_cpu.begin(),
					m_segreduce_cpu.end(), m_dp_segreduce, 0, "segmented reduction");
				if(n_result != CL_SUCCESS) {
					fprintf(stderr, "error: opencl[0].n_CompareBuffer() failed with: %d (%s, %d)\n",
						n_result, __FILE__, __LINE__);
					return false;
				}
				if(b_verify_result)
					fprintf(stderr, "error: segmented reduction results are incorrect due to flawed intermediates\n");
				else*/
					fprintf(stderr, "error: segmented reduction results are incorrect\n");
				// just a debug check

				return false;
			}
			// verify the global reduce result
		}
		// verify

		return true;
	}

	/**
	 *	@brief generates the input data for a benchmark run
	 *
	 *	Resizes m_data to the requested number of elements, fills it with
	 *	the values 1, 2, ..., n and then puts them in random order.
	 *
	 *	@param[in] r_input is the list of input dimensions; exactly one
	 *		entry (the number of elements) is expected (checked only by a debug assertion)
	 *
	 *	@return Always returns true.
	 *
	 *	@note This uses std::random_shuffle(), which was deprecated in C++14
	 *		and removed from the standard in C++17.
	 */
	virtual bool Prepare_Input(const std::vector<size_t> &r_input)
	{
		_ASSERTE(r_input.size() == 1);
		const size_t n_elem_num = r_input.front();
		// the single input dimension is the number of elements

		m_data.resize(n_elem_num);
		// make space for the elements

		for(size_t n_pos = 0, n_value = 1; n_pos < n_elem_num; ++ n_pos, ++ n_value)
			m_data[n_pos] = (uint32_t)n_value;
		// an ascending sequence, starting with one

		std::random_shuffle(m_data.begin(), m_data.end());
		// permute the sequence

		return true;
	}

	/**
	 *	@brief gets the identifier of the benchmark selected by m_n_bench_type
	 *	@return Returns a null-terminated string literal with the benchmark name
	 *		for the types 0, 1 and 2, or "(null)" for any other value.
	 */
	virtual const char *p_s_Benchmark_Id() const
	{
		switch(m_n_bench_type) {
		case 0:
			return "tiled_seg_reduce_pack";
		case 1:
			return "tiled_seg_reduce_interleaved";
		case 2:
			return "tiled_seg_reduce_natural";
		default:
			return "(null)"; // unknown benchmark type
		}
	}

	/**
	 *	@brief looks up an input size in the table of benchmarked sizes
	 *	@param[in] n_index is zero-based index of the input size
	 *	@return Returns the input size at the given index, or 0 if the index
	 *		is past the end of the table (the table has 20 entries, from 10 to 20000000).
	 */
	virtual const size_t n_InputSize_Lookup(size_t n_index) const
	{
		static const size_t p_input_size_list[] = {
			10, 20, 50,
			100, 200, 500,
			1000, 2000, 5000,
			10000, 20000, 50000,
			100000, 200000, 500000,
			1000000, 2000000, 5000000,
			10000000, 20000000
		};
		const size_t n_entry_num = sizeof(p_input_size_list) / sizeof(p_input_size_list[0]);
		// a 1-2-5 progression over several decades

		return (n_index < n_entry_num)? p_input_size_list[n_index] : 0;
	}

	/**
	 *	@brief looks up a workgroup size in the table of benchmarked sizes
	 *	@param[in] n_index is zero-based index of the workgroup size
	 *	@return Returns the workgroup size at the given index, or 0 if the index
	 *		is past the end of the table.
	 */
	virtual const size_t n_WorkGroupSize_Lookup(size_t n_index) const
	{
		static const size_t p_workgroup_size_list[] = {128, 192, 256, 512, 1024};
		const size_t n_entry_num = sizeof(p_workgroup_size_list) / sizeof(p_workgroup_size_list[0]);

		return (n_index < n_entry_num)? p_workgroup_size_list[n_index] : 0;
	}

	/**
	 *	@brief looks up a number of items per thread in the table of benchmarked values
	 *	@param[in] n_index is zero-based index of the items per thread value
	 *	@return Returns the number of items per thread at the given index, or 0 if the index
	 *		is past the end of the table.
	 */
	virtual const size_t n_ItemsPerThread_Lookup(size_t n_index) const
	{
		static const size_t p_items_per_thread_list[] = {
			1, 2, 3, 4, 5, 6, 7, 8 // 16 and 32 are currently disabled
		};
		const size_t n_entry_num = sizeof(p_items_per_thread_list) / sizeof(p_items_per_thread_list[0]);

		return (n_index < n_entry_num)? p_items_per_thread_list[n_index] : 0;
	}
};

#endif // !__OPENCL_TILED_SCAN_REDUCTION_AUTOTUNING_INCLUDED
