/*
								+--------------------------------+
								|                                |
								| *** Segmented op debugging *** |
								|                                |
								|  Copyright  -tHE SWINe- 2016  |
								|                                |
								|  SegmentedScanReduce_Debug.h   |
								|                                |
								+--------------------------------+
*/

#pragma once
#ifndef __OPENCL_SEGMENTED_SCAN_REDUCTION_DEBUGGING_INCLUDED
#define __OPENCL_SEGMENTED_SCAN_REDUCTION_DEBUGGING_INCLUDED

/**
 *	@file gpgpu/SegmentedScanReduce_Debug.h
 *	@date 2016
 *	@author -tHE SWINe-
 *	@brief segmented operation debugging helpers and benchmark generators
 */

#include "../CallStack.h"
#include <vector>
#include <algorithm>
#include "../RandGen.h"
#include "../StlUtils.h"

/**
 *	@brief namespace containing helper objects for debugging segmented reduce / scan
 */
namespace seg_debug {

/**
 *	@brief head flags array and the associated helper arrays for debugging
 */
struct THeadFlag_DebugInfo {
	std::vector<uint32_t> head_flags; /**< @brief a binary array of head flags (size <tt>n</tt>) */
	std::vector<size_t> head_indices; /**< @brief an array of indices of nonzero head flags, ignoring the first one (size up to <tt>n - 1</tt>) */
	std::vector<size_t> tail_indices; /**< @brief an array of indices of tail positions (positions preceding a nonzero head flag, including the implied last one, size up to <tt>n</tt>) */

	/**
	 *	@brief allocates a zeroed head flags array and empties both index arrays
	 *	@param[in] n is size of the input data for the segmented operation, in elements
	 *	@note This function throws std::bad_alloc.
	 */
	void Reset(size_t n) // throw(std::bad_alloc)
	{
		head_flags.assign(n, uint32_t(0));
		head_indices.clear();
		tail_indices.clear();
	}

	/**
	 *	@brief infers tail indices from head indices, performs some sanity checking
	 *
	 *	The head flags array is only used to get the input size; the tails are derived
	 *	from head_indices. A leading head index 0 is removed from head_indices (the head
	 *	at the first element is implied). Each remaining head index <tt>h</tt> yields the
	 *	tail <tt>h - 1</tt> and finally the implicit tail <tt>n - 1</tt> is appended.
	 *
	 *	@param[in] r is the number of reductions (the number of segments); it is only
	 *		used for sanity checking, head_indices is expected to hold <tt>r - 1</tt> items
	 *
	 *	@note The input must not be empty (there is no tail position in an empty array).
	 *	@note This function throws std::bad_alloc.
	 */
	void Fill_TailIndices(size_t r) // throw(std::bad_alloc)
	{
		const size_t n = head_flags.size();
		_ASSERTE(n > 0); // n - 1 below would wrap around for an empty input
		tail_indices.clear();

		if(!head_indices.empty() && !head_indices.front())
			head_indices.erase(head_indices.begin());
		// that one is implied

		_ASSERTE(head_indices.size() == r - 1);
		_ASSERTE(stl_ut::b_IsStrictlySorted(head_indices.begin(), head_indices.end()));
		_ASSERTE(head_indices.empty() || head_indices.back() < n);

		const size_t n_head_num = head_indices.size();
		for(size_t i = 0; i < n_head_num; ++ i) {
			if(head_indices[i]) // no tail before head index 0
				tail_indices.push_back(head_indices[i] - 1);
		}
		// iterate over the heads that are actually stored rather than over r, so that
		// a mismatched r cannot cause reading past the end of head_indices in builds
		// where the assertions are compiled out

		tail_indices.push_back(n - 1); // implicit tail at the end (always present)

		_ASSERTE((!head_indices.empty() && head_indices.front() == 0) || tail_indices.size() == r); // how many reductions will be written
		_ASSERTE((head_indices.empty() || head_indices.front() != 0) || tail_indices.size() == r - 1);
		_ASSERTE(stl_ut::b_IsStrictlySorted(tail_indices.begin(), tail_indices.end()));
		_ASSERTE(tail_indices.back() == n - 1); // the implicit tail, dumb to check
		// note that if head_indices[0] == 0 then head_indices.size() == tail_indices.size()
	}
};

/**
 *	@brief segmented operation head flag generator; benchmark with uniformly sized segments
 */
class CSegmentedOp_UniformSize_Benchmark {
public:
	/**
	 *	@brief calculates the number of different benchmarks that can be performed on data of a specified size
	 *	@param[in] n is size of the input data for the segmented operation, in elements
	 *	@return Returns the number of benchmarks (<tt>n - 1</tt>, or zero for empty input).
	 */
	size_t n_Benchmark_Num(size_t n) const
	{
		return (n)? n - 1 : 0; // n - 1 would wrap around to a huge count for n = 0
	}

	/**
	 *	@brief fills the head flags
	 *
	 *	Benchmark <tt>n_benchmark</tt> uses segments of <tt>n_benchmark + 1</tt> elements
	 *	(the last segment may be shorter). A head flag is raised at each multiple of the
	 *	segment size except at element 0, where the head is implied.
	 *
	 *	@param[out] r_head_flags is head flag array to be filled (it is overwritten rather than or-ed)
	 *	@param[in] n is size of the input data for the segmented operation, in elements
	 *	@param[in] n_benchmark is benchmark index in the [0, \ref n_Benchmark_Num(n)] interval
	 *		(one more than the nominal range is tolerated)
	 *
	 *	@return Returns the number of segments delimited by the generated head flags.
	 *
	 *	@note This function throws std::bad_alloc.
	 *
	 *	@todo It would be nicer if these benchmarks generated std::vector<uint32_t> of head
	 *		flags directly and the head flag indices would be inferred from that.
	 */
	size_t n_Generate_HeadFlags(THeadFlag_DebugInfo &r_head_flags, size_t n, size_t n_benchmark) const // throw(std::bad_alloc)
	{
		_ASSERTE(n_benchmark <= n_Benchmark_Num(n)); // make sure this is in range, allow one more
		r_head_flags.Reset(n);

		const size_t n_segment_size = n_benchmark + 1;
		_ASSERTE(n_segment_size); // otherwise will get division by zero / get stuck in an infinite loop

		const size_t n_segment_num = n / n_segment_size + ((n % n_segment_size)? 1 : 0);
		// round up; written this way so that the sum n + n_segment_size - 1 can not overflow

		for(size_t i = n_segment_size; i < n; i += n_segment_size) {
			r_head_flags.head_flags[i] = 1;
			r_head_flags.head_indices.push_back(i);
		}
		// starts at n_segment_size rather than at 0, the first head is implied

		return n_segment_num;
	}
};

/**
 *	@brief segmented operation head flag generator; benchmark with random segment positions (but not the number)
 */
class CSegmentedOp_Random_Benchmark {
public:
	/**
	 *	@copydoc CSegmentedOp_UniformSize_Benchmark::n_Benchmark_Num()
	 */
	size_t n_Benchmark_Num(size_t n) const
	{
		return n;
	}

	/**
	 *	@brief fills the head flags with <tt>n_benchmark + 1</tt> randomly placed segments
	 *
	 *	The head at element 0 is implied, so <tt>n_benchmark</tt> distinct head positions
	 *	are chosen from the [1, n) range. If at least about half of the positions are to
	 *	be used, all of them are listed and random ones are dropped; otherwise random
	 *	positions are drawn until enough distinct ones are collected.
	 *
	 *	@param[out] r_head_flags is head flag array to be filled (it is overwritten rather than or-ed)
	 *	@param[in] n is size of the input data for the segmented operation, in elements
	 *	@param[in] n_benchmark is benchmark index in the [0, \ref n_Benchmark_Num(n)) interval
	 *
	 *	@return Returns the number of segments delimited by the generated head flags.
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	size_t n_Generate_HeadFlags(THeadFlag_DebugInfo &r_head_flags, size_t n, size_t n_benchmark) const // throw(std::bad_alloc)
	{
		_ASSERTE(n_benchmark < n_Benchmark_Num(n)); // make sure this is in range
		r_head_flags.Reset(n);

		const size_t r = n_benchmark + 1;
		// the number of reductions

		std::vector<size_t> &r_head_indices = r_head_flags.head_indices;

		if(r >= n / 2) { // takes a long time to dart-throw
			r_head_indices.resize(n - 1);
			for(size_t i = 1; i < n; ++ i)
				r_head_indices[i - 1] = i; // no narrowing cast, the indices are size_t
			// build head indices, skip the first one since that one is implied

			while(r_head_indices.size() >= r) {
				r_head_indices.erase(r_head_indices.begin() +
					CUniformIntegerDistribution<size_t>(0,
					r_head_indices.size() - 1)(CCLibGenerator<false>())); // presumably an inclusive range - TODO confirm
			}
			// drop random elements until r - 1 explicit heads remain

			for(size_t i = 0, n_head_num = r_head_indices.size(); i < n_head_num; ++ i)
				r_head_flags.head_flags[r_head_indices[i]] = 1;
			// raise head flags
		} else {
			for(size_t i = 1; i < r; ++ i) {
				size_t n_where = CUniformIntegerDistribution<size_t>(1, n - 1)(CCLibGenerator<false>()); // not zero // don't care much about randomness bias here
				if(!stl_ut::t_Unique_Insert(r_head_indices, n_where).first)
					-- i; // not unique, this draw does not count
				else
					r_head_flags.head_flags[n_where] = 1;
			}
			// dart-throwing; presumably t_Unique_Insert() keeps the indices sorted - TODO confirm
		}

		return r;
	}
};

/**
 *	@brief segmented operation head flag generator; benchmark only two segments where the boundary travels left to right
 */
class CSegmentedOp_TravellingHead_Benchmark {
public:
	/**
	 *	@brief calculates the number of different benchmarks that can be performed on data of a specified size
	 *	@param[in] n is size of the input data for the segmented operation, in elements
	 *	@return Returns the number of benchmarks (<tt>n - 1</tt>, or zero for empty input).
	 */
	size_t n_Benchmark_Num(size_t n) const
	{
		return (n)? n - 1 : 0; // n - 1 possible positions of the head; do not wrap around for n = 0
	}

	/**
	 *	@brief fills the head flags with a single head at element <tt>n_benchmark + 1</tt>
	 *
	 *	@param[out] r_head_flags is head flag array to be filled (it is overwritten rather than or-ed)
	 *	@param[in] n is size of the input data for the segmented operation, in elements
	 *	@param[in] n_benchmark is benchmark index in the [0, \ref n_Benchmark_Num(n)) interval
	 *
	 *	@return Returns the number of segments delimited by the generated head flags (always two).
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	size_t n_Generate_HeadFlags(THeadFlag_DebugInfo &r_head_flags, size_t n, size_t n_benchmark) const // throw(std::bad_alloc)
	{
		_ASSERTE(n_benchmark < n_Benchmark_Num(n)); // make sure this is in range
		r_head_flags.Reset(n);

		const size_t n_where = n_benchmark + 1; // never zero, the head always splits the range
		r_head_flags.head_indices.push_back(n_where);
		r_head_flags.head_flags[n_where] = 1;

		return 2; // the implied head at 0 and the one at n_where
	}
};

/**
 *	@brief segmented operation head flag generator; benchmark with a single or n segments
 */
class CSegmentedOp_SimplePatterns_Benchmark {
public:
	/**
	 *	@copydoc CSegmentedOp_UniformSize_Benchmark::n_Benchmark_Num()
	 */
	size_t n_Benchmark_Num(size_t n) const
	{
		return 2;
	}

	/**
	 *	@copydoc CSegmentedOp_UniformSize_Benchmark::n_Generate_HeadFlags()
	 */
	size_t n_Generate_HeadFlags(THeadFlag_DebugInfo &r_head_flags, size_t n, size_t n_benchmark) const // throw(std::bad_alloc)
	{
		_ASSERTE(n_benchmark < n_Benchmark_Num(n)); // make sure this is in range
		r_head_flags.Reset(n);
		if(!n_benchmark)
			return 1; // no head flags, 1 segment
		else
			return CSegmentedOp_UniformSize_Benchmark().n_Generate_HeadFlags(r_head_flags, n, 0); // each item is a segment
	}
};

/**
 *	@brief segmented operation head flag generator; benchmark with exhaustive combination
 *		of segment heads at the first, middle or the last element of each tile
 */
class CSegmentedOp_BeginEndMiddle_Benchmark {
protected:
	const size_t m_n_tile_size; /**< @brief tile size */

public:
	/**
	 *	@brief constructor; specifies tile size
	 *	@param[in] n_tile_size is tile size, in elements (must be nonzero)
	 */
	CSegmentedOp_BeginEndMiddle_Benchmark(size_t n_tile_size)
		:m_n_tile_size(n_tile_size)
	{
		_ASSERTE(n_tile_size > 0); // used as a divisor below
	}

	/**
	 *	@brief calculates the number of different benchmarks that can be performed on data of a specified size
	 *
	 *	Each tile contributes three candidate head positions (its first, middle and last
	 *	element), the very first one being implied. Each of the remaining positions is
	 *	driven by one bit of the benchmark index; at most 16 bits are enumerated.
	 *
	 *	@param[in] n is size of the input data for the segmented operation, in elements
	 *		(must be nonzero, otherwise the bit count below wraps around)
	 *
	 *	@return Returns the number of benchmarks.
	 */
	size_t n_Benchmark_Num(size_t n) const
	{
		const size_t n_tile_num = (n + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_bme_head_benchmark_bit_num = 3 * n_tile_num - 1;
		size_t n_bme_head_benchmark_len = ((n_bme_head_benchmark_bit_num > 16)?
			n_Mask(16) : n_Mask(n_bme_head_benchmark_bit_num)) + 1; // presumably n_Mask(b) has the b low bits set - TODO confirm
		return n_bme_head_benchmark_len;
	}

	/**
	 *	@brief fills the head flags
	 *
	 *	Bit <tt>i - 1</tt> of the benchmark index decides whether there is a head at the
	 *	candidate position <tt>i</tt>, where <tt>i / 3</tt> selects the tile and
	 *	<tt>i % 3</tt> selects the first (0), middle (1) or last (2) element of that tile.
	 *
	 *	@param[out] r_head_flags is head flag array to be filled (it is overwritten rather
	 *		than or-ed); its tail indices are filled as well
	 *	@param[in] n is size of the input data for the segmented operation, in elements
	 *	@param[in] n_benchmark is benchmark index in the [0, \ref n_Benchmark_Num(n)) interval
	 *
	 *	@return Returns the number of segments delimited by the generated head flags.
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	size_t n_Generate_HeadFlags(THeadFlag_DebugInfo &r_head_flags, size_t n, size_t n_benchmark) const // throw(std::bad_alloc)
	{
		_ASSERTE(n_benchmark < n_Benchmark_Num(n)); // make sure this is in range
		r_head_flags.Reset(n);

		const size_t n_tile_num = (n + m_n_tile_size - 1) / m_n_tile_size;
		size_t n_bits = n_benchmark; // same width as the index, no narrowing
		for(size_t i = 1; i < n_tile_num * 3; ++ i, n_bits >>= 1) { // starts at 1, the first head is implied
			if(!(n_bits & 1))
				continue;
			// this candidate position is not used in this benchmark

			const size_t n_tile = i / 3; // which tile
			const size_t n_tile_size = (n_tile + 1 == n_tile_num)?
				n - (n_tile_num - 1) * m_n_tile_size : m_n_tile_size; // the last tile may be shorter
			const size_t n_where = n_tile * m_n_tile_size +
				((i % 3 == 0)? 0 : // at the beginning of the tile
				(i % 3 == 1)? n_tile_size / 2 : // in the middle of the tile
				n_tile_size - 1); // at the tile end
			if(!n_where)
				continue;
			// flag at 0 implied; n_where may still be 0 if n is very small

			if(stl_ut::t_Unique_Insert(r_head_flags.head_indices, n_where).first) // may repeat values if the last segment is very small
				r_head_flags.head_flags[n_where] = 1;
			// the index is passed as size_t, without truncating it to 32 bits
		}

		r_head_flags.Fill_TailIndices(r_head_flags.head_indices.size() + 1); // debug, just to see

		_ASSERTE(r_head_flags.head_indices.empty() || r_head_flags.head_indices.front() != 0); // the implied one is excluded
		return r_head_flags.head_indices.size() + 1;
	}
};

/**
 *	@brief reference segmented operation implementations
 */
class CReference {
public:
	/**
	 *	@brief global segmented scan (inclusive, additive)
	 *
	 *	@param[out] r_dest is resized to the size of the input and filled with the scan
	 *	@param[in] r_data is input data
	 *	@param[in] r_head_flags is array of head flags, a nonzero value restarts the running
	 *		sum at that element (needs to have at least as many elements as r_data)
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	static void Segmented_Scan(std::vector<uint32_t> &r_dest,
		const std::vector<uint32_t> &r_data, const std::vector<uint32_t> &r_head_flags) // throw(std::bad_alloc)
	{
		const size_t n_elem_num = r_data.size();
		r_dest.resize(n_elem_num);

		uint32_t n_running_sum = 0;
		for(size_t n_pos = 0; n_pos < n_elem_num; ++ n_pos) {
			const uint32_t n_carry = (r_head_flags[n_pos])? 0 : n_running_sum;
			// a head flag discards whatever was accumulated in the previous segment

			n_running_sum = n_carry + r_data[n_pos];
			r_dest[n_pos] = n_running_sum; // inclusive
		}
	}

	/**
	 *	@brief global segmented reduction (additive)
	 *
	 *	@param[out] r_dest is cleared and filled with one sum per segment, in segment order
	 *	@param[in] r_data is input data
	 *	@param[in] r_head_flags is array of head flags, a nonzero value starts a new segment
	 *		at that element (needs to have at least as many elements as r_data)
	 *
	 *	@note The head at element 0 is implied; raising the flag at element 0 explicitly
	 *		makes no difference (it does not produce an empty leading segment).
	 *	@note For an empty input, a single zero reduction is written.
	 *	@note This function throws std::bad_alloc.
	 */
	static void Segmented_Reduce(std::vector<uint32_t> &r_dest,
		const std::vector<uint32_t> &r_data, const std::vector<uint32_t> &r_head_flags) // throw(std::bad_alloc)
	{
		r_dest.clear();
		uint32_t n_accum = 0;
		for(size_t i = 0, n = r_data.size(); i < n; ++ i) {
			if(i && r_head_flags[i]) { // a head at i > 0 closes the segment ending at i - 1
				r_dest.push_back(n_accum);
				n_accum = 0;
			}
			n_accum += r_data[i];
		}
		r_dest.push_back(n_accum); // the last segment is closed by the end of the data
	}

	/**
	 *	@brief bit head flags to packed head flags conversion, optionally with interleaving
	 *
	 *	The output has one 32-bit word per 32 elements of the input, after the input size
	 *	was rounded up using n_Align_Up() with the tile size. Without interleaving, the flag
	 *	of element <tt>i</tt> goes to bit <tt>i % 32</tt> of word <tt>i / 32</tt>. With
	 *	interleaving, each tile is split among threads which handle
	 *	<tt>n_tile_size / n_workgroup_size</tt> consecutive elements each; the flag of
	 *	the <tt>e</tt>-th element of thread <tt>t</tt> goes to bit <tt>t % 32</tt> of
	 *	word <tt>e * (n_workgroup_size / 32) + t / 32</tt> of that tile.
	 *
	 *	@param[out] r_packed_head_flags is overwritten with the packed head flags
	 *	@param[in] r_head_flags is array of head flags, one element per flag (nonzero means set)
	 *	@param[in] n_tile_size is tile size in elements
	 *	@param[in] n_workgroup_size is workgroup size in threads (only needed if b_interleave is set)
	 *	@param[in] b_interleave is interleaved flags flag (if set, the resulting flags will
	 *		be interleaved, otherwise they will be stored in the order of the elements)
	 *
	 *	@note Presumably n_tile_size is a multiple of 32 (and of n_workgroup_size, itself
	 *		a multiple of 32, if interleaving); the indexing of the output relies on that - TODO confirm.
	 *	@note This function throws std::bad_alloc.
	 */
	static void Pack_HeadFlags(std::vector<uint32_t> &r_packed_head_flags,
		const std::vector<uint32_t> &r_head_flags, const size_t n_tile_size,
		const size_t n_workgroup_size, bool b_interleave)
	{
		enum {
			n_warp_size = 32
		};

		const size_t n = r_head_flags.size();
		const size_t n_flags_size = n_Align_Up(n, n_tile_size) / n_warp_size;
		r_packed_head_flags.assign(n_flags_size, uint32_t(0));

		if(b_interleave) {
			const size_t n_vpt = n_tile_size / n_workgroup_size; // values per thread
			const size_t n_warp_num = n_workgroup_size / n_warp_size; // warps per workgroup
			const size_t n_tile_warp_num = n_tile_size / n_warp_size; // packed words per tile

			for(size_t i = 0; i < n; ++ i) {
				if(!r_head_flags[i])
					continue;
				// the output is zeroed, only the raised flags need to be written

				const size_t n_tile = i / n_tile_size;
				const size_t j = i % n_tile_size; // position in the tile
				const size_t n_thread_elem = j % n_vpt;
				const size_t n_thread = j / n_vpt;
				const size_t n_warp = n_thread / n_warp_size;
				const size_t n_lane = n_thread % n_warp_size;
				r_packed_head_flags[n_tile_warp_num * n_tile +
					n_warp + n_warp_num * n_thread_elem] |= uint32_t(1) << n_lane;
				// shift an unsigned one, lane 31 must not end up in the sign bit of an int
			}
		} else {
			for(size_t i = 0; i < n; ++ i) {
				if(r_head_flags[i])
					r_packed_head_flags[i / n_warp_size] |= uint32_t(1) << (i % n_warp_size);
			}
		}
	}

	/*static void Segmented_SpineScan(std::vector<uint32_t> &r_dest,
		const std::vector<uint32_t> &r_data, const std::vector<uint32_t> &r_head_flags) // throw(std::bad_alloc)
	{
		r_dest.resize(r_data.size());
		uint32_t n_accum = 0;
		for(size_t i = 0, n = r_data.size(); i < n; ++ i) {
			r_dest[i] = n_accum;
			// the first head flag in the receiving tile will zero it

			if(r_head_flags[i]) // delay this
				n_accum = 0;

			n_accum += r_data[i]; // exclusive
		}
	}*/

	/**
	 *	@brief global segmented spine scan
	 *
	 *	The result equals the inclusive segmented scan (see Segmented_Scan()) shifted
	 *	by one element towards the end: the first entry is zero and entry <tt>i</tt>
	 *	holds the inclusive scan of element <tt>i - 1</tt>. It is therefore an exclusive
	 *	scan, except for the entries under head flags, which hold the total of the
	 *	preceding segment rather than zero.
	 *
	 *	@param[out] r_dest is resized to the size of the input and filled with the scan
	 *		(it is left empty if the input is empty)
	 *	@param[in] r_data is input data
	 *	@param[in] r_head_flags is array of head flags (needs to have at least as many
	 *		elements as r_data)
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	static void Segmented_SpineScan(std::vector<uint32_t> &r_dest,
		const std::vector<uint32_t> &r_data, const std::vector<uint32_t> &r_head_flags) // throw(std::bad_alloc)
	{
		const size_t n = r_data.size();
		r_dest.resize(n);

		uint32_t n_accum = 0;
		for(size_t i = 0; i < n; ++ i) {
			const uint32_t n_prev = n_accum;
			// inclusive scan of the preceding element (zero for the first one)

			if(r_head_flags[i])
				n_accum = 0;
			n_accum += r_data[i];
			// advance the inclusive scan

			r_dest[i] = n_prev;
			// written last so that the function also works in place
		}
		// a single pass, no need to erase the last / insert the first element
		// (which would not be possible for an empty input)
	}

	/**
	 *	@brief global segmented tile scan with carry inputs
	 *
	 *	The input is processed in tiles of n_tile_size elements (the last one may be
	 *	shorter). The running sum of each tile starts at the carry-in of that tile
	 *	and is reset to zero by each head flag.
	 *
	 *	@param[out] r_dest is allocated and filled with global segmented scan of the input
	 *	@param[in] r_data is input data
	 *	@param[in] r_head_flags is boolean array of head flags (needs to have at least
	 *		as many elements as r_data)
	 *	@param[in] carry_ins is array of tile partial carry (postprocessed by
	 *		\ref Segmented_SpineScan() to yield global scan), one element per tile
	 *	@param[in] n_tile_size is tile size (must be nonzero)
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	static void Segmented_TileScan_Downsweep(std::vector<uint32_t> &r_dest,
		const std::vector<uint32_t> &r_data, const std::vector<uint32_t> &r_head_flags,
		const std::vector<uint32_t> &carry_ins, const size_t n_tile_size) // throw(std::bad_alloc)
	{
		const size_t n = r_data.size();
		r_dest.resize(n);

		for(size_t i = 0, n_tile = 0; i < n; ++ n_tile) {
			const size_t n_end = (n - i > n_tile_size)? i + n_tile_size : n;
			// one past the last element of this tile; compares the remaining size
			// so that i + n_tile_size is only evaluated where it can not overflow

			uint32_t n_accum = carry_ins[n_tile]; // read carry ins
			for(; i < n_end; ++ i) {
				if(r_head_flags[i])
					n_accum = 0; // the carry does not propagate past a head
				n_accum += r_data[i];
				r_dest[i] = n_accum;
			}
		}
	}

	/**
	 *	@brief segmented tile scan
	 *
	 *	The input is processed in tiles of n_tile_size elements (the last one may be
	 *	shorter); the running sum starts at zero in each tile and is reset by each head flag.
	 *
	 *	A tile is assumed to be processed by n_workgroup_size threads, each handling
	 *	<tt>n_tile_size / n_workgroup_size</tt> consecutive elements, with 32 consecutive
	 *	threads forming a warp. Bit <tt>w</tt> of the tile head flags is set if any
	 *	element handled by warp <tt>w</tt> has its head flag raised.
	 *
	 *	The tail count of a tile is the number of its elements that are last in their
	 *	segment, i.e. those followed by a head flag or by the end of the data (including
	 *	the last element of the tile, if the next tile starts with a head flag).
	 *
	 *	@param[out] r_dest is allocated and filled with segmented tile scan of the input (not a global segmented scan)
	 *	@param[out] r_tile_tail_counts is allocated and filled with tail count of each tile
	 *	@param[out] r_tile_carry_outs is allocated and filled with tile partial carry
	 *	@param[out] r_tile_head_flags is allocated and filled with reduced head flags of each tile (using base 32 reduction)
	 *	@param[in] r_data is input data
	 *	@param[in] r_head_flags is boolean array of head flags (needs to have at least
	 *		as many elements as r_data)
	 *	@param[in] n_tile_size is tile size (must be nonzero)
	 *	@param[in] n_workgroup_size is the number of threads in a GPU workgroup (needed for
	 *		calculation of the tile head flags; must be nonzero and not greater than n_tile_size)
	 *
	 *	@note This function throws std::bad_alloc.
	 */
	static void Segmented_TileScan(std::vector<uint32_t> &r_dest,
		std::vector<uint32_t> &r_tile_tail_counts, std::vector<uint32_t> &r_tile_carry_outs,
		std::vector<uint32_t> &r_tile_head_flags, const std::vector<uint32_t> &r_data,
		const std::vector<uint32_t> &r_head_flags, const size_t n_tile_size,
		const size_t n_workgroup_size) // throw(std::bad_alloc)
	{
		enum {
			n_warp_size = 32
		};

		const size_t n = r_data.size();
		const size_t n_tile_num = (n + n_tile_size - 1) / n_tile_size;

		r_dest.resize(n);
		r_tile_tail_counts.resize(n_tile_num);
		r_tile_carry_outs.resize(n_tile_num);
		r_tile_head_flags.resize(n_tile_num);

		if(!n)
			return;
		// nothing to scan (and the workgroup size is not needed)

		const size_t n_items_thread = n_tile_size / n_workgroup_size;
		// the number of elements handled by each thread, the same for all the tiles

		for(size_t i = 0, n_tile = 0; i < n; ++ n_tile) {
			const size_t n_end = (n - i > n_tile_size)? i + n_tile_size : n;
			// one past the last element of this tile

			uint32_t n_accum = 0;
			uint32_t n_tail_num = 0;
			uint32_t n_warp_heads = 0;
			for(size_t l = 0; i < n_end; ++ i, ++ l) { // l is position in the tile
				if(r_head_flags[i]) {
					const size_t n_handling_thread = l / n_items_thread;
					const size_t n_warp = n_handling_thread / n_warp_size;
					n_warp_heads |= uint32_t(1) << n_warp; // that's a head

					if(l)
						++ n_tail_num; // the preceding element of this tile is a tail

					n_accum = 0;
				}
				n_accum += r_data[i];
				r_dest[i] = n_accum;
			}

			if(i == n || r_head_flags[i]) // i is now one past the current tile
				++ n_tail_num; // the last element of this tile is a tail

			r_tile_carry_outs[n_tile] = n_accum;
			r_tile_head_flags[n_tile] = n_warp_heads;
			r_tile_tail_counts[n_tile] = n_tail_num;
		}
	}

	/*void Segmented_TileScan_HF(std::vector<uint32_t> &r_dest, std::vector<uint32_t> &carry_outs,
		std::vector<uint32_t> &tail_counts, const std::vector<uint32_t> &r_data,
		const std::vector<uint32_t> &r_head_flags) const // throw(std::bad_alloc)
	{
		r_dest.resize(r_data.size());
		carry_outs.resize((r_data.size() + n_tile_size - 1) / n_tile_size);
		tail_counts.resize((r_data.size() + n_tile_size - 1) / n_tile_size);
		for(size_t i = 0, n = r_data.size(), n_tile = 0; i < n; ++ n_tile) {
			uint32_t n_accum = 0;
			uint32_t n_tail_num = 0, n_warp_tails = 0;
			uint32_t n_head_num = 0, n_warp_heads = 0;
			size_t l = 0;
			for(size_t n_end = min(n, i + n_tile_size); i < n_end; ++ i, ++ l) {
				if(r_head_flags[i]) {
					{
						{
							size_t n_items_thread = n_tile_size / n_workgroup_size;
							size_t n_handling_thread = l / n_items_thread;
							size_t n_warp = n_handling_thread / 32;
							n_warp_heads |= 1 << n_warp;
						}
						if(l) {
							size_t n_items_thread = n_tile_size / n_workgroup_size;
							size_t n_handling_thread = (l - 1) / n_items_thread;
							size_t n_warp = n_handling_thread / 32;
							n_warp_tails |= 1 << n_warp;
							++ n_tail_num; // that's a tail
						}
						++ n_head_num; // that's a head
					}
					n_accum = 0;
				}
				n_accum += r_data[i];
				r_dest[i] = n_accum;
			}
			if(i == n || r_head_flags[i]) { // i is now one past the current segment
				++ n_tail_num; // that's a tail
				{
					size_t n_items_thread = n_tile_size / n_workgroup_size;
					size_t n_handling_thread = (l - 1) / n_items_thread;
					size_t n_warp = n_handling_thread / 32;
					n_warp_tails |= 1 << n_warp;
				}
				if(i == n && n % n_tile_size)
					n_accum = 0; // the last tile in the GPU kernel clears the carry
			}
			carry_outs[n_tile] = n_accum;
			tail_counts[n_tile] = n_warp_heads; //n_head_num;// store other
		}
	}*/
};

/**
 *	@brief compares two arrays element by element and reports the differences to stderr
 *
 *	@tparam A is type of the first array (needs to support indexing by size_t)
 *	@tparam B is type of the second array (needs to support indexing by size_t)
 *
 *	@param[in] a is the array to be checked
 *	@param[in] b is the reference array (its values are reported as "should be")
 *	@param[in] n is the number of elements to compare (both arrays need to have at least n elements)
 *	@param[in] p_s_array_name is null-terminated string with the array name, used in the messages
 *	@param[in] n_max_error_line_num is limit of the number of reported differences; only
 *		the first <tt>n_max_error_line_num - 1</tt> differing elements are printed
 *		(NOTE(review): possibly the summary line is meant to count towards the limit - confirm)
 *
 *	@return Returns the number of differing elements (not limited by n_max_error_line_num).
 *
 *	@note The values are printed as unsigned 32-bit integers; wider values are truncated
 *		in the messages (the comparison itself uses the original values).
 */
template <class A, class B>
static inline size_t n_Debug_CompareArrays_Verbose(const A &a, const B &b, size_t n,
	const char *p_s_array_name, size_t n_max_error_line_num = 100)
{
	size_t n_error_num = 0;
	for(size_t i = 0; i < n; ++ i) {
		if(a[i] != b[i]) {
			if(++ n_error_num < n_max_error_line_num) {
				fprintf(stderr, "error: %s[" PRIsize "] = %u (should be %u)\n",
					p_s_array_name, i, static_cast<unsigned int>(a[i]),
					static_cast<unsigned int>(b[i]));
				// the casts make the arguments match %u regardless of the element type
			}
		}
	}
	if(n_error_num)
		fprintf(stderr, "error: %s had " PRIsize " error(s)\n", p_s_array_name, n_error_num);
	return n_error_num;
}

} // ~seg_debug

#endif // !__OPENCL_SEGMENTED_SCAN_REDUCTION_DEBUGGING_INCLUDED
