/**
 *	@file ScanKernels.c
 *	@brief parallel scan OpenCL kernels
 *
 *	@date 2014-11-21
 *
 *	Fixed type of n_sum to be _TyScalar (would fail when calculating floating point scans).
 *
 *	@date 2014-11-24
 *
 *	Added inclusive scan routines (perhaps too naive approach, might have to read about that some more).
 *
 *	@date 2015-12-03
 *
 *	Optimized performance of scans using a hierarchy of unrolled synchronization free warp scans rather
 *	than having one large cooperative scan with a lot of thread block synchronizations.
 *
 */

#pragma OPENCL EXTENSION cl_khr_fp64: enable
#pragma OPENCL EXTENSION cl_nv_pragma_unroll: enable
// want to use doubles and unroll loops

/**
 *	@def TWOWORD
 *	@brief macro that expands to its two arguments separated by whitespace
 *		(enables passing two words through where a single macro argument is expected)
 */
#define TWOWORD(a,b) a b

/**
 *	@def CONCAT2
 *	@brief inner macro for token pasting (use \ref CONCAT instead)
 */
#define CONCAT2(pre,post) pre##post

/**
 *	@def CONCAT
 *	@brief macro for token pasting (the arguments are macro-expanded before pasting)
 */
#define CONCAT(pre,post) CONCAT2(pre, post)

/**
 *	@def STATIC_ASSERT
 *
 *	@param[in] b_condition is asserted condition (must be a compile-time constant)
 *	@param[in] message_token is message token (use upper-case, must be a valid "C" identifier)
 *
 *	@brief static assertion macro
 *
 *	@note This declares a typedef of a structure containing an array named message_token;
 *		the array has size 1 if the condition holds and size -1 (which is always an error)
 *		if it does not. A size of 0 is not used for the failing case, as zero-length arrays
 *		are accepted by some compilers as an extension and the assertion would then never fire.
 */
#define STATIC_ASSERT(b_condition,message_token) \
	typedef struct { int message_token[(b_condition)? 1 : -1]; } CONCAT(static_assertion_helper_, __COUNTER__)
//	typedef struct { int message_token : !!(b_condition); } CONCAT(static_assertion_helper_, __COUNTER__) // new CUDA won't compile bitfields anymore

//#define __restrict
// restrict consistently gains higher rates, but it can be disabled by uncommenting the (empty) macro definition above
// none of the kernels reads its writes to the global memory so it doesn't break the computation

// === bit twiddles for OpenCL ===

/**
 *	@brief sums even and odd bits in a 32-bit integer constant
 *	@param[in] x is 32-bit integer constant
 *	@return Returns a number with bit pairs, containing sums
 *		of ones in the same corresponding bits in the original number.
 */
#define n_Sum_EvenOddBits(x) ((x) - (((x) >> 1) & 0x55555555U))

/**
 *	@brief sums bit pairs in a 32-bit integer constant
 *	@param[in] x is 32-bit integer constant
 *	@return Returns a number with nibbles, each containing a sum
 *		of values of two corresponding bit pairs in the original number.
 */
#define n_Sum_BitPairs(x) (((x) & 0x33333333U) + (((x) >> 2) & 0x33333333U))

/**
 *	@brief sums bit nibbles in a 32-bit integer constant
 *	@param[in] x is 32-bit integer constant
 *	@return Returns a number with each byte containing a sum
 *		of values of its two nibbles in the original number.
 */
#define n_Sum_Nibbles(x) (((x) + ((x) >> 4)) & 0x0f0f0f0fU)

/**
 *	@brief sums values of bytes in a 32-bit integer constant
 *	@param[in] x is 32-bit integer constant
 *	@return Returns a sum of values of the four bytes of the original number.
 *	@note The result is masked to the low byte so that it stays correct also when
 *		evaluated in arithmetic wider than 32 bits (e.g. in preprocessor conditions,
 *		where the product is not truncated to 32 bits).
 */
#define n_Sum_Bytes(x) ((((x) * 0x01010101U) >> 24) & 0xffU)

/**
 *	@brief counts set bits in a 32-bit integer constant
 *	@param[in] x is 32-bit integer constant
 *	@return Returns number of bits that are set.
 */
#define n_SetBit_Num(x) n_Sum_Bytes(n_Sum_Nibbles(n_Sum_BitPairs(n_Sum_EvenOddBits(x))))

/**
 *	@brief determines whether a number is power of two, or not, can be evaluated at compile-time
 *	@param[in] n_x is number being tested. note it must be positive
 *		(zero is not rejected and evaluates as true)
 *	@return Returns true if n_x is power of two, otherwise returns false.
 */
#define b_Is_POT(n_x) (!((n_x) & ((n_x) - 1)))

/**
 *	@brief set all bits after the first leading bit in each nibble
 *	@param[in] x is 32-bit integer constant
 *	@return Returns the number with the set bits duplicated
 *		towards LSB in each nibble of the input.
 */
#define n_RightFill_4(x) ((x) | ((x) >> 1) | (((x) | ((x) >> 1)) >> 2))

/**
 *	@brief set all bits after the first leading bit in each byte
 *	@param[in] x is 32-bit integer constant
 *	@return Returns the number with the set bits duplicated
 *		towards LSB in each byte of the input.
 */
#define n_RightFill_8(x) (n_RightFill_4(x) | (n_RightFill_4(x) >> 4))

/**
 *	@brief set all bits after the first leading bit in each short
 *	@param[in] x is 32-bit integer constant
 *	@return Returns the number with the set bits duplicated
 *		towards LSB in each short of the input.
 */
#define n_RightFill_16(x) (n_RightFill_8(x) | (n_RightFill_8(x) >> 8))

/**
 *	@brief set all bits after the first leading bit in the input
 *	@param[in] x is 32-bit integer constant
 *	@return Returns the input number with the set bits duplicated towards LSB.
 */
#define n_RightFill_Int(x) (n_RightFill_16(x) | (n_RightFill_16(x) >> 16))

/**
 *	@brief calculates power of two greater or equal to the argument
 *
 *	@param[in] x is 32-bit integer constant
 *
 *	@return Returns power of two greater or equal to the input.
 *
 *	@note In case x is unsigned and greater than the largest power of two
 *		representable by its type, the result wraps around to zero.
 *	@note In case x is signed and greater than the largest power of two
 *		representable by its type, the addition overflows (the original note
 *		expects the maximum negative value; this should not be relied upon).
 */
#define n_Make_POT(x) (n_RightFill_Int((x) - 1) + 1)

/**
 *	@brief calculates power of two lower or equal to the argument
 *	@param[in] x is 32-bit integer constant
 *	@return Returns power of two lower or equal to the input
 *		(evaluates to 1 for a zero input).
 */
#define n_Make_Lower_POT(x) (n_RightFill_Int((x) >> 1) + 1)

/**
 *	@brief calculates base-2 logarithm (round down)
 *	@param[in] x is 32-bit integer constant
 *	@return Returns base-2 logarithm of the input (zero for non-positive inputs).
 */
#define n_Log2(x) (((x) > 0) * n_SetBit_Num(n_Make_Lower_POT(x) - 1))

// === ~bit twiddles for OpenCL ===

// type of the scanned elements; SCAN_SCALAR_TYPE is not defined anywhere in the visible part
// of this file (presumably supplied on the compiler commandline, like the sizes below - confirm)
typedef SCAN_SCALAR_TYPE _TyScalar;

#define LOCAL_MEMORY_BANK_NUM 32 // 32 on SM 2.0+, right?
#define WARP_SIZE 32 // NV so far, important for scheduling

//#define SCAN_LOCAL_WORK_SIZE 256 // best on GTX-680 (over 74 GB/sec, over 92 GB/sec on GTX-780)
//#define SCAN_BLOCK_SIZE 1024 // or just (2 * SCAN_LOCAL_WORK_SIZE)
// configured via compiler commandline

#if !b_Is_POT(LOCAL_MEMORY_BANK_NUM)
#error "LOCAL_MEMORY_BANK_NUM must be power of two"
#endif // !(b_Is_POT(LOCAL_MEMORY_BANK_NUM))

// base-2 logarithm of the bank count, number of elements processed by each thread
// of a workgroup, and half of that (the number of element pairs per thread)
#define LOG_LOCAL_MEMORY_BANK_NUM  n_Log2(LOCAL_MEMORY_BANK_NUM)
#define SCAN_ITEMS_THREAD  ((SCAN_BLOCK_SIZE) / (SCAN_LOCAL_WORK_SIZE))
#define SCAN_ITEMS_THREAD_HALF  ((SCAN_ITEMS_THREAD) / 2)
// could be enums, but preprocessor conditions fail if they are :-/

// local memory padding: one extra element is skipped after every LOCAL_MEMORY_BANK_NUM
// elements; CONFLICT_FREE_OFFSET(x) is the number of skipped elements before index x
// and CONFLICT_FREE_INDEX(x) is the index x translated to the padded array
#define CONFLICT_FREE_OFFSET(x) ((x) >> LOG_LOCAL_MEMORY_BANK_NUM)
#define CONFLICT_FREE_INDEX(x) ((x) + CONFLICT_FREE_OFFSET((x)))

// note that the check below currently has no effect as its #error is commented out
#if !SCAN_LOCAL_WORK_SIZE || SCAN_BLOCK_SIZE % SCAN_LOCAL_WORK_SIZE != 0 || SCAN_BLOCK_SIZE / SCAN_LOCAL_WORK_SIZE < 2
//#error "SCAN_BLOCK_SIZE must be multiple of SCAN_LOCAL_WORK_SIZE, and at least double"
#endif // !SCAN_LOCAL_WORK_SIZE || SCAN_BLOCK_SIZE % SCAN_LOCAL_WORK_SIZE != 0 || SCAN_BLOCK_SIZE / SCAN_LOCAL_WORK_SIZE < 2
// must be a multiple of SCAN_LOCAL_WORK_SIZE

#if !b_Is_POT(SCAN_BLOCK_SIZE)
#error "SCAN_BLOCK_SIZE must be power of two"
#endif // !(b_Is_POT(SCAN_BLOCK_SIZE))

/**
 *	@brief enables kernel to force a specified workgroup size
 *	@param[in] n_size is the required workgroup size in the first dimension
 *		(the other two dimensions are fixed to 1)
 *	@note In case the launch workgroup size doesn't match,
 *		CL_INVALID_WORK_GROUP_SIZE (-54) is returned from clEnqueueNDRangeKernel.
 */
#define REQUIRE_WG_SIZE(n_size) __attribute__((reqd_work_group_size(n_size, 1, 1)))

/*inline _TyScalar LocalExScan_SingleBlock_v0(__global const _TyScalar *__restrict p_array,
	__global _TyScalar *__restrict p_scan, const int li)
{
	__local _TyScalar p_workspace[SCAN_BLOCK_SIZE + CONFLICT_FREE_OFFSET(SCAN_BLOCK_SIZE - 1)];
	// add some padding to avoid bank conflicts
	// note that this can be inside, doesn't slow down when calling from kernel

	enum { ls = SCAN_LOCAL_WORK_SIZE }; // = get_local_size(0)

	#pragma unroll
	for(int j = 0, n_dest = li; j < SCAN_ITEMS_THREAD; ++ j, n_dest += ls)
		p_workspace[n_dest + CONFLICT_FREE_OFFSET(n_dest)] = p_array[n_dest];
	// copy the data from the array to local memory
	// unrolls just fine

	int n_step = 1;
	//#pragma unroll
	// unrolling both upsweep and downsweep loops bloats the code, runs much slower; unrolling
	// downsweep loop gains faster code than unrolling upsweep loop (-> unroll that, not this)
	for(int d = SCAN_BLOCK_SIZE >> 1; d > 0; d >>= 1) { // surprisingly unrolls
		if(d >= WARP_SIZE)
			barrier(CLK_LOCAL_MEM_FENCE);
		else
			write_mem_fence(CLK_LOCAL_MEM_FENCE); // helps somewhat
		int n_pair_index = li;
#if SCAN_ITEMS_THREAD_HALF > 2
		#pragma unroll
		for(int j = 0; j < SCAN_ITEMS_THREAD_HALF; ++ j) { // does not unroll; unroll manually
#endif // SCAN_ITEMS_THREAD_HALF > 2
			if(n_pair_index < d) {
				int n_index0 = n_step * (2 * n_pair_index + 1) - 1;
				int n_index1 = n_step * (2 * n_pair_index + 2) - 1;
				n_index0 += CONFLICT_FREE_OFFSET(n_index0);
				n_index1 += CONFLICT_FREE_OFFSET(n_index1);
				p_workspace[n_index1] += p_workspace[n_index0];
			}
#if SCAN_ITEMS_THREAD_HALF == 2
			n_pair_index += ls;
			if(n_pair_index < d) {
				int n_index0 = n_step * (2 * n_pair_index + 1) - 1;
				int n_index1 = n_step * (2 * n_pair_index + 2) - 1;
				n_index0 += CONFLICT_FREE_OFFSET(n_index0);
				n_index1 += CONFLICT_FREE_OFFSET(n_index1);
				p_workspace[n_index1] += p_workspace[n_index0];
			}
#elif SCAN_ITEMS_THREAD_HALF != 1
			n_pair_index += ls;
		}
#endif // SCAN_ITEMS_THREAD_HALF != 1
		n_step += n_step;
	}
	// build sum in place up the tree  

	write_mem_fence(CLK_LOCAL_MEM_FENCE); // required on K40
	_TyScalar n_sum = p_workspace[SCAN_BLOCK_SIZE - 1 + CONFLICT_FREE_OFFSET(SCAN_BLOCK_SIZE - 1)];
	read_mem_fence(CLK_LOCAL_MEM_FENCE); // required on K40
	// makes the divergent branch below shorter

	if(!li)
		p_workspace[SCAN_BLOCK_SIZE - 1 + CONFLICT_FREE_OFFSET(SCAN_BLOCK_SIZE - 1)] = 0;
	// clear the last element // note that it will contain the sum of elements

	// todo - fuse the last and first iteration, split the loops ...

	#pragma unroll
	for(int d = 1; d < SCAN_BLOCK_SIZE; d += d) { // surprisingly unrolls
		n_step >>= 1;
		if(d >= WARP_SIZE)
			barrier(CLK_LOCAL_MEM_FENCE);
		else
			write_mem_fence(CLK_LOCAL_MEM_FENCE); // helps somewhat
		int n_pair_index = li;
#if SCAN_ITEMS_THREAD_HALF > 2
		#pragma unroll
		for(int j = 0; j < SCAN_ITEMS_THREAD_HALF; ++ j) { // does not unroll; unroll manually
#endif // SCAN_ITEMS_THREAD_HALF > 2
			if(n_pair_index < d) {
				int n_index0 = n_step * (2 * n_pair_index + 1) - 1;
				int n_index1 = n_step * (2 * n_pair_index + 2) - 1;
				n_index0 += CONFLICT_FREE_OFFSET(n_index0);
				n_index1 += CONFLICT_FREE_OFFSET(n_index1);
				_TyScalar n_temp = p_workspace[n_index0];
				p_workspace[n_index0] = p_workspace[n_index1];
				p_workspace[n_index1] += n_temp;   
			}
#if SCAN_ITEMS_THREAD_HALF == 2
			n_pair_index += ls;
			if(n_pair_index < d) {
				int n_index0 = n_step * (2 * n_pair_index + 1) - 1;
				int n_index1 = n_step * (2 * n_pair_index + 2) - 1;
				n_index0 += CONFLICT_FREE_OFFSET(n_index0);
				n_index1 += CONFLICT_FREE_OFFSET(n_index1);
				_TyScalar n_temp = p_workspace[n_index0];
				p_workspace[n_index0] = p_workspace[n_index1];
				p_workspace[n_index1] += n_temp;   
			}
#elif SCAN_ITEMS_THREAD_HALF != 1
			n_pair_index += ls;
		}
#endif // SCAN_ITEMS_THREAD_HALF != 1
	}
	// traverse down the tree, build scan

	// t_odo - need to loop SCAN_BLOCK_SIZE / (SCAN_LOCAL_WORK_SIZE * 2) times inside each iteration to support more elems processed by a single thread

	barrier(CLK_LOCAL_MEM_FENCE);

	#pragma unroll
	for(int j = 0, n_dest = li; j < SCAN_ITEMS_THREAD; ++ j, n_dest += ls) // maybe overly complicated, but can optimize away
		p_scan[n_dest] = p_workspace[n_dest + CONFLICT_FREE_OFFSET(n_dest)];
	// copy the data back to the global memory
	// unrolls just fine

	return n_sum; // incorrect, except for thread 0
}*/

// god please bring CL2 and templates already
/**
 *	@def WARP_COOP_SCAN_TEMPLATE
 *
 *	@brief body of a cooperative in-place exclusive scan of a small array in local memory
 *
 *	This expands to a complete braced block which ends with a <tt>return</tt> statement,
 *	it is therefore intended to be the last (or the only) thing in the body of a function
 *	returning T. The scan runs in two phases: an up-sweep which builds partial sums
 *	in place, and a down-sweep which turns them to the exclusive scan. Between the phases,
 *	every thread reads the last element (the sum of all the elements) and the thread
 *	with zero index clears it. All the accesses to the workspace are translated
 *	using CONFLICT_FREE_INDEX().
 *
 *	Each thread handles up to n_scan_size / (2 * n_thread_num) element pairs per tree
 *	level (at least one); the first four are written out explicitly, the rest is a loop.
 *
 *	@param[in] _n_scan_size is number of elements to scan (compile-time constant)
 *	@param[in] _n_thread_num is number of cooperating threads (compile-time constant,
 *		statically asserted to not exceed WARP_SIZE and to divide _n_scan_size)
 *	@param[in] T is data type of the scanned elements
 *	@param[in,out] p_workspace is pointer to the (padded) array to be scanned
 *	@param[in] li is index of the calling thread among the cooperating threads
 *
 *	@note Only memory fences are used in here, no barriers. NOTE(review): this presumably
 *		relies on all the cooperating threads belonging to the same warp and executing
 *		in lockstep - confirm for the target hardware.
 *	@note NOTE(review): the tree indexing looks like it expects _n_scan_size
 *		to be a power of two, this is not asserted here - confirm.
 */
#define WARP_COOP_SCAN_TEMPLATE(_n_scan_size,_n_thread_num,T,p_workspace,li)										\
	{																												\
		enum {																										\
			n_scan_size = (_n_scan_size),																			\
			n_thread_num = (_n_thread_num), /* make sure those are compile-time consts */							\
			n_item_per_thread_num = (n_scan_size >= 2 * n_thread_num)? n_scan_size / (2 * n_thread_num) : 1			\
			/* the extra threads will not be used here, they will just get their copy of the sum */					\
		};																											\
		/* number of pairs to swap/add per thread */																\
																													\
		STATIC_ASSERT(n_thread_num <= WARP_SIZE, WARP_SCAN_WITH_TOO_MANY_THREADS);									\
		STATIC_ASSERT(n_scan_size % n_thread_num == 0, HALF_SCAN_SIZE_MUST_DIVIDE_THREAD_COUNT);					\
																													\
		_Pragma("unroll")																							\
		for(int d = n_scan_size >> 1, n_step = 1; d > 0; d >>= 1) { /* surprisingly unrolls */						\
			write_mem_fence(CLK_LOCAL_MEM_FENCE);																	\
																													\
			if(li < d) {																							\
				int n_index0 = n_step * (2 * li + 1) - 1;															\
				int n_index1 = n_step * (2 * li + 2) - 1;															\
				p_workspace[CONFLICT_FREE_INDEX(n_index1)] +=														\
					p_workspace[CONFLICT_FREE_INDEX(n_index0)];														\
			}																										\
			if(n_item_per_thread_num > 1) {																			\
				int pi = li + n_thread_num;																			\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					p_workspace[CONFLICT_FREE_INDEX(n_index1)] +=													\
						p_workspace[CONFLICT_FREE_INDEX(n_index0)];													\
				}																									\
			}																										\
			if(n_item_per_thread_num > 2) {																			\
				int pi = li + n_thread_num * 2;																		\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					p_workspace[CONFLICT_FREE_INDEX(n_index1)] +=													\
						p_workspace[CONFLICT_FREE_INDEX(n_index0)];													\
				}																									\
			}																										\
			if(n_item_per_thread_num > 3) {																			\
				int pi = li + n_thread_num * 3;																		\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					p_workspace[CONFLICT_FREE_INDEX(n_index1)] +=													\
						p_workspace[CONFLICT_FREE_INDEX(n_index0)];													\
				}																									\
			}																										\
			if(n_item_per_thread_num > 4) {																			\
				_Pragma("unroll")																					\
				for(int i = 4; i < n_item_per_thread_num; ++ i) { /* this sadly does not unroll very well */		\
					int pi = li + n_thread_num * i;																	\
					if(pi < d) {																					\
						int n_index0 = n_step * (2 * pi + 1) - 1;													\
						int n_index1 = n_step * (2 * pi + 2) - 1;													\
						p_workspace[CONFLICT_FREE_INDEX(n_index1)] +=												\
							p_workspace[CONFLICT_FREE_INDEX(n_index0)];												\
					}																								\
				}																									\
			}																										\
			n_step += n_step;																						\
		}																											\
		/* build sum in place up the tree   */																		\
																													\
		write_mem_fence(CLK_LOCAL_MEM_FENCE); /* required on K40 */													\
		T n_sum = p_workspace[CONFLICT_FREE_INDEX(n_scan_size - 1)];												\
		read_mem_fence(CLK_LOCAL_MEM_FENCE); /* required on K40 */													\
		/* makes the divergent branch below shorter */																\
																													\
		if(!li)																										\
			p_workspace[CONFLICT_FREE_INDEX(n_scan_size - 1)] = 0;													\
		/* clear the last element // note that it will contain the sum of elements */								\
																													\
		_Pragma("unroll")																							\
		for(int d = 1, n_step = n_scan_size; d < n_scan_size; d += d) { /* surprisingly unrolls */					\
			n_step >>= 1;																							\
			write_mem_fence(CLK_LOCAL_MEM_FENCE);																	\
																													\
			if(li < d) {																							\
				int n_index0 = n_step * (2 * li + 1) - 1;															\
				int n_index1 = n_step * (2 * li + 2) - 1;															\
				n_index0 = CONFLICT_FREE_INDEX(n_index0);															\
				n_index1 = CONFLICT_FREE_INDEX(n_index1);															\
				T n_temp = p_workspace[n_index0];																	\
				p_workspace[n_index0] = p_workspace[n_index1];														\
				p_workspace[n_index1] += n_temp;																	\
			}																										\
			if(n_item_per_thread_num > 1) {																			\
				int pi = li + n_thread_num;																			\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					n_index0 = CONFLICT_FREE_INDEX(n_index0);														\
					n_index1 = CONFLICT_FREE_INDEX(n_index1);														\
					T n_temp = p_workspace[n_index0];																\
					p_workspace[n_index0] = p_workspace[n_index1];													\
					p_workspace[n_index1] += n_temp;																\
				}																									\
			}																										\
			if(n_item_per_thread_num > 2) {																			\
				int pi = li + n_thread_num * 2;																		\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					n_index0 = CONFLICT_FREE_INDEX(n_index0);														\
					n_index1 = CONFLICT_FREE_INDEX(n_index1);														\
					T n_temp = p_workspace[n_index0];																\
					p_workspace[n_index0] = p_workspace[n_index1];													\
					p_workspace[n_index1] += n_temp;																\
				}																									\
			}																										\
			if(n_item_per_thread_num > 3) {																			\
				int pi = li + n_thread_num * 3;																		\
				if(pi < d) {																						\
					int n_index0 = n_step * (2 * pi + 1) - 1;														\
					int n_index1 = n_step * (2 * pi + 2) - 1;														\
					n_index0 = CONFLICT_FREE_INDEX(n_index0);														\
					n_index1 = CONFLICT_FREE_INDEX(n_index1);														\
					T n_temp = p_workspace[n_index0];																\
					p_workspace[n_index0] = p_workspace[n_index1];													\
					p_workspace[n_index1] += n_temp;																\
				}																									\
			}																										\
			if(n_item_per_thread_num > 4) {																			\
				_Pragma("unroll")																					\
				for(int i = 4; i < n_item_per_thread_num; ++ i) { /* this sadly does not unroll very well */		\
					int pi = li + n_thread_num * i;																	\
					if(pi < d) {																					\
						int n_index0 = n_step * (2 * pi + 1) - 1;													\
						int n_index1 = n_step * (2 * pi + 2) - 1;													\
						n_index0 = CONFLICT_FREE_INDEX(n_index0);													\
						n_index1 = CONFLICT_FREE_INDEX(n_index1);													\
						T n_temp = p_workspace[n_index0];															\
						p_workspace[n_index0] = p_workspace[n_index1];												\
						p_workspace[n_index1] += n_temp;															\
					}																								\
				}																									\
			}																										\
		}																											\
		/* traverse down the tree, build scan */																	\
																													\
		return n_sum;																								\
	}

/**
 *	@def WARP_SCAN_SIZE
 *	@brief number of elements scanned by Warp_ExScan_SingleBlock() (two per thread of a warp)
 */
#define WARP_SCAN_SIZE ((WARP_SIZE) * 2)
//((WARP_SIZE) * 2) // each thread scans two elements; 32 * 2 = 64 elements

/**
 *	@brief calculates in-place exclusive scan of WARP_SCAN_SIZE elements
 *		in local memory, using WARP_SIZE cooperating threads
 *
 *	@param[in,out] p_workspace is pointer to the array to scan; the elements
 *		are accessed at padded positions given by CONFLICT_FREE_INDEX()
 *	@param[in] li is index of the calling thread (the callers in this file
 *		only call this from threads with li < WARP_SIZE)
 *
 *	@return Returns the value of the last element after the up-sweep
 *		(the sum of all the scanned elements), as read by the calling thread.
 */
inline _TyScalar Warp_ExScan_SingleBlock(__local _TyScalar *p_workspace, const int li)
{
	// the macro expands to the whole function body, including the return statement
	WARP_COOP_SCAN_TEMPLATE(WARP_SCAN_SIZE, WARP_SIZE, _TyScalar, p_workspace, li)
}

// number of threads cooperating on one run of elements in Warp_ExScan_MultiBlock()
#define SCAN_WARPTHREAD_THREADS ((SCAN_LOCAL_WORK_SIZE) / (WARP_SCAN_SIZE)) // 512 / 64 = 8 threads working on each 16 elements

#if (SCAN_BLOCK_SIZE) % (WARP_SCAN_SIZE) != 0
#error "SCAN_BLOCK_SIZE must be a multiple of WARP_SCAN_SIZE"
#endif // (SCAN_BLOCK_SIZE) % (WARP_SCAN_SIZE) != 0

// length of one run of elements; the block is split to WARP_SCAN_SIZE such runs
#define SCAN_ITEMS_WARPTHREAD ((SCAN_BLOCK_SIZE) / (WARP_SCAN_SIZE)) // 1024 / 64 = 16 elements to scan in the top level
//#define LOG2_SCAN_ITEMS_WARPTHREAD n_Log2((SCAN_ITEMS_WARPTHREAD)) // not needed, loops unrolled anyway

/**
 *	@brief calculates in-place exclusive scans of runs of SCAN_ITEMS_WARPTHREAD
 *		elements in local memory, each run handled by SCAN_WARPTHREAD_THREADS threads
 *
 *	@param[in,out] p_workspace is pointer to the beginning of the padded block workspace
 *	@param[in] li is index of the calling thread; li / SCAN_WARPTHREAD_THREADS selects
 *		the run and li % SCAN_WARPTHREAD_THREADS is the thread index within the run
 *
 *	@return Returns the sum of the elements of the run the calling thread worked on.
 *
 *	@note NOTE(review): the run origin is padded using CONFLICT_FREE_INDEX() and the indices
 *		inside the run are padded again, relative to the run; this looks equivalent
 *		to the padding of absolute indices only if SCAN_ITEMS_WARPTHREAD is a power
 *		of two - confirm for the configured sizes.
 */
inline _TyScalar Warp_ExScan_MultiBlock(__local _TyScalar *p_workspace, const int li)
{
	p_workspace += CONFLICT_FREE_INDEX(SCAN_ITEMS_WARPTHREAD * (li / SCAN_WARPTHREAD_THREADS)); // shift workspace
	const int lim = li % SCAN_WARPTHREAD_THREADS; // modulo thread index
	WARP_COOP_SCAN_TEMPLATE(SCAN_ITEMS_WARPTHREAD, SCAN_WARPTHREAD_THREADS, _TyScalar, p_workspace, lim) // reduce each block separately
}

/**
 *	@brief calculates exclusive scan of a single block of SCAN_BLOCK_SIZE elements
 *
 *	The block is first copied to local memory. Each of the first WARP_SCAN_SIZE threads
 *	then serially scans its own run of SCAN_ITEMS_WARPTHREAD consecutive elements and stores
 *	the run total, the totals are scanned using Warp_ExScan_SingleBlock() and finally
 *	each output element is written as the sum of its in-run scan and the scanned
 *	total of the runs preceding it.
 *
 *	@param[in] p_array is pointer to the first element of the block to scan
 *	@param[out] p_scan is pointer to the first element of the output block
 *	@param[in] li is local index of the calling thread (the kernels pass get_local_id(0))
 *
 *	@return Returns the result of Warp_ExScan_SingleBlock() in the threads
 *		with li < WARP_SIZE and zero in all the other threads.
 *
 *	@note This contains barriers, all the threads of the workgroup need to call it.
 */
inline _TyScalar LocalExScan_SingleBlock/*_v1*/(__global const _TyScalar *__restrict p_array,
	__global _TyScalar *__restrict p_scan, const int li)
{
	enum { n_workgroup_size = SCAN_LOCAL_WORK_SIZE };
	// the same as get_local_size(0) but known at compile-time

	__local _TyScalar p_workspace[SCAN_BLOCK_SIZE + CONFLICT_FREE_OFFSET(SCAN_BLOCK_SIZE - 1)];
	// the scanned block, with padding to avoid bank conflicts

	__local _TyScalar p_warp_workspace[WARP_SCAN_SIZE + CONFLICT_FREE_OFFSET(WARP_SCAN_SIZE - 1)];
	// the totals of the individual runs, padded the same way

	#pragma unroll
	for(int k = 0; k < SCAN_ITEMS_THREAD; ++ k) {
		const int n_elem = li + k * n_workgroup_size;
		p_workspace[CONFLICT_FREE_INDEX(n_elem)] = p_array[n_elem];
	}
	// stage the block in local memory, consecutive threads read consecutive elements

	barrier(CLK_LOCAL_MEM_FENCE);

	if(li < WARP_SCAN_SIZE) {
		const int n_run_begin = li * SCAN_ITEMS_WARPTHREAD;
		_TyScalar n_running_sum = 0;

		#pragma unroll
		for(int k = 0; k < SCAN_ITEMS_WARPTHREAD; ++ k) {
			const int n_slot = CONFLICT_FREE_INDEX(n_run_begin + k);
			const _TyScalar n_value = p_workspace[n_slot];
			p_workspace[n_slot] = n_running_sum;
			n_running_sum += n_value;
		}
		// serial exclusive scan of the run owned by this thread, needs no synchronization

		p_warp_workspace[CONFLICT_FREE_INDEX(li)] = n_running_sum;
		// the total of the run goes to the next level
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	_TyScalar n_sum = 0;
	if(li < WARP_SIZE)
		n_sum = Warp_ExScan_SingleBlock(p_warp_workspace, li);
	// scan the run totals using the first WARP_SIZE threads, the other threads keep zero

	barrier(CLK_LOCAL_MEM_FENCE);

	#pragma unroll
	for(int k = 0; k < SCAN_ITEMS_THREAD; ++ k) {
		const int n_dest = li + k * n_workgroup_size;
		const int n_run = n_dest / SCAN_ITEMS_WARPTHREAD;
		p_scan[n_dest] = p_workspace[CONFLICT_FREE_INDEX(n_dest)] +
			p_warp_workspace[CONFLICT_FREE_INDEX(n_run)];
	}
	// combine the two levels and write the result to the global memory;
	// all the elements of one run read the same scanned run total

	return n_sum;
}

/**
 *	@brief calculates exclusive scan of a single block of SCAN_BLOCK_SIZE
 *		elements (a variant with cooperative scans at the lower level)
 *
 *	The block is first copied to local memory. The runs of SCAN_ITEMS_WARPTHREAD elements
 *	are scanned using Warp_ExScan_MultiBlock(), the run totals are then scanned using
 *	Warp_ExScan_SingleBlock() and finally each output element is written as the sum
 *	of its in-run scan and the scanned total of the runs preceding it.
 *
 *	@param[in] p_array is pointer to the first element of the block to scan
 *	@param[out] p_scan is pointer to the first element of the output block
 *	@param[in] li is local index of the calling thread
 *
 *	@return Returns the result of Warp_ExScan_SingleBlock() in the threads
 *		with li < WARP_SIZE and zero in all the other threads.
 *
 *	@note This contains barriers, all the threads of the workgroup need to call it.
 *	@note The original author notes that this is slower than LocalExScan_SingleBlock().
 */
inline _TyScalar LocalExScan_SingleBlock_v2(__global const _TyScalar *__restrict p_array,
	__global _TyScalar *__restrict p_scan, const int li)
{
	enum { n_workgroup_size = SCAN_LOCAL_WORK_SIZE };
	// the same as get_local_size(0) but known at compile-time

	__local _TyScalar p_workspace[SCAN_BLOCK_SIZE + CONFLICT_FREE_OFFSET(SCAN_BLOCK_SIZE - 1)];
	// the scanned block, with padding to avoid bank conflicts

	__local _TyScalar p_warp_workspace[WARP_SCAN_SIZE + CONFLICT_FREE_OFFSET(WARP_SCAN_SIZE - 1)];
	// the totals of the individual runs, padded the same way

	#pragma unroll
	for(int k = 0; k < SCAN_ITEMS_THREAD; ++ k) {
		const int n_elem = li + k * n_workgroup_size;
		p_workspace[CONFLICT_FREE_INDEX(n_elem)] = p_array[n_elem];
	}
	// stage the block in local memory, consecutive threads read consecutive elements

	barrier(CLK_LOCAL_MEM_FENCE);

	const _TyScalar n_run_total = Warp_ExScan_MultiBlock(p_workspace, li);
	// groups of SCAN_WARPTHREAD_THREADS threads cooperatively scan their runs

	const int n_run = li / SCAN_WARPTHREAD_THREADS;
	if(li % SCAN_WARPTHREAD_THREADS == 0)
		p_warp_workspace[CONFLICT_FREE_INDEX(n_run)] = n_run_total;
	// the first thread of each group passes the run total to the next level

	barrier(CLK_LOCAL_MEM_FENCE);

	_TyScalar n_sum = 0;
	if(li < WARP_SIZE)
		n_sum = Warp_ExScan_SingleBlock(p_warp_workspace, li);
	// scan the run totals using the first WARP_SIZE threads, the other threads keep zero

	barrier(CLK_LOCAL_MEM_FENCE);

	#pragma unroll
	for(int k = 0; k < SCAN_ITEMS_THREAD; ++ k) {
		const int n_dest = li + k * n_workgroup_size;
		const int n_dest_run = n_dest / SCAN_ITEMS_WARPTHREAD;
		p_scan[n_dest] = p_workspace[CONFLICT_FREE_INDEX(n_dest)] +
			p_warp_workspace[CONFLICT_FREE_INDEX(n_dest_run)];
	}
	// combine the two levels and write the result to the global memory;
	// all the elements of one run read the same scanned run total

	return n_sum;
}

/**
 *	@brief calculates inclusive scan of a single block of SCAN_BLOCK_SIZE elements
 *
 *	This calculates an exclusive scan in local memory (an up-sweep which builds the sums
 *	in place, followed by a down-sweep), then the first thread stores the sum of all
 *	the elements at index 0 and the result is written out shifted by one element
 *	(with wrap-around), which yields the inclusive scan.
 *
 *	@param[in] p_array is pointer to the first element of the block to scan
 *	@param[out] p_scan is pointer to the first element of the output block
 *	@param[in] li is local index of the calling thread (the kernels pass get_local_id(0))
 *
 *	@return Returns the sum of the block elements as read by the calling thread
 *		(according to the original author this is only correct in thread 0).
 *
 *	@note This contains barriers, all the threads of the workgroup need to call it.
 *	@note Because the output is read from local memory shifted by one element, a barrier is
 *		needed after this function before calling it again (see the loop in LocalInScan_v0()).
 *	@note The per-thread pair loop is opened and closed under the same preprocessor condition
 *		(SCAN_ITEMS_THREAD_HALF > 2), so that this also compiles with SCAN_ITEMS_THREAD == 1.
 */
inline _TyScalar LocalInScan_SingleBlock(__global const _TyScalar *__restrict p_array,
	__global _TyScalar *__restrict p_scan, const int li) // inclusive scan
{
	__local _TyScalar p_workspace[SCAN_BLOCK_SIZE + CONFLICT_FREE_OFFSET(SCAN_BLOCK_SIZE - 1)];
	// add some padding to avoid bank conflicts
	// note that this can be inside, doesn't slow down when calling from kernel

	enum { ls = SCAN_LOCAL_WORK_SIZE }; // = get_local_size(0)

	#pragma unroll
	for(int j = 0, n_dest = li; j < SCAN_ITEMS_THREAD; ++ j, n_dest += ls)
		p_workspace[n_dest + CONFLICT_FREE_OFFSET(n_dest)] = p_array[n_dest];
	// copy the data from the array to local memory
	// unrolls just fine

	int n_step = 1;
	//#pragma unroll
	// unrolling both upsweep and downsweep loops bloats the code, runs much slower; unrolling
	// downsweep loop gains faster code than unrolling upsweep loop (-> unroll that, not this)
	for(int d = SCAN_BLOCK_SIZE >> 1; d > 0; d >>= 1) { // surprisingly unrolls
		if(d >= WARP_SIZE)
			barrier(CLK_LOCAL_MEM_FENCE);
		else
			write_mem_fence(CLK_LOCAL_MEM_FENCE); // helps somewhat
		int n_pair_index = li;
#if SCAN_ITEMS_THREAD_HALF > 2
		#pragma unroll
		for(int j = 0; j < SCAN_ITEMS_THREAD_HALF; ++ j) { // does not unroll; unroll manually
#endif // SCAN_ITEMS_THREAD_HALF > 2
			if(n_pair_index < d) {
				int n_index0 = n_step * (2 * n_pair_index + 1) - 1;
				int n_index1 = n_step * (2 * n_pair_index + 2) - 1;
				n_index0 += CONFLICT_FREE_OFFSET(n_index0);
				n_index1 += CONFLICT_FREE_OFFSET(n_index1);
				p_workspace[n_index1] += p_workspace[n_index0];
			}
#if SCAN_ITEMS_THREAD_HALF == 2
			n_pair_index += ls;
			if(n_pair_index < d) {
				int n_index0 = n_step * (2 * n_pair_index + 1) - 1;
				int n_index1 = n_step * (2 * n_pair_index + 2) - 1;
				n_index0 += CONFLICT_FREE_OFFSET(n_index0);
				n_index1 += CONFLICT_FREE_OFFSET(n_index1);
				p_workspace[n_index1] += p_workspace[n_index0];
			}
#elif SCAN_ITEMS_THREAD_HALF > 2 // must match the condition which opened the loop above
			n_pair_index += ls;
		}
#endif // SCAN_ITEMS_THREAD_HALF > 2
		n_step += n_step;
	}
	// build sum in place up the tree

	write_mem_fence(CLK_LOCAL_MEM_FENCE); // required on K40
	_TyScalar n_sum = p_workspace[SCAN_BLOCK_SIZE - 1 + CONFLICT_FREE_OFFSET(SCAN_BLOCK_SIZE - 1)];
	read_mem_fence(CLK_LOCAL_MEM_FENCE); // required on K40
	// makes the divergent branch below shorter

	if(!li)
		p_workspace[SCAN_BLOCK_SIZE - 1 + CONFLICT_FREE_OFFSET(SCAN_BLOCK_SIZE - 1)] = 0;
	// clear the last element // note that it will contain the sum of elements

	// todo - fuse the last and first iteration, split the loops ...

	#pragma unroll
	for(int d = 1; d < SCAN_BLOCK_SIZE; d += d) { // surprisingly unrolls
		n_step >>= 1;
		if(d >= WARP_SIZE)
			barrier(CLK_LOCAL_MEM_FENCE);
		else
			write_mem_fence(CLK_LOCAL_MEM_FENCE); // helps somewhat
		int n_pair_index = li;
#if SCAN_ITEMS_THREAD_HALF > 2
		#pragma unroll
		for(int j = 0; j < SCAN_ITEMS_THREAD_HALF; ++ j) { // does not unroll; unroll manually
#endif // SCAN_ITEMS_THREAD_HALF > 2
			if(n_pair_index < d) {
				int n_index0 = n_step * (2 * n_pair_index + 1) - 1;
				int n_index1 = n_step * (2 * n_pair_index + 2) - 1;
				n_index0 += CONFLICT_FREE_OFFSET(n_index0);
				n_index1 += CONFLICT_FREE_OFFSET(n_index1);
				_TyScalar n_temp = p_workspace[n_index0];
				p_workspace[n_index0] = p_workspace[n_index1];
				p_workspace[n_index1] += n_temp;
			}
#if SCAN_ITEMS_THREAD_HALF == 2
			n_pair_index += ls;
			if(n_pair_index < d) {
				int n_index0 = n_step * (2 * n_pair_index + 1) - 1;
				int n_index1 = n_step * (2 * n_pair_index + 2) - 1;
				n_index0 += CONFLICT_FREE_OFFSET(n_index0);
				n_index1 += CONFLICT_FREE_OFFSET(n_index1);
				_TyScalar n_temp = p_workspace[n_index0];
				p_workspace[n_index0] = p_workspace[n_index1];
				p_workspace[n_index1] += n_temp;
			}
#elif SCAN_ITEMS_THREAD_HALF > 2 // must match the condition which opened the loop above
			n_pair_index += ls;
		}
#endif // SCAN_ITEMS_THREAD_HALF > 2
	}
	// traverse down the tree, build scan

	// t_odo - need to loop SCAN_BLOCK_SIZE / (SCAN_LOCAL_WORK_SIZE * 2) times inside each iteration to support more elems processed by a single thread

	barrier(CLK_LOCAL_MEM_FENCE);

	if(!li)
		p_workspace[0 + CONFLICT_FREE_OFFSET(0)] = n_sum; // will be read by the last thread in the loop below, barrier required
	// the first thread fills the full sum in the first place

	barrier(CLK_LOCAL_MEM_FENCE); // !!

	#pragma unroll
	for(int j = 0, n_dest = li; j < SCAN_ITEMS_THREAD; ++ j, n_dest += ls) { // maybe overly complicated, but can optimize away
		unsigned int n_src = (n_dest + 1) & (SCAN_BLOCK_SIZE - 1); // SCAN_BLOCK_SIZE is power of two (checked with #error), no need for modulo
		p_scan[n_dest] = p_workspace[n_src + CONFLICT_FREE_OFFSET(n_src)];
	}
	// copy the data back to the global memory, skew by one item on reading from local memory (no performance toll)
	// unrolls just fine

	// note that here needs to be one more barrier if calling multiple times, due to the skewed
	// reads from the local memory above

	// alternately could alloc more local memory (one more per workgroup) and save the "&",
	// but that should be mostly hidden under the global memory write latency and the loop is unrolled anyway

	return n_sum; // incorrect, except for thread 0
}

/**
 *	@brief block-wise scan kernel; each workgroup loops over several blocks of the array
 *
 *	Workgroup g processes the blocks g, g + G, g + 2G, ... (G being the number of workgroups),
 *	each block having SCAN_BLOCK_SIZE elements. Every block is handed to LocalExScan_SingleBlock(),
 *	the value it returns is discarded here.
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array
 *	@param[out] p_scan is the output array, receiving the per-block scans
 *
 *	@note According to the original author's note, this can work inplace. To support
 *		arbitrary-size arrays, allocate p_array and p_scan to the nearest multiple
 *		of SCAN_BLOCK_SIZE (the last block is always processed whole).
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalScan_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan)
{
	const int n_first_elem = get_group_id(0) * SCAN_BLOCK_SIZE;
	const int n_stride = get_num_groups(0) * SCAN_BLOCK_SIZE;
	// where this workgroup starts and how far apart its consecutive blocks are

	p_array += n_first_elem;
	p_scan += n_first_elem;
	// skip to the first block of this workgroup

	int n_remaining_passes = (n_array_size - n_first_elem + n_stride - 1) / n_stride;
	// number of blocks this workgroup needs to process (rounded up, same for all its work-items)

	const int li = get_local_id(0);
	// note that passing thread id as an arg is actually faster than getting
	// it inside (calling get_local_id(0) is evidently costly)

	while(n_remaining_passes > 0) {
		LocalExScan_SingleBlock(p_array, p_scan, li);
		p_array += n_stride;
		p_scan += n_stride;
		-- n_remaining_passes;
	}
	// process all the blocks in the array
}

/**
 *	@brief block-wise scan kernel (LocalInScan_SingleBlock() flavor); each workgroup loops over several blocks
 *
 *	Workgroup g processes the blocks g, g + G, g + 2G, ... (G being the number of workgroups),
 *	each block having SCAN_BLOCK_SIZE elements. The value returned by LocalInScan_SingleBlock()
 *	is discarded here.
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array
 *	@param[out] p_scan is the output array, receiving the per-block scans
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalInScan_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan)
{
	const int n_first_elem = get_group_id(0) * SCAN_BLOCK_SIZE;
	const int n_stride = get_num_groups(0) * SCAN_BLOCK_SIZE;
	// where this workgroup starts and how far apart its consecutive blocks are

	p_array += n_first_elem;
	p_scan += n_first_elem;
	// skip to the first block of this workgroup

	const int n_pass_num = (n_array_size - n_first_elem + n_stride - 1) / n_stride;
	// number of blocks this workgroup needs to process (rounded up, same for all its work-items)

	const int li = get_local_id(0);
	// note that passing thread id as an arg is actually faster than getting
	// it inside (calling get_local_id(0) is evidently costly)

	for(int n_pass = n_pass_num; n_pass > 0; -- n_pass) {
		LocalInScan_SingleBlock(p_array, p_scan, li);

		barrier(CLK_LOCAL_MEM_FENCE);
		// per the original note: the threads write the scan out skewed, without this barrier
		// the threads of the next iteration might overwrite the still unwritten values

		p_array += n_stride;
		p_scan += n_stride;
	}
	// process all the blocks in the array
}

/**
 *	@brief block-wise scan kernel; each workgroup processes exactly one block
 *
 *	Workgroup g passes the block starting at element g * SCAN_BLOCK_SIZE
 *	to LocalExScan_SingleBlock(); the value it returns is discarded.
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array (not used by this kernel)
 *	@param[out] p_scan is the output array, receiving the per-block scans
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalScan_NoLoop_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan)
{
	const int n_block_begin = get_group_id(0) * SCAN_BLOCK_SIZE;
	// first element of the block owned by this workgroup

	const int li = get_local_id(0);
	LocalExScan_SingleBlock(p_array + n_block_begin, p_scan + n_block_begin, li);
	// there is only a single block per workgroup, no loop needed
}

/**
 *	@brief block-wise scan kernel (LocalInScan_SingleBlock() flavor); each workgroup processes exactly one block
 *
 *	Workgroup g passes the block starting at element g * SCAN_BLOCK_SIZE
 *	to LocalInScan_SingleBlock(); the value it returns is discarded.
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array (not used by this kernel)
 *	@param[out] p_scan is the output array, receiving the per-block scans
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalInScan_NoLoop_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan)
{
	const int n_block_begin = get_group_id(0) * SCAN_BLOCK_SIZE;
	// first element of the block owned by this workgroup

	const int li = get_local_id(0);
	LocalInScan_SingleBlock(p_array + n_block_begin, p_scan + n_block_begin, li);
	// there is only a single block per workgroup, no loop needed
}

/**
 *	@brief scan kernel for an array consisting of a single block, processed by a single workgroup
 *
 *	Passes the array pointers, unmodified, to LocalExScan_SingleBlock();
 *	the value it returns is discarded.
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array (not used by this kernel)
 *	@param[out] p_scan is the output array
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalScan_Single_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan)
{
	const int li = get_local_id(0);
	// index of this work-item inside the workgroup

	LocalExScan_SingleBlock(p_array, p_scan, li);
	// no offset, there is only a single block, and a single workgroup
}

/**
 *	@brief scan kernel (LocalInScan_SingleBlock() flavor) for an array consisting
 *		of a single block, processed by a single workgroup
 *
 *	Passes the array pointers, unmodified, to LocalInScan_SingleBlock();
 *	the value it returns is discarded.
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array (not used by this kernel)
 *	@param[out] p_scan is the output array
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalInScan_Single_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan)
{
	const int li = get_local_id(0);
	// index of this work-item inside the workgroup

	LocalInScan_SingleBlock(p_array, p_scan, li);
	// no offset, there is only a single block, and a single workgroup
}

/**
 *	@brief block-wise scan kernel which also stores one value per block; each workgroup loops over several blocks
 *
 *	Workgroup g processes the blocks g, g + G, g + 2G, ... (G being the number of workgroups),
 *	each block having SCAN_BLOCK_SIZE elements. For every block, work-item 0 stores the value
 *	returned by LocalExScan_SingleBlock() to p_sums at the index of that block
 *	(presumably the sum of the block elements - verify against LocalExScan_SingleBlock()).
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array
 *	@param[out] p_scan is the output array, receiving the per-block scans
 *	@param[out] p_sums is the output array of per-block values (one element per block)
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalScan_Sums_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan, __global _TyScalar *__restrict p_sums)
{
	const int n_first_block = get_group_id(0);
	const int n_first_elem = n_first_block * SCAN_BLOCK_SIZE;
	const int n_group_num = get_num_groups(0);
	const int n_stride = n_group_num * SCAN_BLOCK_SIZE;
	// where this workgroup starts and how far apart its consecutive blocks are

	p_sums += n_first_block;
	p_array += n_first_elem;
	p_scan += n_first_elem;
	// skip to the first block of this workgroup

	int n_remaining_passes = (n_array_size - n_first_elem + n_stride - 1) / n_stride;
	// number of blocks this workgroup needs to process (rounded up, same for all its work-items)

	const int li = get_local_id(0);
	// note that passing thread id as an arg is actually faster than getting
	// it inside (calling get_local_id(0) is evidently costly)

	while(n_remaining_passes > 0) {
		const _TyScalar n_block_sum = LocalExScan_SingleBlock(p_array, p_scan, li);
		if(li == 0)
			*p_sums = n_block_sum;
		// only the first work-item writes the per-block value

		p_array += n_stride;
		p_scan += n_stride;
		p_sums += n_group_num;
		-- n_remaining_passes;
	}
	// process all the blocks in the array
}

/**
 *	@brief block-wise scan kernel (LocalInScan_SingleBlock() flavor) which also stores
 *		one value per block; each workgroup loops over several blocks
 *
 *	Workgroup g processes the blocks g, g + G, g + 2G, ... (G being the number of workgroups),
 *	each block having SCAN_BLOCK_SIZE elements. For every block, work-item 0 stores the value
 *	returned by LocalInScan_SingleBlock() to p_sums at the index of that block
 *	(presumably the sum of the block elements - verify against LocalInScan_SingleBlock()).
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array
 *	@param[out] p_scan is the output array, receiving the per-block scans
 *	@param[out] p_sums is the output array of per-block values (one element per block)
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalInScan_Sums_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan, __global _TyScalar *__restrict p_sums)
{
	const int n_first_block = get_group_id(0);
	const int n_first_elem = n_first_block * SCAN_BLOCK_SIZE;
	const int n_group_num = get_num_groups(0);
	const int n_stride = n_group_num * SCAN_BLOCK_SIZE;
	// where this workgroup starts and how far apart its consecutive blocks are

	p_sums += n_first_block;
	p_array += n_first_elem;
	p_scan += n_first_elem;
	// skip to the first block of this workgroup

	const int n_pass_num = (n_array_size - n_first_elem + n_stride - 1) / n_stride;
	// number of blocks this workgroup needs to process (rounded up, same for all its work-items)

	const int li = get_local_id(0);
	// note that passing thread id as an arg is actually faster than getting
	// it inside (calling get_local_id(0) is evidently costly)

	for(int n_pass = n_pass_num; n_pass > 0; -- n_pass) {
		const _TyScalar n_block_sum = LocalInScan_SingleBlock(p_array, p_scan, li);

		barrier(CLK_LOCAL_MEM_FENCE);
		// per the original note: the threads write the scan out skewed, without this barrier
		// the threads of the next iteration might overwrite the still unwritten values

		if(li == 0)
			*p_sums = n_block_sum;
		// only the first work-item writes the per-block value

		p_array += n_stride;
		p_scan += n_stride;
		p_sums += n_group_num;
	}
	// process all the blocks in the array
}

/**
 *	@brief block-wise scan kernel which also stores one value per block; each workgroup processes exactly one block
 *
 *	Workgroup g passes the block starting at element g * SCAN_BLOCK_SIZE to LocalExScan_SingleBlock()
 *	and its work-item 0 stores the returned value to p_sums[g] (presumably the sum of the block
 *	elements - verify against LocalExScan_SingleBlock()).
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array (not used by this kernel)
 *	@param[out] p_scan is the output array, receiving the per-block scans
 *	@param[out] p_sums is the output array of per-block values (one element per block)
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalScan_Sums_NoLoop_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan, __global _TyScalar *__restrict p_sums)
{
	const int n_block = get_group_id(0);
	const int n_block_begin = n_block * SCAN_BLOCK_SIZE;
	// index of the block owned by this workgroup and of its first element

	const int li = get_local_id(0);
	const _TyScalar n_block_sum = LocalExScan_SingleBlock(p_array + n_block_begin, p_scan + n_block_begin, li);
	// there is a single block per workgroup, no loop needed

	if(li == 0)
		p_sums[n_block] = n_block_sum;
	// only the first work-item writes the per-block value
}

/**
 *	@brief block-wise scan kernel (LocalInScan_SingleBlock() flavor) which also stores
 *		one value per block; each workgroup processes exactly one block
 *
 *	Workgroup g passes the block starting at element g * SCAN_BLOCK_SIZE to LocalInScan_SingleBlock()
 *	and its work-item 0 stores the returned value to p_sums[g] (presumably the sum of the block
 *	elements - verify against LocalInScan_SingleBlock()).
 *
 *	@param[in] p_array is the input array
 *	@param[in] n_array_size is the number of elements of the input array (not used by this kernel)
 *	@param[out] p_scan is the output array, receiving the per-block scans
 *	@param[out] p_sums is the output array of per-block values (one element per block)
 */
__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalInScan_Sums_NoLoop_v0(__global const _TyScalar *__restrict p_array,
	int n_array_size, __global _TyScalar *__restrict p_scan, __global _TyScalar *__restrict p_sums)
{
	const int n_block = get_group_id(0);
	const int n_block_begin = n_block * SCAN_BLOCK_SIZE;
	// index of the block owned by this workgroup and of its first element

	const int li = get_local_id(0);
	const _TyScalar n_block_sum = LocalInScan_SingleBlock(p_array + n_block_begin, p_scan + n_block_begin, li);
	// there is a single block per workgroup, no loop needed

	if(li == 0)
		p_sums[n_block] = n_block_sum;
	// only the first work-item writes the per-block value
}

/*__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalScan_Sums_Single_v0(__global const _TyScalar *p_array,
	int n_array_size, __global _TyScalar *p_scan, __global _TyScalar *p_sums)
{
	int li;
	_TyScalar n_sum = LocalExScan_SingleBlock(p_array, p_scan, li = get_local_id(0));
	if(!li)
		*p_sums = n_sum;
	// process all the blocks in the array; there is a single block and only one workgroup
}

__kernel void REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) LocalInScan_Sums_Single_v0(__global const _TyScalar *p_array,
	int n_array_size, __global _TyScalar *p_scan, __global _TyScalar *p_sums)
{
	int li;
	_TyScalar n_sum = LocalInScan_SingleBlock(p_array, p_scan, li = get_local_id(0));
	if(!li)
		*p_sums = n_sum;
	// process all the blocks in the array; there is a single block and only one workgroup
}*/ // this does not make sense, unless we want scan and the sum of elements

/**
 *	@def SCAN_BLOCK_SIZE_LOG
 *	@brief SCAN_BLOCK_SIZE passed through n_Log2(); used as a shift
 *		to convert an element index to a block index
 */
#define SCAN_BLOCK_SIZE_LOG n_Log2(SCAN_BLOCK_SIZE)

/**
 *	@brief adds per-block offsets to all the elements of the scan, except for the first block
 *
 *	Element i (i >= SCAN_BLOCK_SIZE) gets p_block_offsets_scan[i / SCAN_BLOCK_SIZE] added to it.
 *	The work-items stride over the elements by the global size, there is no per-workgroup structure.
 *
 *	@param[in] p_block_offsets_scan is the array of block offsets (one per block; the first one is not read)
 *	@param[in,out] p_scan is the array of per-block scans to be offset
 *	@param[in] n_array_size is the number of elements of p_scan
 */
__kernel void  GlobalScan_Offset_v1(__global _TyScalar *__restrict p_block_offsets_scan,
	__global _TyScalar *__restrict p_scan, int n_array_size)
{
	p_scan += SCAN_BLOCK_SIZE;
	p_block_offsets_scan += 1;
	const int n_shifted_size = n_array_size - SCAN_BLOCK_SIZE;
	// the first block offset always null, skip the first block altogether

	const int n_thread_num = get_global_size(0);
	// the stride between the elements handled by the same work-item

	#pragma unroll 16
	for(int n_elem = get_global_id(0); n_elem < n_shifted_size; n_elem += n_thread_num) {
		const _TyScalar n_block_offset = p_block_offsets_scan[n_elem >> SCAN_BLOCK_SIZE_LOG];
		p_scan[n_elem] += n_block_offset;
	}
	// much simpler, easier to schedule, but relies on cache (and is pretty slow)
}

/**
 *	@brief adds the offset of the second block to the elements of the second block of the scan
 *
 *	Intended for a single workgroup of SCAN_BLOCK_SIZE / 4 work-items; each work-item
 *	offsets four elements of the second block, SCAN_BLOCK_SIZE / 4 elements apart.
 *	The offset is p_block_offsets_scan[1]; it is read once by the first work-item
 *	and broadcast to the others through local memory.
 *
 *	@param[in] p_block_offsets_scan is the array of block offsets (only the element 1 is read)
 *	@param[in,out] p_scan is the array of per-block scans to be offset
 *	@param[in] n_array_size is the number of elements of p_scan
 *
 *	@note Only the first of the four elements of each work-item is checked against
 *		n_array_size; p_scan must be allocated up to a multiple of SCAN_BLOCK_SIZE.
 */
__kernel void  REQUIRE_WG_SIZE(SCAN_BLOCK_SIZE / 4) GlobalScan_Offset_Single_v2(__global _TyScalar *__restrict p_block_offsets_scan,
	__global _TyScalar *__restrict p_scan, int n_array_size)
{
	__local _TyScalar n_local_block_off;
	const int li = get_local_id(0) + SCAN_BLOCK_SIZE; // the first block offset always null

	if(li == SCAN_BLOCK_SIZE && li < n_array_size)
		n_local_block_off = p_block_offsets_scan[1];
	// the first work-item fetches the offset, but only if there is a second block at all
	// (if it is out of bounds then so are all the other work-items and nobody reads the value)

	barrier(CLK_LOCAL_MEM_FENCE);
	// the barrier must be reached by all the work-items of the workgroup: it may not sit inside
	// the branch below, as that one diverges if n_array_size falls inside the first quarter
	// of the second block

	if(li < n_array_size) {
		_TyScalar n_offset = n_local_block_off;//p_block_offsets_scan[1];
		p_scan[li] += n_offset;
		p_scan[li + SCAN_BLOCK_SIZE / 4] += n_offset;
		p_scan[li + SCAN_BLOCK_SIZE / 4 * 2] += n_offset;
		p_scan[li + SCAN_BLOCK_SIZE / 4 * 3] += n_offset;
		// the memory is always aligned up to SCAN_BLOCK_SIZE so we can save some checking
	}
}

/**
 *	@brief adds per-block offsets to all the elements of the scan, except for the first block
 *
 *	Workgroup g (of SCAN_BLOCK_SIZE / 4 work-items) processes the blocks g + 1, g + 1 + G,
 *	g + 1 + 2G, ... (G being the number of workgroups). For each block, the first work-item
 *	reads the block offset and broadcasts it through local memory, then each work-item
 *	offsets four elements of the block, SCAN_BLOCK_SIZE / 4 elements apart.
 *
 *	@param[in] p_block_offsets_scan is the array of block offsets (one per block; the first one is not read)
 *	@param[in,out] p_scan is the array of per-block scans to be offset
 *	@param[in] n_array_size is the number of elements of p_scan
 *
 *	@note Only the first of the four elements of each work-item is checked against
 *		n_array_size; p_scan must be allocated up to a multiple of SCAN_BLOCK_SIZE.
 */
__kernel void REQUIRE_WG_SIZE(SCAN_BLOCK_SIZE / 4) GlobalScan_Offset_v2(__global _TyScalar *__restrict p_block_offsets_scan,
	__global _TyScalar *__restrict p_scan, int n_array_size)
{
	const int gs = get_num_groups(0) * SCAN_BLOCK_SIZE;//get_global_size(0);
	const int li = get_local_id(0);
	enum { ls = SCAN_BLOCK_SIZE / 4 };
	const int n_first_block_begin = get_group_id(0) * SCAN_BLOCK_SIZE + SCAN_BLOCK_SIZE;
	// each group processes a single block
	// the first block offset always null

	__local _TyScalar n_local_block_off;
	for(int n_block_begin = n_first_block_begin; n_block_begin < n_array_size; n_block_begin += gs) {
		// the loop condition only depends on the block, not on the work-item, so that all the work-items
		// of a workgroup make the same number of iterations and all of them reach the barriers below

		if(!li)
			n_local_block_off = p_block_offsets_scan[n_block_begin >> SCAN_BLOCK_SIZE_LOG];
		barrier(CLK_LOCAL_MEM_FENCE);
		// this is actually a good idea

		const _TyScalar n_private_block_off = n_local_block_off; // also a good idea (faster than using just the local variable)

		barrier(CLK_LOCAL_MEM_FENCE);
		// make sure everyone has read the offset before the first work-item
		// overwrites it in the next iteration

		const int i = n_block_begin + li;
		if(i < n_array_size) {
			p_scan[i] += n_private_block_off;
			p_scan[i + ls] += n_private_block_off;
			p_scan[i + ls * 2] += n_private_block_off;
			p_scan[i + ls * 3] += n_private_block_off;
			// strided by ls on purpose; having each work-item process
			// four consecutive elements is not coalesced
		}
	}
	// actually pretty fast
}

/**
 *	@brief adds per-block offsets to all the elements of the scan, except for the first block
 *
 *	Workgroup g (of SCAN_BLOCK_SIZE / 4 work-items) processes the blocks g + 1, g + 1 + G,
 *	g + 1 + 2G, ... (G being the number of workgroups). Every work-item reads the block offset
 *	directly from global memory (no local memory, no barriers) and offsets the elements
 *	of the block which are SCAN_BLOCK_SIZE / 4 apart, starting at its local id.
 *
 *	@param[in] p_block_offsets_scan is the array of block offsets (one per block; the first one is not read)
 *	@param[in,out] p_scan is the array of per-block scans to be offset
 *	@param[in] n_array_size is the number of elements of p_scan
 *
 *	@note Only the beginning of each block is checked against n_array_size;
 *		whole blocks are always written.
 */
__kernel void REQUIRE_WG_SIZE(SCAN_BLOCK_SIZE / 4) GlobalScan_Offset_v3(__global _TyScalar *__restrict p_block_offsets_scan,
	__global _TyScalar *__restrict p_scan, int n_array_size)
{
	const int li = get_local_id(0);
	const int n_group_num = get_num_groups(0);
	const int n_elem_stride = n_group_num * SCAN_BLOCK_SIZE;
	// distance (in elements) between two consecutive blocks of the same workgroup

	const int n_first_block = get_group_id(0) + 1; // the first block offset always null
	const int n_first_elem = n_first_block * SCAN_BLOCK_SIZE;
	p_block_offsets_scan += n_first_block;
	p_scan += n_first_elem + li;
	// offset the arrays

	for(int n_block_begin = n_first_elem; n_block_begin < n_array_size; n_block_begin += n_elem_stride) {
		const _TyScalar n_block_offset = *p_block_offsets_scan;
		// get offset for the given block

		#pragma unroll // can unroll completely
		for(int n_elem = 0; n_elem < SCAN_BLOCK_SIZE; n_elem += SCAN_BLOCK_SIZE / 4)
			p_scan[n_elem] += n_block_offset;
		// offset the scan blocks, work in coalesced manner

		p_scan += n_elem_stride;
		p_block_offsets_scan += n_group_num;
		// shift the arrays
	}
}

//	end-of-file
