
#pragma OPENCL EXTENSION cl_khr_fp64: enable
#pragma OPENCL EXTENSION cl_nv_pragma_unroll: enable
// want to use doubles and unroll loops

/**
 *	@def TWOWORD
 *	@brief macro that expands to its two arguments placed side by side (<tt>a b</tt>)
 *	@param[in] a is the first word
 *	@param[in] b is the second word
 *	@note Presumably used to pass a two-word token sequence (such as
 *		a qualifier and a type) through a single macro argument - confirm at the use sites.
 */
#define TWOWORD(a,b) a b

#define __restrict
// surprisingly, using restrict consistently gives lower rates, so the keyword is defined away (expands to nothing)

#include "../kernel_utils/StaticAssert.h"
#include "../kernel_utils/VendorDetect.h"
#include "../kernel_utils/Integer.h"
#include "../kernel_utils/NVIDIA.h" // includes them all
#include "../kernel_utils/LoadStore.h"
#include "../kernel_utils/BitLoadStore.h"

#ifdef NVIDIA
#include "../kernel_utils/NV20.h"
#else // NVIDIA
#include "../kernel_utils/NV20Emu.h"
#endif // NVIDIA
//#include "../kernel_utils/Integer.h" // conditional includes break line numbering (line number restored in the branch taken but then the code continues with the branch not taken and the line numbers increment, the #line statement in the branch not taken is ignored!)

// ---- scan reduce macros ----

//#define SCAN_SCALAR_TYPE uint32_t
//#define SCAN_SCALAR_SIZE 4
// configured via compiler commandline

#define USE_PACKED_HEAD_FLAGS
//#define USE_STRIDED_PACKED_HEAD_FLAGS // defined by the compiler

// scalar type the scans and reductions in this file operate on (SCAN_SCALAR_TYPE is configured via compiler commandline)
typedef SCAN_SCALAR_TYPE _TyScalar;

STATIC_ASSERT(SCAN_SCALAR_SIZE == sizeof(_TyScalar), SCAN_SCALAR_SIZE_SCAN_SCALAR_TYPE_MISMATCH); // make sure the size given on the commandline matches the actual size of the type

//#define SCAN_LOCAL_WORK_SIZE 256 // best on GTX-680 (over 74 GB/sec, over 92 GB/sec on GTX-780)
//#define SCAN_TILE_SIZE 1024 // or just (2 * SCAN_LOCAL_WORK_SIZE)
// configured via compiler commandline

//STATIC_ASSERT(b_Is_POT(SCAN_TILE_SIZE), SCAN_TILE_SIZE_MUST_BE_POWER_OF_TWO); // does it need to be? do not think so, not anymore with the workgroup approach

// number of tile elements per work-item (tile size divided by workgroup size)
#define SCAN_ITEMS_THREAD  ((SCAN_TILE_SIZE) / (SCAN_LOCAL_WORK_SIZE))
//#define SCAN_ITEMS_THREAD_HALF  ((SCAN_ITEMS_THREAD) / 2) // not needed any more, now we're working with the warp tiles
// these could be enums, but they are used in preprocessor conditions, which would fail if they were :-/

STATIC_ASSERT(!(SCAN_LOCAL_WORK_SIZE % WARP_SIZE), SCAN_LOCAL_WORK_SIZE_MUST_BE_MULTIPLE_OF_WARP_SIZE); // the workgroup consists of whole warps
STATIC_ASSERT(SCAN_TILE_SIZE == SCAN_ITEMS_THREAD * SCAN_LOCAL_WORK_SIZE, SCAN_TILE_SIZE_MUST_BE_MULTIPLE_OF_SCAN_LOCAL_WORK_SIZE); // the integer division above must not have a remainder

/**
 *	@def REQUIRE_WG_SIZE
 *	@brief enables kernel to force a specified workgroup size
 *	@param[in] n_size is the required workgroup size in the first dimension
 *		(the remaining two dimensions are fixed to 1)
 *	@note In case the launch workgroup size doesn't match,
 *		CL_INVALID_WORK_GROUP_SIZE (-54) is returned from clEnqueueNDRangeKernel.
 */
#define REQUIRE_WG_SIZE(n_size) __attribute__((reqd_work_group_size(n_size, 1, 1)))

// reduction parts

#define REDUCE_LOCAL_WORK_SIZE SCAN_LOCAL_WORK_SIZE // workgroup size used by the reduction parts
#define REDUCE_TILE_SIZE SCAN_TILE_SIZE // tile size used by the reduction parts
#define REDUCE_ITEMS_THREAD  ((REDUCE_TILE_SIZE) / (REDUCE_LOCAL_WORK_SIZE)) // elements per work-item in the reduction parts
// the reduction configuration is the same as that of the scan

//#define REDUCTION_ELEM_OP ((x))
//#define REDUCTION_REDUCE_OP ((x) + (y))
//#define REDUCTION_REDUCE_OPERATOR '+'
//#define REDUCTION_FINAL_OP ((x))
//#define REDUCTION_IDENTITY ((_TyScalar)0)
// configured via compiler commandline

// applies the per-element operation (REDUCTION_ELEM_OP, an expression in x
// configured via compiler commandline) to a single value
inline _TyScalar t_ReductionElemOp(_TyScalar x)
{
	const _TyScalar t_elem_result = (REDUCTION_ELEM_OP);
	return t_elem_result;
}
// combines two values using the reduction operation (REDUCTION_REDUCE_OP,
// an expression in x and y configured via compiler commandline)
inline _TyScalar t_ReductionReduceOp(_TyScalar x, _TyScalar y)
{
	const _TyScalar t_reduce_result = (REDUCTION_REDUCE_OP);
	return t_reduce_result;
}
//inline _TyScalar t_ReductionFinalOp(_TyScalar x) { return (REDUCTION_FINAL_OP); } // not supported in seg-scan/seg-reduce
// to make the code pretty

// CFI is enabled only for an even number of items per thread; CFI_FLAG carries the same
// decision as 0 / 1 so that it can be passed to the load / store macros as an argument
// (CFI presumably stands for conflict-free indexing - confirm in the kernel utils headers)
#if !(SCAN_ITEMS_THREAD & 1) // even SCAN_ITEMS_THREAD, must be 2, 4, ...
#define USE_CFI
#define CFI_FLAG 1
// for some reason, without CFI the scan crashes on K40, almost as if it couldn't handle the bank conflicts
#else // !(SCAN_ITEMS_THREAD & 1)
#define CFI_FLAG 0
#endif // !(SCAN_ITEMS_THREAD & 1)

#include "../kernel_utils/IntScanReduce.h" // needs REDUCE_LOCAL_WORK_SIZE

// ------ segmented ops -----

// n_bfind_uintwarp_t(x) selects the bit-find function matching the warp size (32-bit version
// for warps of up to 32 threads, 64-bit version for up to 64 threads); larger warps are rejected.
// the code below relies on it returning -1 if no bit is set
#if WARP_SIZE <= 32
#define n_bfind_uintwarp_t(x) n_bfind_uint32_t(x)
#elif WARP_SIZE <= 64
#define n_bfind_uintwarp_t(x) n_bfind_uint64_t(x)
#else
#error "error: warp size too large for segmented scan"
#endif

// compile-time constants derived from the scan configuration
enum {
	SCAN_WORKGROUP_WARP_NUM = SCAN_LOCAL_WORK_SIZE / WARP_SIZE, // number of warps in a workgroup
	SCAN_TILE_SIZE_BY_WARP_SIZE = SCAN_TILE_SIZE / WARP_SIZE // number of warp-sized groups of elements in a tile (also the number of uintwarp_t words needed to hold one bit per tile element)
};

/**
 *	@brief calculates distance to the nearest segment boundary to the left of this thread's value
 *
 *	The head flag of the calling thread itself counts as a boundary (distance 0). If there
 *	is no head flag at or to the left of the thread in the whole workgroup, the distance
 *	is measured to element 0 (the result equals li).
 *
 *	@param[in] li is local id of the calling thread
 *	@param[in] b_flag is head flag of this thread's value
 *	@param[out] p_delta is local memory scratch space for 2 * SCAN_WORKGROUP_WARP_NUM + 1 elements;
 *		the first SCAN_WORKGROUP_WARP_NUM values are warp flag ballots, the next
 *		SCAN_WORKGROUP_WARP_NUM values are the positions of the closest head flag to the left
 *		of each warp and the last value is the ballot of nonzero warp flags (the workgroup flags)
 *
 *	@return Returns the number of elements between this thread and the start of its segment.
 *
 *	@note This contains barriers and so it needs to be called by all the threads of the workgroup.
 */
inline int n_SegScan_SegDistance(unsigned int li, bool b_flag, __local uintwarp_t *p_delta)
{
	STATIC_ASSERT(SCAN_WORKGROUP_WARP_NUM <= WARP_SIZE, SEGMENTED_SCAN_CANNOT_WORK_WITH_OVER_WARP_SIZE_WARPS); // wouldn't be able to use ballot

	const unsigned int n_warp = li >> LOG_WARP_SIZE;
	const unsigned int n_lane = li & (WARP_SIZE - 1);
	const uintwarp_t n_warp_mask = ((uintwarp_t)-1) >> (WARP_SIZE - 1 - n_lane); // inclusive search (bits 0 through n_lane) // needed at the end
	const uintwarp_t n_workgroup_mask = ((uintwarp_t)-1 >> 1) >> (WARP_SIZE - 1 - n_lane); // exclusive search (bits 0 through n_lane - 1) // t_odo - is this n_warp_mask >> 1? could save ops but also could serialize the thing

	const uintwarp_t n_warp_flags = n_warp_ballot(b_flag); // could use bit reversal here to get rid of the -31 as well
	//if(!n_lane) // avoid bank conflicts // seen to cause problems on K40 but not in this particular instance
		p_delta[n_warp] = n_warp_flags; // all the threads of the warp store the same value
	// the first SCAN_WORKGROUP_WARP_NUM elements form a bit array of b_flag for each thread

	barrier(CLK_LOCAL_MEM_FENCE);

	if(li < SCAN_WORKGROUP_WARP_NUM) { // one thread per warp of the workgroup; here li equals n_lane
		uintwarp_t n_workgroup_flags = n_warp_ballot(p_delta[li]); // bit i is set if warp i has any flags // could use bit reversal here to get rid of the -31 as well
		p_delta[2 * SCAN_WORKGROUP_WARP_NUM] = n_workgroup_flags; // store this
		// since 32*32 = 1024, n_workgroup_flags contains reduction of each warp and coincidentally holds the entire array; can static assert NT to be <= 32 * WARP_SIZE

		int n_warp_start_warp = n_bfind_uintwarp_t(n_workgroup_mask & n_workgroup_flags); // finds position of the first flag to the left, not including this warp's flags (if any) // t_odo - should we use bfind to save the subtraction?
		// finds where the warp segment starts (in which warp ballot word), -1 if there is no flag to the left

		int n_warp_start_elem = (n_warp_start_warp != -1)?
			(n_bfind_uintwarp_t(p_delta[n_warp_start_warp]) + (n_warp_start_warp << LOG_WARP_SIZE)) : 0;
		// refines to exactly on which element does it start (0 if there is no flag to the left)

		p_delta[SCAN_WORKGROUP_WARP_NUM + li] = n_warp_start_elem; // the second SCAN_WORKGROUP_WARP_NUM elements contain the first boundary to the left of each warp
		// store for everyone to see; warp synchronous, no sync needed
	}

	barrier(CLK_LOCAL_MEM_FENCE);

	//n_warp_flags &= n_warp_mask;
	int n_thread_start_elem = n_bfind_uintwarp_t(n_warp_mask & n_warp_flags); // finds position of the first flag to the left, including this thread's flag // t_odo - should we use bfind to save the subtraction?
	int n_seg_start_distance;
	if(n_thread_start_elem != -1) // todo - is using >= 0 better?
		n_seg_start_distance = n_lane - n_thread_start_elem; // inside this warp
		//n_thread_start_elem = li - (n_thread_start_elem + (~(WARP_SIZE - 1) & li)); // inside this warp // todo - ~31 & li = n_warp * WARP_SIZE - replace it
	else
		n_seg_start_distance = li - p_delta[SCAN_WORKGROUP_WARP_NUM + n_warp]; // in the warp to the left, or at 0 if this is the first warp
	// find the closest flag to the left of this thread within the warp, include the flag for this thread

	return n_seg_start_distance; // moved the "li -" part up
	//return li - n_thread_start_elem; // calculate how far to the left should this thread scan and use that as a threshold for the scan ... makes sense
}

/**
 *	@brief local memory scratch space for the workgroup segmented scan
 *
 *	The two members share the same memory: p_delta is used while calculating segment
 *	distances and p_workspace is used by the scan itself, so a barrier is needed between
 *	the last access to one and the first access to the other.
 */
typedef union {
	uintwarp_t p_delta[2 * SCAN_WORKGROUP_WARP_NUM + 1]; // the last element is filled with the workgroup flags scan
#ifdef LOCAL_SEG_SCAN_SAVE_MEMORY
	_TyScalar p_workspace[CONFLICT_FREE_SIZE(SCAN_LOCAL_WORK_SIZE + 1)]; // a single buffer, scanned in place
#else // LOCAL_SEG_SCAN_SAVE_MEMORY
	_TyScalar p_workspace[2 * SCAN_LOCAL_WORK_SIZE + 1]; // two buffers for the ping-pong scan; of no consequence unless SCAN_LOCAL_WORK_SIZE == SCAN_TILE_SIZE
#endif // LOCAL_SEG_SCAN_SAVE_MEMORY
} TLocalSegScanStorage;

/**
 *	@brief workgroup-wide exclusive segmented scan, processes SCAN_LOCAL_WORK_SIZE elements (one per thread)
 *
 *	@param[in] li is local id of the calling thread
 *	@param[in] n_segment_distance is distance of this thread to the start of its segment
 *		(values further to the left than that are not accumulated); must not exceed li
 *	@param[in] x is the value of this thread
 *	@param[in,out] p_storage is local memory scratch space (only p_workspace is used here)
 *	@param[out] p_tile_carry_out is filled with the inclusive scan result of the last
 *		thread of the workgroup (written by every thread, each to its own copy)
 *
 *	@return Returns the inclusive scan result of the thread to the left,
 *		or REDUCTION_IDENTITY for the first thread.
 *
 *	@note The returned value is taken from the left neighbor regardless of whether
 *		this thread starts a new segment.
 *	@note This contains barriers and so it needs to be called by all the threads of the workgroup.
 */
inline _TyScalar t_LocalSegScan(unsigned int li, unsigned int n_segment_distance,
	_TyScalar x, __local TLocalSegScanStorage *p_storage, _TyScalar *p_tile_carry_out)
{
	__local _TyScalar *p_values = p_storage->p_workspace; // ...

#ifdef LOCAL_SEG_SCAN_SAVE_MEMORY
	p_values[CONFLICT_FREE_INDEX(li)] = x;
	// save each thread's value

	barrier(CLK_LOCAL_MEM_FENCE);

	//unsigned int n_first = 0;
	#pragma unroll
	for(unsigned int n_offset = 1; n_offset < SCAN_LOCAL_WORK_SIZE; n_offset *= 2) {
		if(n_segment_distance >= n_offset) 
			x = t_ReductionReduceOp(p_values[CONFLICT_FREE_INDEX(/*n_first +*/ li - n_offset)], x); // li >= n_segment_distance always holds and so n_offset <= li
		//n_first = SCAN_LOCAL_WORK_SIZE - n_first;

		barrier(CLK_LOCAL_MEM_FENCE); // everyone needs to finish reading before the values are overwritten in place

		p_values[CONFLICT_FREE_INDEX(/*n_first +*/ li)] = x;

		barrier(CLK_LOCAL_MEM_FENCE); // everyone needs to finish writing before the next round of reads
	}
	// inclusive in-place scan (single buffer, two barriers per step), not work-efficient but performs the reduction in fewer steps than Harris' scan

	x = (li)? p_values[CONFLICT_FREE_INDEX(/*n_first +*/ li - 1)] : REDUCTION_IDENTITY;
	*p_tile_carry_out = p_values[CONFLICT_FREE_INDEX(/*n_first +*/ SCAN_LOCAL_WORK_SIZE - 1)];
	// get the exclusive scan.
#else // LOCAL_SEG_SCAN_SAVE_MEMORY
	p_values[li] = x;
	// save each thread's value

	barrier(CLK_LOCAL_MEM_FENCE);

	unsigned int n_first = 0; // offset of the buffer holding the current values (0 or SCAN_LOCAL_WORK_SIZE)
	#pragma unroll
	for(unsigned int n_offset = 1; n_offset < SCAN_LOCAL_WORK_SIZE; n_offset *= 2) {
		if(n_offset <= n_segment_distance) 
			x = t_ReductionReduceOp(p_values[n_first + li - n_offset], x);
		n_first = SCAN_LOCAL_WORK_SIZE - n_first; // swap the buffers

		p_values[n_first + li] = x; // written to the other buffer, no barrier needed between the read and the write

		barrier(CLK_LOCAL_MEM_FENCE);
	}
	// inclusive ping-ponging scan, not work-efficient but performs the reduction in fewer steps than Harris' scan

	x = (li)? p_values[n_first + li - 1] : REDUCTION_IDENTITY;
	*p_tile_carry_out = p_values[n_first + SCAN_LOCAL_WORK_SIZE - 1];
	// get the exclusive scan.
#endif // LOCAL_SEG_SCAN_SAVE_MEMORY

	return x;
}

/**
 *	@brief workgroup-wide exclusive segmented scan driven by head flags,
 *		processes SCAN_LOCAL_WORK_SIZE elements (one per thread)
 *
 *	@param[in] li is local id of the calling thread
 *	@param[in] b_flag is head flag of this thread's value
 *	@param[in] x is the value of this thread
 *	@param[in,out] p_storage is local memory scratch space
 *	@param[out] p_tile_carry_out is filled with the scan result of the last thread of the workgroup
 *
 *	@return Returns the result of t_LocalSegScan() for this thread.
 *
 *	@note This contains barriers and so it needs to be called by all the threads of the workgroup.
 */
inline _TyScalar t_LocalSegScan_CalcDist(int li, bool b_flag, _TyScalar x,
	__local TLocalSegScanStorage *p_storage, _TyScalar *p_tile_carry_out)
{
	const int n_distance_to_head = n_SegScan_SegDistance(li, b_flag, p_storage->p_delta);
	// how far to the left does the segment of this thread reach

	barrier(CLK_LOCAL_MEM_FENCE);
	// p_delta shares memory with the scan workspace, wait for
	// all the threads to finish reading it before the scan overwrites it

	const _TyScalar t_scanned = t_LocalSegScan(li, n_distance_to_head,
		x, p_storage, p_tile_carry_out);
	return t_scanned;
}

/**
 *	@brief workgroup-wide exclusive segmented scan driven by head flags,
 *		which also returns the workgroup flags
 *
 *	@param[in] li is local id of the calling thread
 *	@param[in] b_flag is head flag of this thread's value
 *	@param[in] x is the value of this thread
 *	@param[in,out] p_storage is local memory scratch space
 *	@param[out] p_tile_carry_out is filled with the scan result of the last thread of the workgroup
 *	@param[out] p_tile_flags is filled with the workgroup flags, as left in the last
 *		element of p_delta by n_SegScan_SegDistance() (converted to uint32_t)
 *
 *	@return Returns the result of t_LocalSegScan() for this thread.
 *
 *	@note This contains barriers and so it needs to be called by all the threads of the workgroup.
 */
inline _TyScalar t_LocalSegScan_CalcDist_GetTileFlags(int li, bool b_flag, _TyScalar x,
	__local TLocalSegScanStorage *p_storage, _TyScalar *p_tile_carry_out, uint32_t *p_tile_flags)
{
	const int n_distance_to_head = n_SegScan_SegDistance(li, b_flag, p_storage->p_delta);
	// how far to the left does the segment of this thread reach

	const uintwarp_t n_workgroup_flags = p_storage->p_delta[2 * SCAN_WORKGROUP_WARP_NUM];
	*p_tile_flags = n_workgroup_flags;
	// the distance calculation left this in the local memory, visible to all the threads;
	// needs to be read before the barrier, the scan will overwrite it

	barrier(CLK_LOCAL_MEM_FENCE);

	const _TyScalar t_scanned = t_LocalSegScan(li, n_distance_to_head,
		x, p_storage, p_tile_carry_out);
	return t_scanned;
}

// local memory scratch space for n_Decode_HeadFlags()
typedef struct {
#ifndef USE_STRIDED_PACKED_HEAD_FLAGS
	// bit array of head flags, one bit per tile element
	uintwarp_t p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE + 1]; // last one to simplify the addressing logic
#else // !USE_STRIDED_PACKED_HEAD_FLAGS
	// only needed to load the flags from global memory to registers, the rest happens in registers
	// this is not supposed to be uintwarp_t, those are not bitfields but rather "flag elements"
	uint32_t p_shared_hf[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
#endif // !USE_STRIDED_PACKED_HEAD_FLAGS
} THeadFlag_DecodeStorage; // note that this is not empty, even if USE_PACKED_HEAD_FLAGS is not defined

// local memory scratch space for n_Read_Packed_HeadFlags()
typedef struct {
#ifndef USE_STRIDED_PACKED_HEAD_FLAGS
	// contiguous bit array of head flags
	uintwarp_t p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE + 1]; // last one to simplify the addressing logic
#else // !USE_STRIDED_PACKED_HEAD_FLAGS
	// strided bit array of head flags, no padding element
	uintwarp_t p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE];
#endif // !USE_STRIDED_PACKED_HEAD_FLAGS
} THeadFlag_ReadStorage; // note that this is not empty, even if USE_PACKED_HEAD_FLAGS is not defined

// local memory scratch space for n_Read_PackedUninterleaved_HeadFlags() and
// n_Read_PackedUninterleaved_HeadFlags_OneMore(); the layout does not depend on USE_STRIDED_PACKED_HEAD_FLAGS
typedef struct {
	uintwarp_t p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE + 1]; // last one to simplify the addressing logic
} THeadFlag_ReadStorage1; // note that this is not empty, even if USE_PACKED_HEAD_FLAGS is not defined

// local memory scratch space for n_Decode_HeadFlags_WritePacked() and for the Pack_HeadFlags kernel
typedef struct {
#ifndef USE_STRIDED_PACKED_HEAD_FLAGS
	// contiguous bit array of head flags
	uintwarp_t p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE + 1]; // last one to simplify the addressing logic
#else // !USE_STRIDED_PACKED_HEAD_FLAGS
	union { // !! not used simultaneously
		// strided bit array of head flags
		uintwarp_t p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE];

		// temporary for loading the unpacked flags
		// this is not supposed to be uintwarp_t, those are not bitfields but rather "flag elements"
		uint32_t p_shared_hf[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
	};
#endif // !USE_STRIDED_PACKED_HEAD_FLAGS
} THeadFlag_DecodePackStorage; // note that this is not empty, even if USE_PACKED_HEAD_FLAGS is not defined 

/**
 *	@brief decodes head flags stored as words, returns thread's own flags
 *
 *	@param[in] p_head_flags is pointer to the head flags of this tile in global memory (one uint32_t per element)
 *	@param[in] n_remainder is the remaining number of elements, passed to the load macros
 *		(presumably limits the reads from p_head_flags - confirm in the kernel utils headers)
 *	@param[in] n_tid is local id of the calling thread
 *	@param[in] n_warp is warp index of the calling thread (only used if USE_STRIDED_PACKED_HEAD_FLAGS is not defined)
 *	@param[in] n_lane is lane index of the calling thread (only used if USE_STRIDED_PACKED_HEAD_FLAGS is not defined)
 *	@param[in,out] p_temp is local memory scratch space
 *
 *	@return Returns head flags of the elements of the calling thread, packed in a single word.
 *
 *	@note The non-strided path contains a barrier (if SCAN_ITEMS_THREAD > 1) and the load macros
 *		may synchronize as well, this needs to be called by all the threads of the workgroup.
 */
inline uint32_t n_Decode_HeadFlags(__global const uint32_t *p_head_flags, const unsigned int n_remainder, 
	const unsigned int n_tid, const unsigned int n_warp, const unsigned int n_lane,
	__local THeadFlag_DecodeStorage *p_temp)
{
#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	__local uint32_t *p_shared_hf = p_temp->p_shared_hf;
	// this is not supposed to be uintwarp_t, those are not bitfields but rather "flag elements"

	int p_my_hf[SCAN_ITEMS_THREAD];
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_hf, n_tid, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_head_flags, n_remainder, 0, p_shared_hf);
	// cooperatively read all the head flags (need two more clocks to handle the last element)

	uint32_t n_my_head_flags;
	REGISTER_DECODE_PACK_BITARRAY(n_my_head_flags, SCAN_ITEMS_THREAD, p_my_hf);
	// pack the flag elements of this thread to a single word, all in registers
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	__local uintwarp_t *p_shared_packed_hf = p_temp->p_shared_packed_hf;

	GLOBAL_TO_LOCAL_DECODE_PACK_BITARRAY(p_shared_packed_hf, n_tid,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_head_flags, n_remainder, 0);
	// cooperatively read the flags from global memory (coalesced) and ballot it to shared memory

#if SCAN_ITEMS_THREAD > 1
	barrier(CLK_LOCAL_MEM_FENCE);
#else // SCAN_ITEMS_THREAD > 1
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	// if SCAN_ITEMS_THREAD == 1 then this is all warp synchronous
#endif // SCAN_ITEMS_THREAD > 1

	uint32_t n_my_head_flags;
	LOCAL_READ_BITARRAY(n_my_head_flags, n_tid, n_warp, n_lane,
		SCAN_ITEMS_THREAD, p_shared_packed_hf); // needs padding elem in p_shared_packed_hf
	// read head flags
#endif // USE_STRIDED_PACKED_HEAD_FLAGS

	return n_my_head_flags;
}

// local memory scratch space for n_Decode_HeadFlags_OneMore()
typedef struct {
#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	uint32_t p_shared_hf[ORDERED_LOAD_OVERLAP_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, 1, SCAN_TILE_SIZE)]; // only need this to load the flags from global memory to registers (with overlap of one element), the rest happens in registers
	// this is not supposed to be uintwarp_t, those are not bitfields but rather "flag elements"
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	uintwarp_t p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE + 1]; // last one to simplify the addressing logic
#endif // USE_STRIDED_PACKED_HEAD_FLAGS
} THeadFlag_DecodeStorage_OneMore;

/**
 *	@brief decodes head flags stored as words, returns thread's own flags
 *		and one more flag (overlap of one element)
 *
 *	@param[in] p_head_flags is pointer to the head flags of this tile in global memory (one uint32_t per element)
 *	@param[in] n_remainder is the remaining number of elements, passed to the load macros
 *		(presumably limits the reads from p_head_flags - confirm in the kernel utils headers)
 *	@param[in] n_tid is local id of the calling thread
 *	@param[in] n_warp is warp index of the calling thread (only used if USE_STRIDED_PACKED_HEAD_FLAGS is not defined)
 *	@param[in] n_lane is lane index of the calling thread (only used if USE_STRIDED_PACKED_HEAD_FLAGS is not defined)
 *	@param[in,out] p_temp is local memory scratch space
 *
 *	@return Returns SCAN_ITEMS_THREAD + 1 head flags packed in a single word (presumably
 *		the flags of this thread's elements followed by the flag of the next element).
 *
 *	@note The non-strided path contains a barrier (if SCAN_ITEMS_THREAD > 1) and the load macros
 *		may synchronize as well, this needs to be called by all the threads of the workgroup.
 */
inline uint32_t n_Decode_HeadFlags_OneMore(__global const uint32_t *p_head_flags, const unsigned int n_remainder, 
	const unsigned int n_tid, const unsigned int n_warp, const unsigned int n_lane,
	__local THeadFlag_DecodeStorage_OneMore *p_temp)
{
#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	__local uint32_t *p_shared_hf = p_temp->p_shared_hf;
	// this is not supposed to be uintwarp_t, those are not bitfields but rather "flag elements"

	int p_my_hf[SCAN_ITEMS_THREAD + 1];
	GLOBAL_TO_REGISTER_ORDERED_OVERLAP_CFI_COND(CFI_FLAG, p_my_hf, n_tid, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, 1, p_head_flags, n_remainder, 0, p_shared_hf);
	// cooperatively read all the head flags (need two more clocks to handle the last element)

	uint32_t n_my_head_flags;
	REGISTER_DECODE_PACK_BITARRAY(n_my_head_flags, SCAN_ITEMS_THREAD + 1, p_my_hf);
	// pack the flag elements of this thread to a single word, all in registers
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	__local uintwarp_t *p_shared_packed_hf = p_temp->p_shared_packed_hf;

	GLOBAL_TO_LOCAL_DECODE_PACK_BITARRAY(p_shared_packed_hf, n_tid,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD + 1, p_head_flags, n_remainder, 0);
	// cooperatively read the flags from global memory (coalesced) and ballot it to shared memory
	// NOTE(review): this passes SCAN_ITEMS_THREAD + 1 items per thread while p_shared_packed_hf
	// only has SCAN_TILE_SIZE_BY_WARP_SIZE + 1 elements - confirm that the macro stays within bounds

	/*if(!n_tid)
		p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE] = 0;*/
	// set the last warp flags to zero

#if SCAN_ITEMS_THREAD > 1
	barrier(CLK_LOCAL_MEM_FENCE);
#else // SCAN_ITEMS_THREAD > 1
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	// if SCAN_ITEMS_THREAD == 1 then this is all warp synchronous
	// NOTE(review): the extra flag read by the last lane of a warp presumably lies in a word
	// written by another warp - confirm that a fence without a barrier is sufficient here
#endif // SCAN_ITEMS_THREAD > 1

	uint32_t n_my_head_flags;
	LOCAL_READ_BITARRAY_OVERLAP(n_my_head_flags, n_tid, n_warp, n_lane,
		SCAN_ITEMS_THREAD, 1, p_shared_packed_hf); // needs padding elem in p_shared_packed_hf
	// read head flags
#endif // USE_STRIDED_PACKED_HEAD_FLAGS

	return n_my_head_flags;
}

/**
 *	@brief calculates the size of interleaved (strided) head flags array
 *
 *	In the strided layout, the words holding bit b of all the threads form a group
 *	of SCAN_WORKGROUP_WARP_NUM words, so even a short remainder occupies words in
 *	several such groups.
 *
 *	@param[in] n_remainder is remainder in bits (the number of head flags left, can be
 *		zero or greater than SCAN_TILE_SIZE)
 *
 *	@return Returns the corresponding size of packed head flags, in uintwarp_t.
 *
 *	@note The result is not clamped to SCAN_TILE_SIZE_BY_WARP_SIZE, the caller needs to do that.
 *	@note If SCAN_ITEMS_THREAD is not a power of two, the result is rounded up
 *		to whole groups of SCAN_WORKGROUP_WARP_NUM words (avoids a division).
 *	@note Zero remainder yields zero in all the configurations.
 */
inline unsigned int n_StridedHeadFlag_Remainder(const unsigned int n_remainder)
{
	/*unsigned int n_highest_thread = n_remainder / SCAN_ITEMS_THREAD;
	unsigned int n_highest_thread_bit = min(n_remainder, (unsigned int)SCAN_ITEMS_THREAD);
	unsigned int n_warps = (n_highest_thread_bit - 1) * SCAN_WORKGROUP_WARP_NUM +
		((n_highest_thread + WARP_SIZE - 1) >> LOG_WARP_SIZE);*/
	// exact formula, slow (and underflows for zero n_remainder)

	STATIC_ASSERT(SCAN_WORKGROUP_WARP_NUM * SCAN_ITEMS_THREAD == SCAN_TILE_SIZE_BY_WARP_SIZE, JUST_CHECKING); // should be

#if SCAN_ITEMS_THREAD > 1
	unsigned int n_highest_thread_bit = min(n_remainder, (unsigned int)SCAN_ITEMS_THREAD);
	// the number of bits used by the first thread
#if b_Is_POT(SCAN_ITEMS_THREAD)
	enum {
		n_log_scan_items_thread = n_Log2(SCAN_ITEMS_THREAD),
		n_shifted_warp_size_minus_1 = (WARP_SIZE - 1) << n_log_scan_items_thread,
		n_final_shift = LOG_WARP_SIZE + n_log_scan_items_thread
	};
	//unsigned int n_highest_thread = n_remainder >> n_log_scan_items_thread;
	unsigned int n_full_bit_groups = (n_highest_thread_bit)? n_highest_thread_bit - 1 : 0;
	// the number of fully occupied groups of words; guard against unsigned underflow
	// for zero n_remainder, which would otherwise yield a huge size instead of zero
	unsigned int n_warps = n_full_bit_groups * SCAN_WORKGROUP_WARP_NUM +
	//	((n_highest_thread + WARP_SIZE - 1) >> LOG_WARP_SIZE);
		((n_remainder + n_shifted_warp_size_minus_1) >> n_final_shift);
	// the second term is the number of warps occupied in the last group,
	// the division by SCAN_ITEMS_THREAD is merged with the division by WARP_SIZE
#else // b_Is_POT(SCAN_ITEMS_THREAD)
	//unsigned int n_highest_thread = n_remainder / SCAN_ITEMS_THREAD; // don't want this divide
	unsigned int n_warps = n_highest_thread_bit * SCAN_WORKGROUP_WARP_NUM; // entire workgroups
#endif // b_Is_POT(SCAN_ITEMS_THREAD)
#else // SCAN_ITEMS_THREAD > 1
	unsigned int n_highest_thread = n_remainder; // divide by 1
	//unsigned int n_highest_thread_bit = min(n_remainder, (unsigned int)SCAN_ITEMS_THREAD); // always 1
	unsigned int n_warps = /*(n_highest_thread_bit - 1) * SCAN_WORKGROUP_WARP_NUM +*/ // always 0
		((n_highest_thread + WARP_SIZE - 1) >> LOG_WARP_SIZE);
#endif // SCAN_ITEMS_THREAD > 1

	return n_warps;
}

/**
 *	@brief decodes head flags stored as words, writes back packed head flags
 *		and returns thread's own flags
 *
 *	@param[out] p_packed_head_flags is pointer to the packed head flags of this tile in global memory
 *	@param[in] p_head_flags is pointer to the head flags of this tile in global memory (one uint32_t per element)
 *	@param[in] n_remainder is the remaining number of elements; limits the number of
 *		packed words written (at most SCAN_TILE_SIZE_BY_WARP_SIZE words are written)
 *	@param[in] n_tid is local id of the calling thread
 *	@param[in] n_warp is warp index of the calling thread
 *	@param[in] n_lane is lane index of the calling thread
 *	@param[in,out] p_temp is local memory scratch space
 *
 *	@return Returns head flags of the elements of the calling thread, packed in a single word.
 *
 *	@note This contains barriers and so it needs to be called by all the threads of the workgroup.
 */
inline uint32_t n_Decode_HeadFlags_WritePacked(__global uintwarp_t *p_packed_head_flags,
	__global const uint32_t *p_head_flags, const unsigned int n_remainder,
	const unsigned int n_tid, const unsigned int n_warp, const unsigned int n_lane,
	__local THeadFlag_DecodePackStorage *p_temp)
{
	__local uintwarp_t *p_shared_packed_hf = p_temp->p_shared_packed_hf;

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	uint32_t n_my_head_flags;
	{
		__local uint32_t *p_shared_hf = p_temp->p_shared_hf;
		// this is not supposed to be uintwarp_t, those are not bitfields but rather "flag elements"

		int p_my_hf[SCAN_ITEMS_THREAD];
		GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_hf, n_tid, SCAN_LOCAL_WORK_SIZE,
			SCAN_ITEMS_THREAD, p_head_flags, n_remainder, 0, p_shared_hf);
		// cooperatively read all the head flags (need two more clocks to handle the last element)

		barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using p_shared_hf so that we can use p_shared_packed_hf (the two share memory)

		REGISTER_TO_LOCAL_DECODE_PACK_AND_READ_STRIDED_BITARRAY(p_shared_packed_hf, n_my_head_flags, n_warp,
			n_lane, SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_hf);
		// pack the flags to local memory and get this thread's flags at the same time

		barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to fill p_shared_packed_hf
	}
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	GLOBAL_TO_LOCAL_DECODE_PACK_BITARRAY(p_shared_packed_hf, n_tid,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_head_flags, n_remainder, 0);
	// cooperatively read the flags from global memory (coalesced) and ballot it to shared memory

	barrier(CLK_LOCAL_MEM_FENCE);
	// would need a barrier below anyway, as the warps that generate the
	// head flags are different warps than the ones that will write it

	uint32_t n_my_head_flags;
	LOCAL_READ_BITARRAY(n_my_head_flags, n_tid, n_warp, n_lane,
		SCAN_ITEMS_THREAD, p_shared_packed_hf); // needs padding elem in p_shared_packed_hf
	// read head flags
#endif // USE_STRIDED_PACKED_HEAD_FLAGS

	STATIC_ASSERT(SCAN_LOCAL_WORK_SIZE >= SCAN_TILE_SIZE_BY_WARP_SIZE, DECODE_PACK_HEADFLAGS_NOT_ENOUGH_THREADS_TO_STORE_COMPRESSED_FLAGS); // each thread writes at most one packed word below

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	unsigned int n_warps = n_StridedHeadFlag_Remainder(n_remainder);
	unsigned int n_remainder_warps = min((unsigned int)SCAN_TILE_SIZE_BY_WARP_SIZE, n_warps); // each bit fills one warp due to striding, so almost all the tiles will be full unless having only a few elements in them
	//unsigned int n_remainder_warps = min((unsigned int)SCAN_TILE_SIZE_BY_WARP_SIZE, n_remainder * SCAN_ITEMS_THREAD); // each bit fills one warp due to striding, so almost all the tiles will be full unless having only a few elements in them
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	unsigned int n_remainder_warps = min((unsigned int)SCAN_TILE_SIZE_BY_WARP_SIZE, ((n_remainder + WARP_SIZE - 1) >> LOG_WARP_SIZE));
#endif // USE_STRIDED_PACKED_HEAD_FLAGS
	// the number of packed words to write, clamped to the size of the tile
	if(n_tid < n_remainder_warps)
		p_packed_head_flags[n_tid] = p_shared_packed_hf[n_tid];
	// cooperatively write the packed head flags to global memory

	return n_my_head_flags;
}

/**
 *	@brief decodes head flags stored as bit array, returns thread's own flags
 *
 *	@param[in] p_packed_head_flags is pointer to the packed head flags of this tile in global memory
 *		(strided if USE_STRIDED_PACKED_HEAD_FLAGS is defined, contiguous otherwise)
 *	@param[in] n_remainder is the remaining number of elements; packed words beyond
 *		the corresponding size are not read and are replaced by zeros
 *	@param[in] n_tid is local id of the calling thread
 *	@param[in] n_warp is warp index of the calling thread
 *	@param[in] n_lane is lane index of the calling thread
 *	@param[in,out] p_temp is local memory scratch space
 *
 *	@return Returns head flags of the elements of the calling thread, packed in a single word.
 *
 *	@note This contains a barrier and so it needs to be called by all the threads of the workgroup.
 */
inline uint32_t n_Read_Packed_HeadFlags(__global const uintwarp_t *p_packed_head_flags,
	const unsigned int n_remainder, const unsigned int n_tid, const unsigned int n_warp,
	const unsigned int n_lane, __local THeadFlag_ReadStorage *p_temp)
{
	STATIC_ASSERT(SCAN_LOCAL_WORK_SIZE >= SCAN_TILE_SIZE_BY_WARP_SIZE, SEGMENTED_SCAN_DOWNSWEEP_NOT_ENOUGH_THREADS_TO_LOAD_COMPRESSED_FLAGS); // each thread loads at most one packed word below

	__local uintwarp_t *p_shared_packed_hf = p_temp->p_shared_packed_hf;
#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	unsigned int n_warps = n_StridedHeadFlag_Remainder(n_remainder);
	unsigned int n_remainder_warps = /*min((unsigned int)SCAN_TILE_SIZE_BY_WARP_SIZE,*/ n_warps/*)*/; // each bit fills one warp due to striding, so almost all the tiles will be full unless having only a few elements in them
	//unsigned int n_remainder_warps =  /*min((unsigned int)SCAN_TILE_SIZE_BY_WARP_SIZE,*/ n_remainder * SCAN_ITEMS_THREAD/*)*/; // each bit fills one warp due to striding, so almost all the tiles will be full unless having only a few elements in them
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	unsigned int n_remainder_warps = /*min((unsigned int)SCAN_TILE_SIZE_BY_WARP_SIZE,*/ ((n_remainder + WARP_SIZE - 1) >> LOG_WARP_SIZE)/*)*/;
#endif // USE_STRIDED_PACKED_HEAD_FLAGS
	// no need to clamp this to the tile size, the condition below limits n_tid already
	if(n_tid < SCAN_TILE_SIZE_BY_WARP_SIZE)
		p_shared_packed_hf[n_tid] = (n_tid < n_remainder_warps)? p_packed_head_flags[n_tid] : 0;
	// cooperatively read the packed head flags from global memory to local memory, zero-fill past the end

	barrier(CLK_LOCAL_MEM_FENCE);

	uint32_t n_my_head_flags;
#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	LOCAL_READ_STRIDED_BITARRAY(n_my_head_flags, n_warp, n_lane,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_shared_packed_hf);
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	LOCAL_READ_BITARRAY(n_my_head_flags, n_tid, n_warp, n_lane,
		SCAN_ITEMS_THREAD, p_shared_packed_hf); // needs padding elem in p_shared_packed_hf
	// NOTE(review): the padding element is not written in this function - confirm
	// that the macro does not depend on its value
#endif // USE_STRIDED_PACKED_HEAD_FLAGS

	return n_my_head_flags;
}

// local memory scratch space for n_Read_Packed_HeadFlags_OneMore();
// the same layout is used for both the strided and the contiguous packed head flags
typedef struct {
/*#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	uintwarp_t p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE * 2 + 1]; // last one to simplify the addressing logic
#else*/
	uintwarp_t p_shared_packed_hf[SCAN_TILE_SIZE_BY_WARP_SIZE + 1]; // last one to simplify the addressing logic
//#endif
} THeadFlag_ReadStorage_OneMore; // note that this is not empty, even if USE_PACKED_HEAD_FLAGS is not defined

/**
 *	@brief decodes head flags stored as bit array, returns thread's own flags
 *		and one more flag (overlap of one element)
 *
 *	@param[in] p_packed_head_flags is pointer to the packed head flags of this tile in global memory
 *		(strided if USE_STRIDED_PACKED_HEAD_FLAGS is defined, contiguous otherwise)
 *	@param[in] n_remainder is the remaining number of elements; packed words beyond
 *		the corresponding size are not read and are replaced by zeros
 *	@param[in] n_tid is local id of the calling thread
 *	@param[in] n_warp is warp index of the calling thread
 *	@param[in] n_lane is lane index of the calling thread
 *	@param[in,out] p_temp is local memory scratch space
 *
 *	@return Returns the head flags packed in a single word (presumably SCAN_ITEMS_THREAD + 1
 *		flags, the last one belonging to the next element).
 *
 *	@note Unlike in n_Read_Packed_HeadFlags(), the word following the tile is loaded as well
 *		(only if the data extend past this tile), hence the stricter static assertion.
 *	@note This contains a barrier and so it needs to be called by all the threads of the workgroup.
 */
inline uint32_t n_Read_Packed_HeadFlags_OneMore(__global const uintwarp_t *p_packed_head_flags,
	const unsigned int n_remainder, const unsigned int n_tid, const unsigned int n_warp,
	const unsigned int n_lane, __local THeadFlag_ReadStorage_OneMore *p_temp)
{
	STATIC_ASSERT(SCAN_LOCAL_WORK_SIZE > SCAN_TILE_SIZE_BY_WARP_SIZE, SEGMENTED_SCAN_DOWNSWEEP_NOT_ENOUGH_THREADS_TO_LOAD_COMPRESSED_FLAGS); // need one more thread to load the word following the tile

	__local uintwarp_t *p_shared_packed_hf = p_temp->p_shared_packed_hf;
#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	unsigned int n_remainder_warps = n_StridedHeadFlag_Remainder(n_remainder);
	if(n_tid < SCAN_TILE_SIZE_BY_WARP_SIZE) // difficult to calculate remainder correctly
		p_shared_packed_hf[n_tid] = (n_tid < n_remainder_warps)? p_packed_head_flags[n_tid] : 0;
	else if(n_tid == SCAN_TILE_SIZE_BY_WARP_SIZE)
		p_shared_packed_hf[n_tid] = (SCAN_TILE_SIZE < n_remainder)? p_packed_head_flags[n_tid] : 0; // the first word of the next tile, only read if there are elements past this tile
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	unsigned int n_remainder_warps = /*min((unsigned int)SCAN_TILE_SIZE_BY_WARP_SIZE,*/ ((n_remainder /*+ 1*/ + WARP_SIZE - 1) >> LOG_WARP_SIZE)/*)*/; // the +1 is not supposed to be there!
	if(n_tid <= SCAN_TILE_SIZE_BY_WARP_SIZE)
		p_shared_packed_hf[n_tid] = (n_tid < n_remainder_warps)? p_packed_head_flags[n_tid] : 0;
#endif // USE_STRIDED_PACKED_HEAD_FLAGS
	// cooperatively read the packed head flags from global memory to local memory, zero-fill past the end

	barrier(CLK_LOCAL_MEM_FENCE);

	uint32_t n_my_head_flags;
#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	LOCAL_READ_STRIDED_BITARRAY_OVERLAP_1(n_my_head_flags, n_warp, n_lane,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_shared_packed_hf); // just read one more, those overlap automatically
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	LOCAL_READ_BITARRAY_OVERLAP(n_my_head_flags, n_tid, n_warp, n_lane,
		SCAN_ITEMS_THREAD, 1, p_shared_packed_hf); // needs padding elem in p_shared_packed_hf
#endif // USE_STRIDED_PACKED_HEAD_FLAGS

	return n_my_head_flags;
}

/**
 *	@brief decodes head flags stored as a non-interleaved bit array, returns thread's own flags
 *
 *	@param[in] p_packed_head_flags is pointer to the packed head flags of this tile in global memory
 *	@param[in] n_remainder is the remaining number of elements; packed words beyond
 *		the corresponding size are not read and are replaced by zeros
 *	@param[in] n_tid is local id of the calling thread
 *	@param[in] n_warp is warp index of the calling thread
 *	@param[in] n_lane is lane index of the calling thread
 *	@param[in,out] p_temp is local memory scratch space (needs padding elem in p_shared_packed_hf)
 *
 *	@return Returns head flags of the elements of the calling thread, packed in a single word.
 *
 *	@note This contains a barrier and so it needs to be called by all the threads of the workgroup.
 */
inline uint32_t n_Read_PackedUninterleaved_HeadFlags(__global const uintwarp_t *p_packed_head_flags,
	const unsigned int n_remainder, const unsigned int n_tid, const unsigned int n_warp,
	const unsigned int n_lane, __local THeadFlag_ReadStorage1 *p_temp) // needs padding elem in p_shared_packed_hf
{
	STATIC_ASSERT(SCAN_LOCAL_WORK_SIZE >= SCAN_TILE_SIZE_BY_WARP_SIZE, SEGMENTED_SCAN_DOWNSWEEP_NOT_ENOUGH_THREADS_TO_LOAD_COMPRESSED_FLAGS);

	__local uintwarp_t *p_shared_packed_hf = p_temp->p_shared_packed_hf;

	const unsigned int n_valid_word_num = (n_remainder + WARP_SIZE - 1) >> LOG_WARP_SIZE;
	// the number of packed words that hold valid flags (round up)

	if(n_tid < SCAN_TILE_SIZE_BY_WARP_SIZE) {
		if(n_tid < n_valid_word_num)
			p_shared_packed_hf[n_tid] = p_packed_head_flags[n_tid];
		else
			p_shared_packed_hf[n_tid] = 0; // past the end of the data
	}
	// the first threads each copy one packed word from global to local memory

	barrier(CLK_LOCAL_MEM_FENCE);

	uint32_t n_my_head_flags;
	LOCAL_READ_BITARRAY(n_my_head_flags, n_tid, n_warp, n_lane,
		SCAN_ITEMS_THREAD, p_shared_packed_hf); // needs padding elem in p_shared_packed_hf
	// get the flags of this thread

	return n_my_head_flags;
}

/**
 *	@brief decodes head flags stored as a non-interleaved bit array, returns thread's
 *		own flags and one more flag (overlap of one element)
 *
 *	@param[in] p_packed_head_flags is pointer to the packed head flags of this tile in global memory
 *	@param[in] n_remainder is the remaining number of elements; packed words beyond
 *		the corresponding size are not read and are replaced by zeros
 *	@param[in] n_tid is local id of the calling thread
 *	@param[in] n_warp is warp index of the calling thread
 *	@param[in] n_lane is lane index of the calling thread
 *	@param[in,out] p_temp is local memory scratch space (needs padding elem in p_shared_packed_hf)
 *
 *	@return Returns the head flags packed in a single word, as read by LOCAL_READ_BITARRAY_OVERLAP.
 *
 *	@note The padding element (the word following the tile) is loaded as well,
 *		hence the stricter static assertion.
 *	@note This contains a barrier and so it needs to be called by all the threads of the workgroup.
 */
inline uint32_t n_Read_PackedUninterleaved_HeadFlags_OneMore(__global const uintwarp_t *p_packed_head_flags,
	const unsigned int n_remainder, const unsigned int n_tid, const unsigned int n_warp,
	const unsigned int n_lane, __local THeadFlag_ReadStorage1 *p_temp) // needs padding elem in p_shared_packed_hf
{
	STATIC_ASSERT(SCAN_LOCAL_WORK_SIZE > SCAN_TILE_SIZE_BY_WARP_SIZE, SEGMENTED_SCAN_DOWNSWEEP_NOT_ENOUGH_THREADS_TO_LOAD_COMPRESSED_FLAGS);

	__local uintwarp_t *p_shared_packed_hf = p_temp->p_shared_packed_hf;

	const unsigned int n_valid_word_num = (n_remainder + WARP_SIZE - 1) >> LOG_WARP_SIZE;
	// the number of packed words that hold valid flags (round up; there is deliberately no +1 for the extra flag)

	if(n_tid <= SCAN_TILE_SIZE_BY_WARP_SIZE) {
		if(n_tid < n_valid_word_num)
			p_shared_packed_hf[n_tid] = p_packed_head_flags[n_tid];
		else
			p_shared_packed_hf[n_tid] = 0; // past the end of the data
	}
	// the first threads each copy one packed word from global to local memory, including the padding element

	barrier(CLK_LOCAL_MEM_FENCE);

	uint32_t n_my_head_flags;
	LOCAL_READ_BITARRAY_OVERLAP(n_my_head_flags, n_tid, n_warp, n_lane,
		SCAN_ITEMS_THREAD, 1, p_shared_packed_hf); // needs padding elem in p_shared_packed_hf
	// get the flags of this thread and one more

	return n_my_head_flags;
}

#ifdef BUILD_PACK_FLAGS_KERNELS

/**
 *	@brief packs head flags stored as words to a bit array
 *
 *	Each workgroup processes one tile of SCAN_TILE_SIZE head flags and writes up to
 *	SCAN_TILE_SIZE_BY_WARP_SIZE packed words (strided within the tile
 *	if USE_STRIDED_PACKED_HEAD_FLAGS is defined, contiguous otherwise).
 *
 *	@param[out] p_packed_head_flags is pointer to the packed head flags in global memory
 *	@param[in] p_head_flags is pointer to the head flags in global memory (one uint32_t per element)
 *	@param[in] n_size is the number of head flags
 *
 *	@note This needs to be launched with workgroup size of SCAN_LOCAL_WORK_SIZE.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void Pack_HeadFlags(__global uintwarp_t *p_packed_head_flags,
	__global const uint32_t *p_head_flags, const unsigned int n_size)
{
	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // the first element of this tile
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;//min(n_size, n_start + SCAN_TILE_SIZE) - n_start; // data size is always smaller than start (if not then we launched too many work-groups)
	// the number of elements from the start of this tile to the end of the data; not clamped
	// to the tile size, zero if the tile starts past the end of the data

	p_head_flags += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	// offset the pointers to the beginning of this tile (one packed word per WARP_SIZE flags)

	__local union {
		THeadFlag_DecodePackStorage t_storage;
	} temp;

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	{
		int p_my_hf[SCAN_ITEMS_THREAD];
		GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_hf, l, SCAN_LOCAL_WORK_SIZE,
			SCAN_ITEMS_THREAD, p_head_flags, n_remainder, 0, temp.t_storage.p_shared_hf);
		// cooperatively read all the head flags (need two more clocks to handle the last element)

		barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using p_shared_hf so that we can use p_shared_packed_hf (the two share memory)

		REGISTER_TO_LOCAL_DECODE_PACK_STRIDED_BITARRAY(temp.t_storage.p_shared_packed_hf, n_warp,
			n_lane, SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_hf);
		// pack the flags to local memory
	}
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	GLOBAL_TO_LOCAL_DECODE_PACK_BITARRAY(temp.t_storage.p_shared_packed_hf, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_head_flags, n_remainder, 0);
	// cooperatively read the flags from global memory (coalesced) and ballot it to shared memory
#endif // USE_STRIDED_PACKED_HEAD_FLAGS

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to fill p_shared_packed_hf

	STATIC_ASSERT(SCAN_LOCAL_WORK_SIZE >= SCAN_TILE_SIZE_BY_WARP_SIZE, DECODE_PACK_HEADFLAGS_NOT_ENOUGH_THREADS_TO_STORE_COMPRESSED_FLAGS); // each thread writes at most one packed word below

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS
	unsigned int n_warps = n_StridedHeadFlag_Remainder(n_remainder);
	unsigned int n_remainder_warps = min((unsigned int)SCAN_TILE_SIZE_BY_WARP_SIZE, n_warps); // each bit fills one warp due to striding, so almost all the tiles will be full unless having only a few elements in them
#else // USE_STRIDED_PACKED_HEAD_FLAGS
	unsigned int n_remainder_warps = min((unsigned int)SCAN_TILE_SIZE_BY_WARP_SIZE, ((n_remainder + WARP_SIZE - 1) >> LOG_WARP_SIZE));
#endif // USE_STRIDED_PACKED_HEAD_FLAGS
	// the number of packed words to write, clamped to the size of the tile
	if(l < n_remainder_warps)
		p_packed_head_flags[l] = temp.t_storage.p_shared_packed_hf[l];
	// cooperatively write the packed head flags to global memory (entire tiles)
}

#endif // BUILD_PACK_FLAGS_KERNELS

#ifdef BUILD_SEG_SCAN_KERNELS

// Carry pass of the tiled segmented scan: each work-group reduces one tile of p_data (n_size elements in total),
// using p_head_flags as head flags (also n_size; a nonzero head flag starts a new segment, the first head flag
// is implied and does not really matter). The reduction is inclusive (every element is folded into the partial
// of its own segment). No per-element scan is written by this kernel, only two values per tile
// (n_size / SCAN_TILE_SIZE of each, rounded up):
//	p_tile_carry_out[g] receives the sum of the elements in the last segment of tile g
//	p_tile_head_flags[g] receives the bit reduction of the thread flags of tile g, as returned in n_tile_flags
//		by t_LocalSegScan_CalcDist_GetTileFlags() (presumably nonzero if any head flag falls inside the tile - confirm)
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan_Carry(__global _TyScalar *p_tile_carry_out,
	__global uint32_t *p_tile_head_flags, __global const _TyScalar *p_data,
	__global const uint32_t *p_head_flags, const unsigned int n_size) // note that this is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one flag bit per thread item must fit in the 32-bit n_my_head_flags
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // index of the first element of this tile
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements from the start of this tile to the end of the array; max() avoids unsigned underflow
	// and yields 0 if the tile starts past n_size (i.e. too many work-groups were launched)
	// not clamped to SCAN_TILE_SIZE here; NOTE(review): presumably the helpers below limit themselves to a single tile - confirm

	p_data += n_start;
	p_head_flags += n_start;
	// make the pointers tile-relative

	__local union {
		THeadFlag_DecodeStorage headflag_temp; // scratch for decoding the head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // staging buffer for the ordered load
		TLocalSegScanStorage seg_scan_temp; // need seg scan
	} temp;
	// the three members share the same local memory; the barriers below separate their uses

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Decode_HeadFlags(p_head_flags,
		n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// collect head flags; bit i of the result is the head flag of the i-th item of this thread (see the loop below)

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp before temp.p_shared_data is used

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (REDUCTION_IDENTITY is passed as the fill value,
	// presumably used for the items past n_remainder - confirm)

	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in

	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore; unsigned one so that bit 31 is well formed
			partial = REDUCTION_IDENTITY; // a new segment starts at item i, drop what was accumulated so far
		partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
	}
	// perform local scan; partial ends up being the reduction of the last segment touched by this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	uint32_t n_tile_flags;
	_TyScalar carry_out;
	t_LocalSegScan_CalcDist_GetTileFlags(l, n_my_head_flags != 0, partial,
		&temp.seg_scan_temp, &carry_out, &n_tile_flags);
	// spine scan; only the carry_out and n_tile_flags outputs are needed, the returned value is
	// deliberately discarded as this kernel does not write any per-element scan

	// note that there is no separate reduction of the head flags, the tile flags are already calculated by the spine scan above

	if(!l) {
		p_tile_carry_out[g] = carry_out;
		p_tile_head_flags[g] = n_tile_flags;
	}
	// write carry-out and reduced tail flags (a single thread writes on behalf of the whole tile)
	// todo - do this first; probably saves barriers and also if there are no flags in this workgroup, then can use a simple scan which is faster than seg scan
}

// Carry pass of the tiled segmented scan which also emits the head flags in packed form.
// Same as TileSegScan_Carry(): each work-group reduces one tile of p_data (n_size elements in total) using
// p_head_flags (also n_size, one uint32_t per element) and writes p_tile_carry_out[g] and p_tile_head_flags[g]
// for its tile g. In addition, the head flags of the tile are written to p_packed_head_flags by
// n_Decode_HeadFlags_WritePacked(); the tile's packed words start at index (g * SCAN_TILE_SIZE) >> LOG_WARP_SIZE.
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan_Carry_Pack(__global _TyScalar *p_tile_carry_out,
	__global uint32_t *p_tile_head_flags, __global uintwarp_t *p_packed_head_flags, __global const _TyScalar *p_data,
	__global const uint32_t *p_head_flags, const unsigned int n_size) // note that this is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one flag bit per thread item must fit in the 32-bit n_my_head_flags
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // index of the first element of this tile
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements from the start of this tile to the end of the array; max() avoids unsigned underflow
	// and yields 0 if the tile starts past n_size (i.e. too many work-groups were launched)
	// not clamped to SCAN_TILE_SIZE here; NOTE(review): presumably the helpers below limit themselves to a single tile - confirm

	p_data += n_start;
	p_head_flags += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE; // one packed word per WARP_SIZE elements
	// make the pointers tile-relative

	__local union {
		THeadFlag_DecodePackStorage headflag_temp; // scratch for decoding and packing the head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // staging buffer for the ordered load
		TLocalSegScanStorage seg_scan_temp; // need seg scan
	} temp;
	// the three members share the same local memory; the barriers below separate their uses

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Decode_HeadFlags_WritePacked(p_packed_head_flags, p_head_flags,
		n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// collect head flags, write the packed form back to global memory for downsweep
	// (saves O(n - n/32) bandwidth on the headflags)
	// bit i of the result is the head flag of the i-th item of this thread (see the loop below)

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp before temp.p_shared_data is used

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (REDUCTION_IDENTITY is passed as the fill value,
	// presumably used for the items past n_remainder - confirm)

	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in

	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore; unsigned one so that bit 31 is well formed
			partial = REDUCTION_IDENTITY; // a new segment starts at item i, drop what was accumulated so far
		partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
	}
	// perform local scan; partial ends up being the reduction of the last segment touched by this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	uint32_t n_tile_flags;
	_TyScalar carry_out;
	t_LocalSegScan_CalcDist_GetTileFlags(l, n_my_head_flags != 0, partial,
		&temp.seg_scan_temp, &carry_out, &n_tile_flags);
	// spine scan; only the carry_out and n_tile_flags outputs are needed, the returned value is
	// deliberately discarded as this kernel does not write any per-element scan

	// note that there is no separate reduction of the head flags, the tile flags are already calculated by the spine scan above

	if(!l) {
		p_tile_carry_out[g] = carry_out;
		p_tile_head_flags[g] = n_tile_flags;
	}
	// write carry-out and reduced tail flags (a single thread writes on behalf of the whole tile)
	// todo - do this first; probably saves barriers and also if there are no flags in this workgroup, then can use a simple scan which is faster than seg scan
}

// Carry pass of the tiled segmented scan which takes the head flags in packed form.
// Same as TileSegScan_Carry(): each work-group reduces one tile of p_data (n_size elements in total) and writes
// p_tile_carry_out[g] and p_tile_head_flags[g] for its tile g. The head flags are only read here, from
// p_packed_head_flags via n_Read_Packed_HeadFlags(); the tile's packed words start at index
// (g * SCAN_TILE_SIZE) >> LOG_WARP_SIZE.
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan_Carry_Packed(__global _TyScalar *p_tile_carry_out,
	__global uint32_t *p_tile_head_flags, __global const _TyScalar *p_data,
	__global const uintwarp_t *p_packed_head_flags, const unsigned int n_size) // note that this is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one flag bit per thread item must fit in the 32-bit n_my_head_flags
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // index of the first element of this tile
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements from the start of this tile to the end of the array; max() avoids unsigned underflow
	// and yields 0 if the tile starts past n_size (i.e. too many work-groups were launched)
	// not clamped to SCAN_TILE_SIZE here; NOTE(review): presumably the helpers below limit themselves to a single tile - confirm

	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE; // one packed word per WARP_SIZE elements
	// make the pointers tile-relative

	__local union {
		THeadFlag_ReadStorage headflag_temp; // scratch for reading the packed head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // staging buffer for the ordered load
		TLocalSegScanStorage seg_scan_temp; // need seg scan
	} temp;
	// the three members share the same local memory; the barriers below separate their uses

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Read_Packed_HeadFlags(p_packed_head_flags,
		n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// collect head flags from the packed form (nothing is written back, p_packed_head_flags is read-only here)
	// bit i of the result is the head flag of the i-th item of this thread (see the loop below)

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp before temp.p_shared_data is used

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (REDUCTION_IDENTITY is passed as the fill value,
	// presumably used for the items past n_remainder - confirm)

	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in

	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore; unsigned one so that bit 31 is well formed
			partial = REDUCTION_IDENTITY; // a new segment starts at item i, drop what was accumulated so far
		partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
	}
	// perform local scan; partial ends up being the reduction of the last segment touched by this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	uint32_t n_tile_flags;
	_TyScalar carry_out;
	t_LocalSegScan_CalcDist_GetTileFlags(l, n_my_head_flags != 0, partial,
		&temp.seg_scan_temp, &carry_out, &n_tile_flags);
	// spine scan; only the carry_out and n_tile_flags outputs are needed, the returned value is
	// deliberately discarded as this kernel does not write any per-element scan

	// note that there is no separate reduction of the head flags, the tile flags are already calculated by the spine scan above

	if(!l) {
		p_tile_carry_out[g] = carry_out;
		p_tile_head_flags[g] = n_tile_flags;
	}
	// write carry-out and reduced tail flags (a single thread writes on behalf of the whole tile)
	// todo - do this first; probably saves barriers and also if there are no flags in this workgroup, then can use a simple scan which is faster than seg scan
}

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS

// Carry pass of the tiled segmented scan which takes the head flags in packed form; this variant is only
// built with USE_STRIDED_PACKED_HEAD_FLAGS and reads the flags using n_Read_PackedUninterleaved_HeadFlags()
// (NOTE(review): judging by the names, the input is packed in natural rather than strided order - confirm).
// Same as TileSegScan_Carry(): each work-group reduces one tile of p_data (n_size elements in total) and writes
// p_tile_carry_out[g] and p_tile_head_flags[g] for its tile g. The head flags are only read here; the tile's
// packed words start at index (g * SCAN_TILE_SIZE) >> LOG_WARP_SIZE.
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan_Carry_PackedNatural(__global _TyScalar *p_tile_carry_out,
	__global uint32_t *p_tile_head_flags, __global const _TyScalar *p_data,
	__global const uintwarp_t *p_packed_head_flags, const unsigned int n_size) // note that this is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one flag bit per thread item must fit in the 32-bit n_my_head_flags
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // index of the first element of this tile
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements from the start of this tile to the end of the array; max() avoids unsigned underflow
	// and yields 0 if the tile starts past n_size (i.e. too many work-groups were launched)
	// not clamped to SCAN_TILE_SIZE here; NOTE(review): presumably the helpers below limit themselves to a single tile - confirm

	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE; // one packed word per WARP_SIZE elements
	// make the pointers tile-relative

	__local union {
		THeadFlag_ReadStorage1 headflag_temp; // scratch for reading the packed head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // staging buffer for the ordered load
		TLocalSegScanStorage seg_scan_temp; // need seg scan
	} temp;
	// the three members share the same local memory; the barriers below separate their uses

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Read_PackedUninterleaved_HeadFlags(p_packed_head_flags,
		n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// collect head flags from the packed form (nothing is written back, p_packed_head_flags is read-only here)
	// bit i of the result is the head flag of the i-th item of this thread (see the loop below)

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp before temp.p_shared_data is used

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (REDUCTION_IDENTITY is passed as the fill value,
	// presumably used for the items past n_remainder - confirm)

	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in

	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore; unsigned one so that bit 31 is well formed
			partial = REDUCTION_IDENTITY; // a new segment starts at item i, drop what was accumulated so far
		partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
	}
	// perform local scan; partial ends up being the reduction of the last segment touched by this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	uint32_t n_tile_flags;
	_TyScalar carry_out;
	t_LocalSegScan_CalcDist_GetTileFlags(l, n_my_head_flags != 0, partial,
		&temp.seg_scan_temp, &carry_out, &n_tile_flags);
	// spine scan; only the carry_out and n_tile_flags outputs are needed, the returned value is
	// deliberately discarded as this kernel does not write any per-element scan

	// note that there is no separate reduction of the head flags, the tile flags are already calculated by the spine scan above

	if(!l) {
		p_tile_carry_out[g] = carry_out;
		p_tile_head_flags[g] = n_tile_flags;
	}
	// write carry-out and reduced tail flags (a single thread writes on behalf of the whole tile)
	// todo - do this first; probably saves barriers and also if there are no flags in this workgroup, then can use a simple scan which is faster than seg scan
}

#endif // USE_STRIDED_PACKED_HEAD_FLAGS

// Segmented inclusive scan of p_data (n_size elements) written to p_scan, using p_head_flags (also n_size,
// one uint32_t per element; a nonzero head flag starts a new segment). Each work-group scans its own tile
// of SCAN_TILE_SIZE elements independently: there is no carry-in from the preceding tiles and no carry-out
// or tile flags are written (per the note in the parameter list, this is meant for a single tile).
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan(__global _TyScalar *p_scan,
	//__global _TyScalar *p_tile_carry_out, __global uint32_t *p_tile_head_flags, // those are not needed, as this does only a single tile
	__global const _TyScalar *p_data, __global const uint32_t *p_head_flags, const unsigned int n_size) // note that this is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one flag bit per thread item must fit in the 32-bit n_my_head_flags
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // index of the first element of this tile
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements from the start of this tile to the end of the array; max() avoids unsigned underflow
	// and yields 0 if the tile starts past n_size (i.e. too many work-groups were launched)
	// not clamped to SCAN_TILE_SIZE here; NOTE(review): presumably the helpers below limit themselves to a single tile - confirm

	p_scan += n_start;
	p_data += n_start;
	p_head_flags += n_start;
	// make the pointers tile-relative

	__local union {
		THeadFlag_DecodeStorage headflag_temp; // scratch for decoding the head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // staging buffer for the ordered load / store
		TLocalSegScanStorage seg_scan_temp; // need seg scan
	} temp;
	// the three members share the same local memory; the barriers below separate their uses

	uint32_t n_my_head_flags = n_Decode_HeadFlags(p_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// collect head flags; bit i of the result is the head flag of the i-th item of this thread (see the loops below)

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp before temp.p_shared_data is used

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (REDUCTION_IDENTITY is passed as the fill value,
	// presumably used for the items past n_remainder - confirm)

	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in
	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

	p_my_scan[0] = partial;
	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore; unsigned one so that bit 31 is well formed
			partial = REDUCTION_IDENTITY; // a new segment starts at item i, drop what was accumulated so far
		partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local scan (inclusive, within this thread's items only)

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar carry_out; // required by the call below, the value is not used by this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan; the result is used as the carry-in for the items of this thread

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore
			scan = REDUCTION_IDENTITY; // the carry-in only applies up to the first head flag of this thread
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	// note that the carry-out and the reduced head flags are not written by this kernel
	// todo - if there are no flags in this workgroup, then can use a simple scan which is faster than seg scan

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_scan, temp.p_shared_data);
	// write out scan to memory
}

// Segmented inclusive scan of p_data (n_size elements) written to p_scan, with the head flags read in
// packed form from p_packed_head_flags via n_Read_Packed_HeadFlags() (the tile's packed words start at index
// (g * SCAN_TILE_SIZE) >> LOG_WARP_SIZE). Each work-group scans its own tile of SCAN_TILE_SIZE elements
// independently: there is no carry-in from the preceding tiles and no carry-out or tile flags are written
// (per the note in the parameter list, this is meant for a single tile).
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan_Packed(__global _TyScalar *p_scan,
	//__global _TyScalar *p_tile_carry_out, __global uint32_t *p_tile_head_flags, // those are not needed, as this does only a single tile
	__global const _TyScalar *p_data, __global const uintwarp_t *p_packed_head_flags, const unsigned int n_size) // note that this is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one flag bit per thread item must fit in the 32-bit n_my_head_flags
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // index of the first element of this tile
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements from the start of this tile to the end of the array; max() avoids unsigned underflow
	// and yields 0 if the tile starts past n_size (i.e. too many work-groups were launched)
	// not clamped to SCAN_TILE_SIZE here; NOTE(review): presumably the helpers below limit themselves to a single tile - confirm

	p_scan += n_start;
	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE; // one packed word per WARP_SIZE elements
	// make the pointers tile-relative

	__local union {
		THeadFlag_ReadStorage headflag_temp; // scratch for reading the packed head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // staging buffer for the ordered load / store
		TLocalSegScanStorage seg_scan_temp; // need seg scan
	} temp;
	// the three members share the same local memory; the barriers below separate their uses

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Read_Packed_HeadFlags(p_packed_head_flags,
		n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// collect head flags; bit i of the result is the head flag of the i-th item of this thread (see the loops below)

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp before temp.p_shared_data is used

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (REDUCTION_IDENTITY is passed as the fill value,
	// presumably used for the items past n_remainder - confirm)

	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in
	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

	p_my_scan[0] = partial;
	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore; unsigned one so that bit 31 is well formed
			partial = REDUCTION_IDENTITY; // a new segment starts at item i, drop what was accumulated so far
		partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local scan (inclusive, within this thread's items only)

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar carry_out; // required by the call below, the value is not used by this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan; the result is used as the carry-in for the items of this thread

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore
			scan = REDUCTION_IDENTITY; // the carry-in only applies up to the first head flag of this thread
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	// note that the carry-out and the reduced head flags are not written by this kernel
	// todo - if there are no flags in this workgroup, then can use a simple scan which is faster than seg scan

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_scan, temp.p_shared_data);
	// write out scan to memory
}

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS

// Segmented inclusive scan of p_data (n_size elements) written to p_scan, with the head flags read in
// packed form from p_packed_head_flags; this variant is only built with USE_STRIDED_PACKED_HEAD_FLAGS and
// reads the flags using n_Read_PackedUninterleaved_HeadFlags() (NOTE(review): judging by the names, the input
// is packed in natural rather than strided order - confirm). The tile's packed words start at index
// (g * SCAN_TILE_SIZE) >> LOG_WARP_SIZE. Each work-group scans its own tile of SCAN_TILE_SIZE elements
// independently: there is no carry-in from the preceding tiles and no carry-out or tile flags are written
// (per the note in the parameter list, this is meant for a single tile).
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan_PackedNatural(__global _TyScalar *p_scan,
	//__global _TyScalar *p_tile_carry_out, __global uint32_t *p_tile_head_flags, // those are not needed, as this does only a single tile
	__global const _TyScalar *p_data, __global const uintwarp_t *p_packed_head_flags, const unsigned int n_size) // note that this is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one flag bit per thread item must fit in the 32-bit n_my_head_flags
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // index of the first element of this tile
	const unsigned int n_remainder = max(n_size, n_start) - n_start;
	// number of elements from the start of this tile to the end of the array; max() avoids unsigned underflow
	// and yields 0 if the tile starts past n_size (i.e. too many work-groups were launched)
	// not clamped to SCAN_TILE_SIZE here; NOTE(review): presumably the helpers below limit themselves to a single tile - confirm

	p_scan += n_start;
	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE; // one packed word per WARP_SIZE elements
	// make the pointers tile-relative

	__local union {
		THeadFlag_ReadStorage1 headflag_temp; // scratch for reading the packed head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // staging buffer for the ordered load / store
		TLocalSegScanStorage seg_scan_temp; // need seg scan
	} temp;
	// the three members share the same local memory; the barriers below separate their uses

	uint32_t n_my_head_flags = n_Read_PackedUninterleaved_HeadFlags(p_packed_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// collect head flags; bit i of the result is the head flag of the i-th item of this thread (see the loops below)

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp before temp.p_shared_data is used

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (REDUCTION_IDENTITY is passed as the fill value,
	// presumably used for the items past n_remainder - confirm)

	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in
	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

	p_my_scan[0] = partial;
	#pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore; unsigned one so that bit 31 is well formed
			partial = REDUCTION_IDENTITY; // a new segment starts at item i, drop what was accumulated so far
		partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local scan (inclusive, within this thread's items only)

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar carry_out; // required by the call below, the value is not used by this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan; the result is used as the carry-in for the items of this thread

	#pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & ((uint32_t)1 << i)) // can't access temp.p_shared_hf anymore
			scan = REDUCTION_IDENTITY; // the carry-in only applies up to the first head flag of this thread
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	// note that the carry-out and the reduced head flags are not written by this kernel
	// todo - if there are no flags in this workgroup, then can use a simple scan which is faster than seg scan

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_scan, temp.p_shared_data);
	// write out scan to memory
}

#endif // USE_STRIDED_PACKED_HEAD_FLAGS

/**
 *	@brief downsweep pass of the tiled segmented scan, using head flags decoded by n_Decode_HeadFlags()
 *
 *	Each work-group processes one tile of SCAN_TILE_SIZE elements: it decodes the
 *	head flags, loads the data, performs a per-thread segmented scan (restarting at
 *	each head flag), combines the per-thread totals using t_LocalSegScan_CalcDist()
 *	and writes the resulting inclusive scan to p_scan.
 *
 *	@param[out] p_scan is the output array, written at the same element offsets as p_data is read
 *	@param[in] p_data is the input array
 *	@param[in] p_tile_carry_in is array of per-tile carries; work-group g > 0 seeds its
 *		scan with p_tile_carry_in[g - 1], work-group 0 starts from REDUCTION_IDENTITY
 *	@param[in] p_head_flags is the head flags array; it is advanced by the same element
 *		offset as p_data before being passed to n_Decode_HeadFlags()
 *	@param[in] n_size is the total number of elements
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan_Downsweep(__global _TyScalar *p_scan,
	//__global _TyScalar *p_tile_carry_out, __global uint32_t *p_tile_head_flags, // not needed in downsweep?
	__global const _TyScalar *p_data, __global const _TyScalar *p_tile_carry_in,
	__global const uint32_t *p_head_flags, const unsigned int n_size) // note that the scan written to p_scan is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one bit of n_my_head_flags is tested per item (note that the message says 31 while the condition permits 32)
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// number of elements from the start of this tile to the end of the data, not clamped to SCAN_TILE_SIZE (the clamped form would be min(n_size, n_start + SCAN_TILE_SIZE) - n_start); max() avoids unsigned underflow and yields 0 if n_start >= n_size (which would mean that too many work-groups were launched)

	p_scan += n_start;
	p_data += n_start;
	p_head_flags += n_start;
	// from now on all the pointers are relative to the start of this tile

	// the members of this union alias each other; barriers below separate the individual uses
	__local union {
		THeadFlag_DecodeStorage headflag_temp;
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
		TLocalSegScanStorage seg_scan_temp; // need seg scan
		//uint32_t p_shared_head_flags[SCAN_WORKGROUP_WARP_NUM]; // tail flags ballot for each warp
	} temp;

	uint32_t n_my_head_flags = n_Decode_HeadFlags(p_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// collect head flags (bit i is tested below for the i-th item of this thread)

	barrier(CLK_LOCAL_MEM_FENCE); // temp.headflag_temp is about to be reused as temp.p_shared_data

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values

	_TyScalar partial = REDUCTION_IDENTITY;
	if((!l) & (g != 0))
		partial = p_tile_carry_in[g - 1]; // convert carry ins to exclusive scan, sort of (we only care about the elems which are nonzero and those are correct, the zero ones will be zeroed out below anyway)
	// the first thread of each workgroup reads partial, except for the first workgroup

	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // head flag at item i restarts the running value (the flags are in a register, the local head flag storage has been reused by now)
			partial = REDUCTION_IDENTITY;
        partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local scan; partial ends up holding the value of the last (open) segment of this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar carry_out; // filled by t_LocalSegScan_CalcDist() but not used in this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // once a head flag is seen, the value coming from the preceding threads no longer applies
			scan = REDUCTION_IDENTITY;
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	/*n_my_head_flags = n_warp_ballot(n_my_head_flags != 0); // all the threads need to contribute!
	if(!(l & (WARP_SIZE - 1))) // the first thread of each warp
		temp.p_shared_head_flags[l >> LOG_WARP_SIZE] = n_my_head_flags;
	barrier(CLK_LOCAL_MEM_FENCE); // wait for all warp representatives to write
	if(l < SCAN_WORKGROUP_WARP_NUM)
		n_my_head_flags = n_warp_ballot(temp.p_shared_head_flags[l] != 0);*/ // if there are less than 32 warps, the other threads contribute zeros
	// reduce tail flags (don't really need the value - only zero / nonzero, could simply use atomic or)

	/*if(!l) {
		p_tile_carry_out[g] = carry_out;
		p_tile_head_flags[g] = n_my_head_flags;
	}*/
	// write carry-out and reduced tail flags // not needed here

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_scan, temp.p_shared_data);
	// write out scan to memory
}

/**
 *	@brief downsweep pass of the tiled segmented scan, using packed head flags
 *		read by n_Read_Packed_HeadFlags()
 *
 *	Each work-group processes one tile of SCAN_TILE_SIZE elements: it reads the
 *	head flags, loads the data, performs a per-thread segmented scan (restarting at
 *	each head flag), combines the per-thread totals using t_LocalSegScan_CalcDist()
 *	and writes the resulting inclusive scan to p_scan.
 *
 *	@param[out] p_scan is the output array, written at the same element offsets as p_data is read
 *	@param[in] p_data is the input array
 *	@param[in] p_tile_carry_in is array of per-tile carries; work-group g > 0 seeds its
 *		scan with p_tile_carry_in[g - 1], work-group 0 starts from REDUCTION_IDENTITY
 *	@param[in] p_packed_head_flags is the packed head flags array; it is advanced
 *		by (tile start >> LOG_WARP_SIZE) words before being read
 *	@param[in] n_size is the total number of elements
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan_Downsweep_Packed(__global _TyScalar *p_scan,
	//__global _TyScalar *p_tile_carry_out, __global uint32_t *p_tile_head_flags, // not needed in downsweep?
	__global const _TyScalar *p_data, __global const _TyScalar *p_tile_carry_in,
	__global const uintwarp_t *p_packed_head_flags, const unsigned int n_size) // note that the scan written to p_scan is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one bit of n_my_head_flags is tested per item (note that the message says 31 while the condition permits 32)
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// number of elements from the start of this tile to the end of the data, not clamped to SCAN_TILE_SIZE (the clamped form would be min(n_size, n_start + SCAN_TILE_SIZE) - n_start); max() avoids unsigned underflow and yields 0 if n_start >= n_size (which would mean that too many work-groups were launched)

	p_scan += n_start;
	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	// from now on all the pointers are relative to the start of this tile

	// the members of this union alias each other; barriers below separate the individual uses
	__local union {
		THeadFlag_ReadStorage headflag_temp;
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
		TLocalSegScanStorage seg_scan_temp; // need seg scan
		//uint32_t p_shared_head_flags[SCAN_WORKGROUP_WARP_NUM]; // tail flags ballot for each warp
	} temp;

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Read_Packed_HeadFlags(p_packed_head_flags,
		n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// grab head flags (bit i is tested below for the i-th item of this thread)

	barrier(CLK_LOCAL_MEM_FENCE); // needed :( temp.headflag_temp is about to be reused as temp.p_shared_data

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values

	_TyScalar partial = REDUCTION_IDENTITY;
	if((!l) & (g != 0))
		partial = p_tile_carry_in[g - 1]; // convert carry ins to exclusive scan, sort of (we only care about the elems which are nonzero and those are correct, the zero ones will be zeroed out below anyway)
	// the first thread of each workgroup reads partial, except for the first workgroup

	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // head flag at item i restarts the running value (the flags are in a register, the local head flag storage has been reused by now)
			partial = REDUCTION_IDENTITY;
        partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local scan; partial ends up holding the value of the last (open) segment of this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar carry_out; // filled by t_LocalSegScan_CalcDist() but not used in this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // once a head flag is seen, the value coming from the preceding threads no longer applies
			scan = REDUCTION_IDENTITY;
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	/*n_my_head_flags = n_warp_ballot(n_my_head_flags != 0); // all the threads need to contribute!
	if(!n_lane) // the first thread of each warp
		temp.p_shared_head_flags[n_warp] = n_my_head_flags;
	barrier(CLK_LOCAL_MEM_FENCE); // wait for all warp representatives to write
	if(l < SCAN_WORKGROUP_WARP_NUM)
		n_my_head_flags = n_warp_ballot(temp.p_shared_head_flags[l] != 0);*/ // if there are less than 32 warps, the other threads contribute zeros
	// reduce tail flags (don't really need the value - only zero / nonzero, could simply use atomic or)

	/*if(!l) {
		p_tile_carry_out[g] = carry_out;
		p_tile_head_flags[g] = n_my_head_flags;
	}*/
	// write carry-out and reduced tail flags // not needed here

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_scan, temp.p_shared_data);
	// write out scan to memory
}

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS

/**
 *	@brief downsweep pass of the tiled segmented scan, using packed head flags
 *		read by n_Read_PackedUninterleaved_HeadFlags()
 *
 *	Each work-group processes one tile of SCAN_TILE_SIZE elements: it reads the
 *	head flags, loads the data, performs a per-thread segmented scan (restarting at
 *	each head flag), combines the per-thread totals using t_LocalSegScan_CalcDist()
 *	and writes the resulting inclusive scan to p_scan.
 *
 *	@param[out] p_scan is the output array, written at the same element offsets as p_data is read
 *	@param[in] p_data is the input array
 *	@param[in] p_tile_carry_in is array of per-tile carries; work-group g > 0 seeds its
 *		scan with p_tile_carry_in[g - 1], work-group 0 starts from REDUCTION_IDENTITY
 *	@param[in] p_packed_head_flags is the packed head flags array; it is advanced
 *		by (tile start >> LOG_WARP_SIZE) words before being read
 *	@param[in] n_size is the total number of elements
 *
 *	@note This kernel is only compiled if USE_STRIDED_PACKED_HEAD_FLAGS is defined.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegScan_Downsweep_PackedNatural(__global _TyScalar *p_scan,
	//__global _TyScalar *p_tile_carry_out, __global uint32_t *p_tile_head_flags, // not needed in downsweep?
	__global const _TyScalar *p_data, __global const _TyScalar *p_tile_carry_in,
	__global const uintwarp_t *p_packed_head_flags, const unsigned int n_size) // note that the scan written to p_scan is *inclusive*
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // one bit of n_my_head_flags is tested per item (note that the message says 31 while the condition permits 32)
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// number of elements from the start of this tile to the end of the data, not clamped to SCAN_TILE_SIZE (the clamped form would be min(n_size, n_start + SCAN_TILE_SIZE) - n_start); max() avoids unsigned underflow and yields 0 if n_start >= n_size (which would mean that too many work-groups were launched)

	p_scan += n_start;
	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	// from now on all the pointers are relative to the start of this tile

	// the members of this union alias each other; barriers below separate the individual uses
	__local union {
		THeadFlag_ReadStorage1 headflag_temp;
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
		TLocalSegScanStorage seg_scan_temp; // need seg scan
		//uint32_t p_shared_head_flags[SCAN_WORKGROUP_WARP_NUM]; // tail flags ballot for each warp
	} temp;

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Read_PackedUninterleaved_HeadFlags(p_packed_head_flags,
		n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// grab head flags (bit i is tested below for the i-th item of this thread)

	barrier(CLK_LOCAL_MEM_FENCE); // needed :( temp.headflag_temp is about to be reused as temp.p_shared_data

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values

	_TyScalar partial = REDUCTION_IDENTITY;
	if((!l) & (g != 0))
		partial = p_tile_carry_in[g - 1]; // convert carry ins to exclusive scan, sort of (we only care about the elems which are nonzero and those are correct, the zero ones will be zeroed out below anyway)
	// the first thread of each workgroup reads partial, except for the first workgroup

	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // head flag at item i restarts the running value (the flags are in a register, the local head flag storage has been reused by now)
			partial = REDUCTION_IDENTITY;
        partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local scan; partial ends up holding the value of the last (open) segment of this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish UNINTERLEAVE_LOCAL_TO_REGISTER() before reusing the local storage

	_TyScalar carry_out; // filled by t_LocalSegScan_CalcDist() but not used in this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // once a head flag is seen, the value coming from the preceding threads no longer applies
			scan = REDUCTION_IDENTITY;
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	/*n_my_head_flags = n_warp_ballot(n_my_head_flags != 0); // all the threads need to contribute!
	if(!n_lane) // the first thread of each warp
		temp.p_shared_head_flags[n_warp] = n_my_head_flags;
	barrier(CLK_LOCAL_MEM_FENCE); // wait for all warp representatives to write
	if(l < SCAN_WORKGROUP_WARP_NUM)
		n_my_head_flags = n_warp_ballot(temp.p_shared_head_flags[l] != 0);*/ // if there are less than 32 warps, the other threads contribute zeros
	// reduce tail flags (don't really need the value - only zero / nonzero, could simply use atomic or)

	/*if(!l) {
		p_tile_carry_out[g] = carry_out;
		p_tile_head_flags[g] = n_my_head_flags;
	}*/
	// write carry-out and reduced tail flags // not needed here

	REGISTER_TO_GLOBAL_ORDERED_CFI_COND(CFI_FLAG, p_scan, n_remainder, l,
		SCAN_LOCAL_WORK_SIZE, SCAN_ITEMS_THREAD, p_my_scan, temp.p_shared_data);
	// write out scan to memory
}

#endif // USE_STRIDED_PACKED_HEAD_FLAGS

#endif // BUILD_SEG_SCAN_KERNELS

// borrowed from ScanReducev3.c ... ugh. make a header for this
#if defined(NVIDIA) && (REDUCTION_REDUCE_OPERATOR == '+' || REDUCTION_REDUCE_OPERATOR == '-' || \
	REDUCTION_REDUCE_OPERATOR == '*') && SCAN_SCALAR_SIZE == 4 && !defined(DISABLE_NV_SHFL)// && b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)

#pragma message("building the NV SHFL branch")

#include "../kernel_utils/NV30.h"

#if REDUCTION_REDUCE_OPERATOR == '+'

/**
 *	@brief "overloaded" shuffle step for the current scalar type and the '+' operator
 *
 *	Forwards to the function whose name CONCAT() builds from shfl_add, an underscore
 *	and SCAN_SCALAR_TYPE.
 *
 *	@param[in] x is the value held by this thread
 *	@param[in] offset is the shuffle offset, passed through unchanged
 *	@param[in] width is the shuffle width, passed through unchanged
 *
 *	@return Returns the value returned by the type-specific function.
 */
inline _TyScalar shfl_reduce(_TyScalar x, int offset, int width)
{
	const _TyScalar t_combined = CONCAT(shfl_add, CONCAT(_, EXPAND(SCAN_SCALAR_TYPE)))(x, offset, width);
	return t_combined;
}

#elif REDUCTION_REDUCE_OPERATOR == '-'

/**
 *	@brief "overloaded" shuffle step for the current scalar type and the '-' operator
 *
 *	Forwards to the function whose name CONCAT() builds from shfl_sub, an underscore
 *	and SCAN_SCALAR_TYPE.
 *
 *	@param[in] x is the value held by this thread
 *	@param[in] offset is the shuffle offset, passed through unchanged
 *	@param[in] width is the shuffle width, passed through unchanged
 *
 *	@return Returns the value returned by the type-specific function.
 */
inline _TyScalar shfl_reduce(_TyScalar x, int offset, int width)
{
	const _TyScalar t_combined = CONCAT(shfl_sub, CONCAT(_, EXPAND(SCAN_SCALAR_TYPE)))(x, offset, width);
	return t_combined;
}

#elif REDUCTION_REDUCE_OPERATOR == '*'

/**
 *	@brief "overloaded" shuffle step for the current scalar type and the '*' operator
 *
 *	Forwards to the function whose name CONCAT() builds from shfl_mul, an underscore
 *	and SCAN_SCALAR_TYPE.
 *
 *	@param[in] x is the value held by this thread
 *	@param[in] offset is the shuffle offset, passed through unchanged
 *	@param[in] width is the shuffle width, passed through unchanged
 *
 *	@return Returns the value returned by the type-specific function.
 */
inline _TyScalar shfl_reduce(_TyScalar x, int offset, int width)
{
	const _TyScalar t_combined = CONCAT(shfl_mul, CONCAT(_, EXPAND(SCAN_SCALAR_TYPE)))(x, offset, width);
	return t_combined;
}

#endif // REDUCTION_REDUCE_OPERATOR == '+'

/**
 *	@brief local memory scratch space for t_LocalReduce() (shuffle branch)
 *	@note Only WARP_SIZE elements are needed here; t_LocalReduce() stores one
 *		value per section in it.
 */
typedef struct {
	_TyScalar p_storage[WARP_SIZE];
} TLocalReduceStorage;

/**
 *	@brief reduces one value per thread of the workgroup to a single value (shuffle branch)
 *
 *	The threads are divided to n_section_num sections of n_section_size consecutive
 *	threads. Each section is combined using shfl_reduce(), the last thread of each
 *	section stores its value to local memory, the first n_section_num threads then
 *	combine the stored values the same way and the result is taken from the last slot.
 *
 *	@param[in] l is local thread id
 *	@param[in] x is the value contributed by this thread
 *	@param[in,out] p_storage is pointer to local memory scratch space
 *
 *	@return Returns the value stored in slot n_section_num - 1 after the second stage
 *		(the same for all the threads, the read is preceded by a barrier).
 *
 *	@note This contains barriers and so needs to be called by all the threads of the workgroup.
 *	@note NOTE(review): presumably shfl_reduce() leaves the total in the last lane
 *		of each section - confirm in NV30.h.
 */
inline _TyScalar t_LocalReduce(const unsigned int l, _TyScalar x, __local TLocalReduceStorage *p_storage)
{
	//STATIC_ASSERT(b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE), NVIDIA_SHUFFLE_REDUCE_ONLY_WORKS_WITH_POWER_OF_TWO_WORKGROUPS);

	enum {
#if b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
		n_section_num = WARP_SIZE,
		n_section_size = REDUCE_LOCAL_WORK_SIZE / n_section_num,
#else // b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
		n_section_num = REDUCE_LOCAL_WORK_SIZE / WARP_SIZE,
		n_section_size = WARP_SIZE,
#endif // b_Is_POT(REDUCE_LOCAL_WORK_SIZE / WARP_SIZE)
		n_section_size_log2 = n_Log2(n_section_size)
	};
	// want to reduce the tile to at most WARP_SIZE items and then finish those in a single pass

	__local _TyScalar *p_section_totals = p_storage->p_storage;
	const int n_section_id = l >> n_section_size_log2;
	const int n_section_lane = l & (n_section_size - 1);
	// which section this thread belongs to and where in it

	#pragma unroll
	for(int n_step = 1; n_step < n_section_size; n_step *= 2)
		x = shfl_reduce(x, n_step, n_section_size);
	// first stage: combine the values inside each section

	if(n_section_lane == n_section_size - 1)
		p_section_totals[n_section_id] = x;
	// the last thread of each section publishes the section value

	barrier(CLK_LOCAL_MEM_FENCE); // wait for all the sections to publish

	if(l < n_section_num) {
		x = p_section_totals[l];
		#pragma unroll
		for(int n_step = 1; n_step < n_section_num; n_step *= 2)
			x = shfl_reduce(x, n_step, n_section_num);
		p_section_totals[l] = x;
	}
	// second stage: combine the section values

	barrier(CLK_LOCAL_MEM_FENCE); // wait for the second stage to be written

	return p_section_totals[n_section_num - 1];
}

#else // NVIDIA && (REDUCTION_REDUCE_OPERATOR == '+' || REDUCTION_REDUCE_OPERATOR == '-' ||
	  // REDUCTION_REDUCE_OPERATOR == '*') && SCAN_SCALAR_SIZE == 4 && !DISABLE_NV_SHFL

#pragma message("building the generic reduce branch")

/**
 *	@brief local memory scratch space for t_LocalReduce() (generic branch)
 *	@note One element per thread; t_LocalReduce() stores the value of thread l at index l.
 */
typedef struct {
	_TyScalar p_storage[REDUCE_LOCAL_WORK_SIZE];
} TLocalReduceStorage;

/**
 *	@brief reduces one value per thread of the workgroup to a single value,
 *		using local memory (generic branch)
 *
 *	@param[in] l is local thread id (must be less than REDUCE_LOCAL_WORK_SIZE)
 *	@param[in] x is the value contributed by this thread
 *	@param[in,out] p_storage is pointer to local memory scratch space
 *		(REDUCE_LOCAL_WORK_SIZE elements, all of them are overwritten)
 *
 *	@return Returns the value read from p_storage->p_storage[0] after the reduction.
 *
 *	@note The result is left in p_storage->p_storage[0].
 *	@note This version uses t_ReductionReduceOp() operation and _TyScalar.
 *	@note For workgroups larger than a warp this contains barriers and so
 *		needs to be called by all the threads of the workgroup.
 *	@note NOTE(review): there is no barrier between the last (fence-only) stage
 *		and the final read of element 0; threads which did not take part in the
 *		last stage may not observe the final value. The caller visible in this
 *		file only uses the value returned to thread 0 - confirm for other callers.
 */
inline _TyScalar t_LocalReduce(const unsigned int l, _TyScalar x, __local TLocalReduceStorage *p_storage)
{
	__local _TyScalar *p_sh_mem = p_storage->p_storage; // ...

	p_sh_mem[l] = x; // store

#if REDUCE_LOCAL_WORK_SIZE <= 2048 && b_Is_POT(REDUCE_LOCAL_WORK_SIZE)
#if REDUCE_LOCAL_WORK_SIZE >= 2048
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 1024)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 1024]);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 1024
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 512)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 512]);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 512
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 256)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 256]);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 256
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 128)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 128]);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 128
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 64)
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 64]);
#endif
	// reduce down to 64 elements stored in the shared memory (halved once more at the beginning of the section below)

	// the below section is within a single warp, want to avoid divergence
	// even though unneccessary reductions are made no barriers required,
	// just a memory fence to avoid compiler optimization
	// the guard is always half of the number of the remaining elements
	// so that the largest index read (l + guard) stays inside of the array
#if REDUCE_LOCAL_WORK_SIZE > 32
	barrier(CLK_LOCAL_MEM_FENCE);
	if(l < 32) {
#elif REDUCE_LOCAL_WORK_SIZE > 16
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 16) {
#elif REDUCE_LOCAL_WORK_SIZE > 8
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 8) {
#elif REDUCE_LOCAL_WORK_SIZE > 4
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 4) {
#elif REDUCE_LOCAL_WORK_SIZE > 2
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 2) {
#elif REDUCE_LOCAL_WORK_SIZE > 1
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	if(l < 1) { // with two threads only the first one may combine, the second one would read p_sh_mem[2] which is out of bounds
#else
	write_mem_fence(CLK_LOCAL_MEM_FENCE);
	{
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 64
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 32]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 32
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 16]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 16
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 8]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 8
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 4]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 4
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 2]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif
#if REDUCE_LOCAL_WORK_SIZE >= 2
		p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + 1]);
		write_mem_fence(CLK_LOCAL_MEM_FENCE); // !! so that all the threads have the same value
#endif
	}
	// reduce results down to one result per work group
#else // REDUCE_LOCAL_WORK_SIZE <= 2048 && b_Is_POT(REDUCE_LOCAL_WORK_SIZE)
	enum {
		n_first = n_Make_Lower_POT(REDUCE_LOCAL_WORK_SIZE),
		n_first_warp = (n_first > WARP_SIZE)? WARP_SIZE : n_first
	};
	// n_first is the offset of the first (irregular) step, n_first_warp is the offset of the first fence-only step

	#pragma unroll
	for(int offset = n_first; offset > n_first_warp; offset /= 2) {
		int size = (offset == n_first)? REDUCE_LOCAL_WORK_SIZE : offset * 2; // number of elements still alive before this step
		barrier(CLK_LOCAL_MEM_FENCE);
		if(l < size - offset)
			p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + offset]);
	}
	// steps that span more than a single warp, need barriers

	if(REDUCE_LOCAL_WORK_SIZE > WARP_SIZE)
		barrier(CLK_LOCAL_MEM_FENCE);

	if(l < n_first_warp) {
		#pragma unroll
		for(int offset = n_first_warp; offset > 0; offset /= 2) {
			int size = (offset == n_first)? REDUCE_LOCAL_WORK_SIZE : offset * 2; // not "offset == n_first_warp" but rather "offset == n_first_warp && n_first_warp == n_first" or for short "offset == n_first"
			write_mem_fence(CLK_LOCAL_MEM_FENCE);
			if(n_first > n_first_warp || l < size - offset) // skip the condition for the last warp if the shared mem is large enough
				p_sh_mem[l] = t_ReductionReduceOp(p_sh_mem[l], p_sh_mem[l + offset]);
			// the condition can only be skipped if the loop above already ran (n_first > n_first_warp); then there
			// are at least 2 * n_first_warp elements and l + offset stays in bounds. otherwise the irregular first
			// step happens here and unguarded threads would read past the end of the array and fold the garbage
			// to the result (the former test "n_first_warp < REDUCE_LOCAL_WORK_SIZE" was always true in this branch)
		}
	}

	write_mem_fence(CLK_LOCAL_MEM_FENCE);
#endif // REDUCE_LOCAL_WORK_SIZE <= 2048 && b_Is_POT(REDUCE_LOCAL_WORK_SIZE)

	return p_sh_mem[0];
}

#endif // NVIDIA && (REDUCTION_REDUCE_OPERATOR == '+' || REDUCTION_REDUCE_OPERATOR == '-' ||
	   // REDUCTION_REDUCE_OPERATOR == '*') && SCAN_SCALAR_SIZE == 4 && !DISABLE_NV_SHFL

#ifdef BUILD_SEG_REDUCE_KERNELS

// design note: want tile reduce to be two phase, preparation / allocation and numerical
// in the preparation, the segment flags are read, tail flags are calculated and tail counts are determined and tile reduced
//		could also write the tile carry
// then tile tail counts are global scanned to get the writing addresses for the numerical phase
// then tile carry is global seg-scanned to get segment starts
//		this could be fixed directly in the reductions, saving O(N) reads
//		each tile segscan only contributes to up to one result, not worth reading all the data because of it
// in the numerical phase, the segment flags are read, tile tail counts are read and based on tail counts either tile reduce or seg tile reduce

/**
 *	@brief segmented reduction of a single tile, using head flags decoded by n_Decode_HeadFlags_OneMore()
 *
 *	Each work-group processes one tile of SCAN_TILE_SIZE elements. The head flags
 *	shifted by one are used as tail flags; a reduced value is written to p_reductions
 *	for every tail flag in the tile. If the tile contains no tail flags, a plain
 *	(non-segmented) reduction is used instead. The tile sum is written in both cases.
 *
 *	@param[out] p_reductions is the output array of segment reductions; this work-group
 *		writes starting at the offset p_tail_counts_scan[g]
 *	@param[out] p_tile_sums is the output array of tile sums; element g is written by thread 0
 *	@param[in] p_tail_counts_scan is array of write offsets to p_reductions, one per tile
 *		(exclusive scan of tail counts, does not need to be one larger)
 *	@param[in] p_data is the input array
 *	@param[in] p_head_flags is the head flags array; it is advanced by the same element
 *		offset as p_data before being passed to n_Decode_HeadFlags_OneMore()
 *	@param[in] n_size is the total number of elements
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce(
	__global _TyScalar *p_reductions, __global _TyScalar *p_tile_sums,
	__global const uint32_t *p_tail_counts_scan, __global const _TyScalar *p_data, // p_tail_counts_scan is exclusive and does not need to be one larger
	__global const uint32_t *p_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var: bit SCAN_ITEMS_THREAD of n_my_head_flags_1 is used below as well
	// if >32, would have to use 64-bit type for n_my_head_flags_1 (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// number of elements from the start of this tile to the end of the data, not clamped to SCAN_TILE_SIZE (the clamped form would be min(n_size, n_start + SCAN_TILE_SIZE) - n_start); max() avoids unsigned underflow and yields 0 if n_start >= n_size (which would mean that too many work-groups were launched)

	p_data += n_start;
	p_head_flags += n_start;
	p_reductions += p_tail_counts_scan[g]; // need that much later
	// from now on all the pointers are relative to where this tile reads / writes

	// a copy of the members of temp below, except for p_shared_dest; only used to take sizeof()
	typedef union {
		THeadFlag_DecodeStorage_OneMore headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails
	} TOtherData;

	enum {
		n_other_data_size = sizeof(TOtherData) / sizeof(_TyScalar), // how many scalars fit in the storage that is needed anyway
		n_coop_write_min_size = (SCAN_TILE_SIZE + 1) / 2, // want to be able to stage at least half a tile of results
		n_max_coop_write_size = (n_other_data_size > n_coop_write_min_size)? n_other_data_size : n_coop_write_min_size // the larger of the two
	};

	// the members of this union alias each other; barriers below separate the individual uses
	__local union {
		THeadFlag_DecodeStorage_OneMore headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails

		_TyScalar p_shared_dest[n_max_coop_write_size]; // needed to cooperatively write out the scans
		// this is likely the biggest array in the pack (it is not, p_shared_hf and p_shared_data are just as big, unless those are unused)
	} temp;

	uint32_t n_my_head_flags_1 = n_Decode_HeadFlags_OneMore(p_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// use a modified function; bit i is the head flag of the i-th item of this thread and bit i + 1 is used as its tail flag

	if((l == SCAN_LOCAL_WORK_SIZE - 1) & (n_remainder <= SCAN_TILE_SIZE /*|| p_head_flags[SCAN_TILE_SIZE] != 0*/)) // the first condition not lazily evaluated, the second one needs to be
		n_my_head_flags_1 |= 1 << SCAN_ITEMS_THREAD; // or with a constant
	// the data end in this tile: force a tail flag on the last item of the last thread
	// item with index SCAN_TILE_SIZE maps to the last thread // no need to read p_head_flags[SCAN_TILE_SIZE], that would already have been done in n_Decode_HeadFlags_OneMore()

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.headflag_temp ----------

	const int n_my_tail_num = n_popc32(n_my_head_flags_1 >> 1);
	// count tail flags to see how much do I write

	int n_workgroup_write_num;
	int n_my_write_pos = n_LocalExScan_Int(l, n_my_tail_num,
		&n_workgroup_write_num, &temp.int_scan_temp);
	// calculate where each thread writes; n_workgroup_write_num is tested below as the number of results of the whole tile

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.int_scan_temp ----------

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values

	_TyScalar sum; // the tile sum, written to p_tile_sums[g] at the end
	if(!n_workgroup_write_num) { // convergent branch
		// use a simple tile reduce, there are no segment boundaries here (fast path for long segments)

		_TyScalar partial = t_ReductionElemOp(p_my_data[0]);
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		// perform thread-local reduction

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		sum = t_LocalReduce(l, partial, &temp.reduce_temp);
		// only the value in thread 0 is used below
	} else {
		_TyScalar partial = t_ReductionElemOp(p_my_data[0]), p_my_scan[SCAN_ITEMS_THREAD]; // ignore the first head flag since there is no carry-in
		p_my_scan[0] = partial;
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
			if(n_my_head_flags_1 & (1 << i)) // head flag at item i restarts the running value (the flags are in a register, the local head flag storage has been reused by now)
				partial = REDUCTION_IDENTITY;
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
			p_my_scan[i] = partial;
		}
		// perform thread-local scan
		// partial holds this thread sum

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		_TyScalar scan = t_LocalSegScan_CalcDist(l, (n_my_head_flags_1 &
			n_Mask_32(SCAN_ITEMS_THREAD)) != 0, partial, &temp.seg_scan_temp, &sum);
		// use segmented tile reduce (the "one more" flag bit is masked out, the carry-out goes to sum)

		if(n_my_head_flags_1 & 1) // head flag on the first item of this thread
			scan = REDUCTION_IDENTITY;
		// zero the scan if the first head flag is set

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.seg_scan_temp ----------

		if(n_workgroup_write_num <= n_max_coop_write_size) { // convergent branch
			// write it to p_shared_dest first, then cooperatively write that to global memory

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					temp.p_shared_dest[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY;
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all

			barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to fill temp.p_shared_dest

			for(unsigned int i = l; i < n_workgroup_write_num; i += SCAN_LOCAL_WORK_SIZE)
				p_reductions[i] = temp.p_shared_dest[i];
			// write it to global memory cooperatively (loops not unrolled though, the number
			// of writes depends on the values on the flags and is relatively unbounded; could
			// likely use Duff's device)
		} else {
			// too much data to hold on local memory, write it to global directly (likely not the
			// worst waste as the writes would be quite dense and so almost coalesced)

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					p_reductions[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY;
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all, write it to global memory immediately (causes uncoalesced writes)
		}
	}
	// reduce, write reductions under the tail flags

	if(!l)
		p_tile_sums[g] = sum;
	// write tile sum always
}

__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_Packed(
	__global _TyScalar *p_reductions, __global _TyScalar *p_tile_sums,
	__global const uint32_t *p_tail_counts_scan, __global const _TyScalar *p_data, // p_tail_counts_scan is exclusive and does not need to be one larger
	__global const uintwarp_t *p_packed_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// if >32, would have to use 64-bit type for n_my_head_flags_1 (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// = min(n_size, n_start + SCAN_TILE_SIZE) - n_start; // data size is always smaller than start (if not then we launched too many work-groups)

	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	p_reductions += p_tail_counts_scan[g]; // need that much later

	typedef union {
		THeadFlag_ReadStorage_OneMore headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails
	} TOtherData;

	enum {
		n_other_data_size = sizeof(TOtherData) / sizeof(_TyScalar),
		n_coop_write_min_size = (SCAN_TILE_SIZE + 1) / 2,
		n_max_coop_write_size = (n_other_data_size > n_coop_write_min_size)? n_other_data_size : n_coop_write_min_size
	};

	__local union {
		THeadFlag_ReadStorage_OneMore headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails

		_TyScalar p_shared_dest[n_max_coop_write_size]; // needed to cooperatively write out the scans
		// this is likely the biggest array in the pack (it is not, p_shared_hf and p_shared_data are just as big, unless those are unused)
	} temp;

	uint32_t n_my_head_flags_1 = n_Read_Packed_HeadFlags_OneMore(p_packed_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// use a modified function

	if((l == SCAN_LOCAL_WORK_SIZE - 1) && (n_remainder <= SCAN_TILE_SIZE /*||
	   (p_packed_head_flags[SCAN_WORKGROUP_WARP_NUM] & 1) != 0*/)) // the first condition not lazily evaluated, the second one needs to be
		n_my_head_flags_1 |= 1 << SCAN_ITEMS_THREAD; // or with a constant
	// item with index SCAN_TILE_SIZE maps to the last thread // no need to read (p_packed_head_flags[SCAN_WORKGROUP_WARP_NUM] & 1) != 0, that would already have been done in n_Decode_HeadFlags_OneMore()

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_hf ----------

	const int n_my_tail_num = n_popc32(n_my_head_flags_1 >> 1);
	// count tail flags to see how much do I write

	int n_workgroup_write_num;
	int n_my_write_pos = n_LocalExScan_Int(l, n_my_tail_num,
		&n_workgroup_write_num, &temp.int_scan_temp);
	// calculate where each thread writes

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.int_scan_temp ----------

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values

	_TyScalar sum;
	if(!n_workgroup_write_num) { // convergent branch
		// use a simple tile reduce, there are no segment boundaries here (fast path for long segments)

		_TyScalar partial = t_ReductionElemOp(p_my_data[0]);
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		// perform thread-local reduction

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		sum = t_LocalReduce(l, partial, &temp.reduce_temp);
	} else {
		_TyScalar partial = t_ReductionElemOp(p_my_data[0]), p_my_scan[SCAN_ITEMS_THREAD]; // ignore the first head flag since there is no carry-in
		p_my_scan[0] = partial;
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
			if(n_my_head_flags_1 & (1 << i)) // can't access temp.p_shared_hf anymore
				partial = REDUCTION_IDENTITY;
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
			p_my_scan[i] = partial;
		}
		// perform thread-local scan
		// partial holds this thread sum

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		_TyScalar scan = t_LocalSegScan_CalcDist(l, (n_my_head_flags_1 &
			n_Mask_32(SCAN_ITEMS_THREAD)) != 0, partial, &temp.seg_scan_temp, &sum);
		// use segmented tile reduce

		if(n_my_head_flags_1 & 1) // can't access temp.p_shared_hf anymore
			scan = REDUCTION_IDENTITY;
		// zero the scan if the first head flag is set

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.seg_scan_temp ----------

		if(n_workgroup_write_num <= n_max_coop_write_size) { // convergent branch
			// write it to p_shared_dest first, then cooperatively write that to global memory

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					temp.p_shared_dest[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY;
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all

			barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to fill temp.p_shared_dest

			for(unsigned int i = l; i < n_workgroup_write_num; i += SCAN_LOCAL_WORK_SIZE)
				p_reductions[i] = temp.p_shared_dest[i];
			// write it to global memory cooperatively (loops not unrolled though, the number
			// of writes depends on the values on the flags and is relatively unbounded; could
			// likely use Duff's device)
		} else {
			// too much data to hold on local memory, write it to global directly (likely not the
			// worst waste as the writes would be quite dense and so almost coalesced)

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					p_reductions[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY;
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all, write it to global memory immediately (causes uncoalesced writes)
		}
	}
	// reduce, write reductions under the tail flags

	if(!l)
		p_tile_sums[g] = sum;
	// write tile sum always
}

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS

/**
 *	@brief segmented reduction of a single tile, with the head flags read from a packed
 *		array through n_Read_PackedUninterleaved_HeadFlags_OneMore()
 *
 *	Each work-group handles the tile of SCAN_TILE_SIZE elements that starts at
 *	get_group_id(0) * SCAN_TILE_SIZE. For every tail flag found in the tile, one reduced
 *	value is written to p_reductions (starting at offset p_tail_counts_scan[g]), and local
 *	thread 0 always writes one value to p_tile_sums[g].
 *
 *	All the temporaries live in a single __local union, so every phase is separated from
 *	the next one by a barrier; the order of the statements must not be changed.
 *
 *	@param[out] p_reductions is the destination for the reductions written under the tail flags
 *	@param[out] p_tile_sums is the destination for the tile sums (one element per work-group)
 *	@param[in] p_tail_counts_scan is per-tile offset into p_reductions (exclusive scan, does
 *		not need to be one element larger)
 *	@param[in] p_data is the array of values to be reduced
 *	@param[in] p_packed_head_flags is the array of packed head flags (this work-group starts
 *		reading at word (g * SCAN_TILE_SIZE) >> LOG_WARP_SIZE)
 *	@param[in] n_size is number of elements (the tile remainder is calculated from this)
 *
 *	@note The exact semantics of the helper functions and macros used (flag reading, local
 *		scans / reductions, ordered load) are defined elsewhere - TODO confirm against those.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_PackedNatural(
	__global _TyScalar *p_reductions, __global _TyScalar *p_tile_sums,
	__global const uint32_t *p_tail_counts_scan, __global const _TyScalar *p_data, // p_tail_counts_scan is exclusive and does not need to be one larger
	__global const uintwarp_t *p_packed_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// if >32, would have to use 64-bit type for n_my_head_flags_1 (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// = min(n_size, n_start + SCAN_TILE_SIZE) - n_start; // n_start is expected to be smaller than n_size (if not then we launched too many work-groups); the max() keeps the unsigned subtraction from underflowing in that case
	// note that the remainder is not clamped to SCAN_TILE_SIZE; n_remainder <= SCAN_TILE_SIZE
	// means that the data end inside (or exactly at the end of) this tile

	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	p_reductions += p_tail_counts_scan[g]; // need that much later
	// from now on, all the pointers are relative to this tile

	__local union {
		THeadFlag_ReadStorage1 headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails

		_TyScalar p_shared_dest[(SCAN_TILE_SIZE + 1) / 2]; // needed to cooperatively write out the scans
		// this is likely the biggest array in the pack (it is not, headflag_temp and p_shared_data may be just as big, unless those are unused)
	} temp;
	// the members alias each other, hence the barriers between the individual phases below

	enum {
		n_max_coop_write_size = sizeof(temp.p_shared_dest) / sizeof(temp.p_shared_dest[0]) // so that we can change the size of the array above
	};

	uint32_t n_my_head_flags_1 = n_Read_PackedUninterleaved_HeadFlags_OneMore(p_packed_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// use a modified function
	// bit i is used below as the head flag of i-th item of this thread and bit (i + 1)
	// as its tail flag (so bit SCAN_ITEMS_THREAD is the "one more" flag)

	if((l == SCAN_LOCAL_WORK_SIZE - 1) & (n_remainder <= SCAN_TILE_SIZE /*|| (p_packed_head_flags[SCAN_WORKGROUP_WARP_NUM] & 1) != 0*/)) // both conditions are cheap and side effect free so a non-lazy bitwise and is fine here (the commented-out one would have to be evaluated lazily as it reads memory)
		n_my_head_flags_1 |= 1 << SCAN_ITEMS_THREAD; // or with a constant
	// item with index SCAN_TILE_SIZE maps to the last thread; if the data end in this tile, force a tail flag there
	// no need to read (p_packed_head_flags[SCAN_WORKGROUP_WARP_NUM] & 1) != 0 here, presumably the flag reading function above already did that - TODO confirm

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.headflag_temp ----------

	const int n_my_tail_num = n_popc32(n_my_head_flags_1 >> 1);
	// count tail flags to see how much do I write (the shift drops the head flag of the first item)

	int n_workgroup_write_num;
	int n_my_write_pos = n_LocalExScan_Int(l, n_my_tail_num,
		&n_workgroup_write_num, &temp.int_scan_temp);
	// calculate where each thread writes (n_workgroup_write_num is filled by the scan
	// function, it is used below as the number of values written by the whole work-group)

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.int_scan_temp ----------

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values

	_TyScalar sum;
	// assigned in both of the branches below (directly or through the pointer argument)
	if(!n_workgroup_write_num) { // convergent branch
		// use a simple tile reduce, there are no segment boundaries here (fast path for long segments)

		_TyScalar partial = t_ReductionElemOp(p_my_data[0]);
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		// perform thread-local reduction

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		sum = t_LocalReduce(l, partial, &temp.reduce_temp);
	} else {
		_TyScalar partial = t_ReductionElemOp(p_my_data[0]), p_my_scan[SCAN_ITEMS_THREAD]; // ignore the first head flag since there is no carry-in
		p_my_scan[0] = partial;
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
			if(n_my_head_flags_1 & (1 << i)) // flags are in a register, temp.headflag_temp can't be accessed anymore
				partial = REDUCTION_IDENTITY;
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
			p_my_scan[i] = partial;
		}
		// perform thread-local scan (restarted at each head flag)
		// partial holds this thread sum

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		_TyScalar scan = t_LocalSegScan_CalcDist(l, (n_my_head_flags_1 &
			n_Mask_32(SCAN_ITEMS_THREAD)) != 0, partial, &temp.seg_scan_temp, &sum);
		// use segmented tile reduce (the mask drops the "one more" flag which does not belong to this thread's items)

		if(n_my_head_flags_1 & 1) // flags are in a register, temp.headflag_temp can't be accessed anymore
			scan = REDUCTION_IDENTITY;
		// zero the scan if the first head flag is set

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.seg_scan_temp ----------

		if(n_workgroup_write_num <= n_max_coop_write_size) { // convergent branch
			// write it to p_shared_dest first, then cooperatively write that to global memory

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					temp.p_shared_dest[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY; // the carry-in only applies up to the first tail
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all

			barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to fill temp.p_shared_dest

			for(unsigned int i = l; i < n_workgroup_write_num; i += SCAN_LOCAL_WORK_SIZE)
				p_reductions[i] = temp.p_shared_dest[i];
			// write it to global memory cooperatively (loops not unrolled though, the number
			// of writes depends on the values on the flags and is relatively unbounded; could
			// likely use Duff's device)
		} else {
			// too much data to hold on local memory, write it to global directly (likely not the
			// worst waste as the writes would be quite dense and so almost coalesced)

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					p_reductions[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY; // the carry-in only applies up to the first tail
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all, write it to global memory immediately (causes uncoalesced writes)
		}
	}
	// reduce, write reductions under the tail flags

	if(!l)
		p_tile_sums[g] = sum;
	// write tile sum always
}

#endif // USE_STRIDED_PACKED_HEAD_FLAGS

// note that the head flags are a major source of bandwidth now; this kernel should pack them into a bit array here (only need to check that the decoding logic wouldn't get too complex)
// p_tile_head_flags are needed for the subsequent segmented scan in case the input is larger than a single tile (could have a version of the kernel without it, for very small reductions)
/**
 *	@brief counts the tail flags of each tile and records whether the tile contains head flags;
 *		the head flags are decoded by n_Decode_HeadFlags()
 *
 *	Each work-group handles the tile of SCAN_TILE_SIZE elements that starts at
 *	get_group_id(0) * SCAN_TILE_SIZE. Local thread 0 writes the two results.
 *
 *	The tails are counted as heads: every head flag except for the one of the very first
 *	element of the tile counts as a tail, plus one extra tail if the data end in this tile
 *	or if the first element of the next tile has its head flag set.
 *
 *	@param[out] p_tail_counts is the destination for the tail counts (one element per work-group)
 *	@param[out] p_tile_head_flags is the destination for the tile head flags (one element
 *		per work-group; only zero / nonzero is of interest, see the comments below)
 *	@param[in] p_head_flags is the array of head flags, one uint32_t per element (nonzero is
 *		tested for the element following the tile)
 *	@param[in] n_size is number of elements (the tile remainder is calculated from this)
 *
 *	@note The order of the statements in the final block is deliberate, see the remark
 *		about K40 at the end of the function.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_Bootstrap(__global uint32_t *p_tail_counts,
	__global uint32_t *p_tile_head_flags, __global const uint32_t *p_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// if >32, would have to use 64-bit type for n_my_head_flags_1 and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// = min(n_size, n_start + SCAN_TILE_SIZE) - n_start; // n_start is expected to be smaller than n_size (if not then we launched too many work-groups); the max() keeps the unsigned subtraction from underflowing in that case
	// note that the remainder is not clamped to SCAN_TILE_SIZE; n_remainder <= SCAN_TILE_SIZE
	// means that the data end inside (or exactly at the end of) this tile

	p_head_flags += n_start;
	// from now on, the pointer is relative to this tile

	__local union {
		THeadFlag_DecodeStorage headflag_temp;
		TLocalReduceStorage_Int reduce_temp; // need reduction
		uintwarp_t p_shared_head_flags[SCAN_WORKGROUP_WARP_NUM]; // head flags ballot for each warp
	} temp;
	// the members alias each other, hence the barriers between the individual phases below

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Decode_HeadFlags(p_head_flags, n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// read up to SCAN_TILE_SIZE head flags, use the nice function

	// each thread counts tails, which are heads shifted right + the first head from the next tile
	// this is the same as counting heads, with the exception in the first thread

	uint32_t n_my_tail_flags = n_my_head_flags;
	//n_my_tail_flags &= ~(uint32_t)((!l)? 1 : 0); // ignore the first head flag in the first thread (makes a difference in other than the first tiles)
	n_my_tail_flags >>= (!l)? 1 : 0; // maybe easier
	// the least significant bit is the first head flag (1 << 0), the rest can be interpreted as tail flags

	int n_tail_flag_num = n_popc32(n_my_tail_flags);
	// count tail flags of this thread

	if((!l) && (n_remainder <= SCAN_TILE_SIZE || p_head_flags[SCAN_TILE_SIZE] != 0)) // the || must stay lazy: p_head_flags[SCAN_TILE_SIZE] is only read if there are data past this tile
		++ n_tail_flag_num;
	// only the first thread reads p_head_flags[SCAN_TILE_SIZE] to avoid larger than needed memory traffic

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp

	int n_tile_tail_num = n_LocalReduce_Int(l, n_tail_flag_num, &temp.reduce_temp) /*+
		((n_remainder <= SCAN_TILE_SIZE)? 1 : // count an extra tail at the end
		((p_head_flags[SCAN_TILE_SIZE] != 0)? 1 : 0))*/; // count an extra tail if the first element of the next tile is head
	// reduce to get tile count (the extra tail is already accounted for by the first thread above)

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.reduce_temp

	{
		uintwarp_t n_wg_head_flags = n_warp_ballot(n_my_head_flags != 0); // all the threads need to contribute!
		if(!n_lane) // the first thread of each warp
			temp.p_shared_head_flags[n_warp] = n_wg_head_flags;

		barrier(CLK_LOCAL_MEM_FENCE); // wait for all warp representatives to write

		if(l < SCAN_WORKGROUP_WARP_NUM)
			n_wg_head_flags = n_warp_ballot(temp.p_shared_head_flags[l] != 0); // if there are less than 32 warps, the other threads contribute zeros
		// reduce head flags (don't really need the value - only zero / nonzero, could simply use atomic or)
		// only the value held by thread 0 is used below

		if(!l) {
			p_tail_counts[g] = n_tile_tail_num;
			p_tile_head_flags[g] = n_wg_head_flags;
		}
	}
	// this is fragile code on K40, not using n_wg_head_flags the whole way through leads to errors
	// could have been caused by n_my_head_flags declared in both internal and
	// the wrapping scope and different behavior of different driver versions
}

/**
 *	@brief counts the tail flags of each tile and records whether the tile contains head flags;
 *		the head flags are decoded by n_Decode_HeadFlags_WritePacked(), which is also given
 *		the packed head flags array
 *
 *	Each work-group handles the tile of SCAN_TILE_SIZE elements that starts at
 *	get_group_id(0) * SCAN_TILE_SIZE. Local thread 0 writes the two per-tile results.
 *
 *	The tails are counted as heads: every head flag except for the one of the very first
 *	element of the tile counts as a tail, plus one extra tail if the data end in this tile
 *	or if the first element of the next tile has its head flag set.
 *
 *	@param[out] p_tail_counts is the destination for the tail counts (one element per work-group)
 *	@param[out] p_tile_head_flags is the destination for the tile head flags (one element
 *		per work-group; only zero / nonzero is of interest, see the comments below)
 *	@param[out] p_packed_head_flags is the array of packed head flags, passed to the decoding
 *		function (presumably filled by it, as its name suggests - TODO confirm); this
 *		work-group starts at word (g * SCAN_TILE_SIZE) >> LOG_WARP_SIZE
 *	@param[in] p_head_flags is the array of head flags, one uint32_t per element (nonzero is
 *		tested for the element following the tile)
 *	@param[in] n_size is number of elements (the tile remainder is calculated from this)
 *
 *	@note The order of the statements in the final block is deliberate, see the remark
 *		about K40 at the end of the function.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_Bootstrap_Pack(__global uint32_t *p_tail_counts,
	__global uint32_t *p_tile_head_flags, __global uintwarp_t *p_packed_head_flags,
	__global const uint32_t *p_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// if >32, would have to use 64-bit type for n_my_head_flags_1 and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// = min(n_size, n_start + SCAN_TILE_SIZE) - n_start; // n_start is expected to be smaller than n_size (if not then we launched too many work-groups); the max() keeps the unsigned subtraction from underflowing in that case
	// note that the remainder is not clamped to SCAN_TILE_SIZE; n_remainder <= SCAN_TILE_SIZE
	// means that the data end inside (or exactly at the end of) this tile

	p_head_flags += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	// from now on, both pointers are relative to this tile

	__local union {
		THeadFlag_DecodePackStorage headflag_temp;
		TLocalReduceStorage_Int reduce_temp; // need reduction
		uintwarp_t p_shared_head_flags[SCAN_WORKGROUP_WARP_NUM]; // head flags ballot for each warp
	} temp;
	// the members alias each other, hence the barriers between the individual phases below

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Decode_HeadFlags_WritePacked(p_packed_head_flags,
		p_head_flags, n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// read up to SCAN_TILE_SIZE head flags, use the nice function

	// each thread counts tails, which are heads shifted right + the first head from the next tile
	// this is the same as counting heads, with the exception in the first thread

	uint32_t n_my_tail_flags = n_my_head_flags;
	//n_my_tail_flags &= ~(uint32_t)((!l)? 1 : 0); // ignore the first head flag in the first thread (makes a difference in other than the first tiles)
	n_my_tail_flags >>= (!l)? 1 : 0; // maybe easier
	// the least significant bit is the first head flag (1 << 0), the rest can be interpreted as tail flags

	int n_tail_flag_num = n_popc32(n_my_tail_flags);
	// count tail flags of this thread

	if((!l) && (n_remainder <= SCAN_TILE_SIZE || p_head_flags[SCAN_TILE_SIZE] != 0)) // the || must stay lazy: p_head_flags[SCAN_TILE_SIZE] is only read if there are data past this tile
		++ n_tail_flag_num;
	// only the first thread reads p_head_flags[SCAN_TILE_SIZE] to avoid larger than needed memory traffic

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp

	int n_tile_tail_num = n_LocalReduce_Int(l, n_tail_flag_num, &temp.reduce_temp) /*+
		((n_remainder <= SCAN_TILE_SIZE)? 1 : // count an extra tail at the end
		((p_head_flags[SCAN_TILE_SIZE] != 0)? 1 : 0))*/; // count an extra tail if the first element of the next tile is head
	// reduce to get tile count (the extra tail is already accounted for by the first thread above)

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.reduce_temp

	{
		uintwarp_t n_wg_head_flags = n_warp_ballot(n_my_head_flags != 0); // all the threads need to contribute!
		if(!n_lane) // the first thread of each warp
			temp.p_shared_head_flags[n_warp] = n_wg_head_flags;

		barrier(CLK_LOCAL_MEM_FENCE); // wait for all warp representatives to write

		if(l < SCAN_WORKGROUP_WARP_NUM)
			n_wg_head_flags = n_warp_ballot(temp.p_shared_head_flags[l] != 0); // if there are less than 32 warps, the other threads contribute zeros
		// reduce head flags (don't really need the value - only zero / nonzero, could simply use atomic or)
		// only the value held by thread 0 is used below

		if(!l) {
			p_tail_counts[g] = n_tile_tail_num;
			p_tile_head_flags[g] = n_wg_head_flags;
		}
	}
	// this is fragile code on K40, not using n_wg_head_flags the whole way through leads to errors
	// could have been caused by n_my_head_flags declared in both internal and
	// the wrapping scope and different behavior of different driver versions
}

/**
 *	@brief counts the tail flags of each tile and records whether the tile contains head flags;
 *		the head flags are read from a packed array by n_Read_Packed_HeadFlags()
 *
 *	Each work-group handles the tile of SCAN_TILE_SIZE elements that starts at
 *	get_group_id(0) * SCAN_TILE_SIZE. Local thread 0 writes the two results.
 *
 *	The tails are counted as heads: every head flag except for the one of the very first
 *	element of the tile counts as a tail, plus one extra tail if the data end in this tile
 *	or if the least significant bit of the packed word at index SCAN_TILE_SIZE_BY_WARP_SIZE
 *	(presumably the first word of the next tile - TODO confirm) is set.
 *
 *	@param[out] p_tail_counts is the destination for the tail counts (one element per work-group)
 *	@param[out] p_tile_head_flags is the destination for the tile head flags (one element
 *		per work-group; only zero / nonzero is of interest, see the comments below)
 *	@param[in] p_packed_head_flags is the array of packed head flags (this work-group starts
 *		reading at word (g * SCAN_TILE_SIZE) >> LOG_WARP_SIZE)
 *	@param[in] n_size is number of elements (the tile remainder is calculated from this)
 *
 *	@note The order of the statements in the final block is deliberate, see the remark
 *		about K40 at the end of the function.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_Bootstrap_Packed(__global uint32_t *p_tail_counts,
	__global uint32_t *p_tile_head_flags, __global const uintwarp_t *p_packed_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// if >32, would have to use 64-bit type for n_my_head_flags_1 and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// = min(n_size, n_start + SCAN_TILE_SIZE) - n_start; // n_start is expected to be smaller than n_size (if not then we launched too many work-groups); the max() keeps the unsigned subtraction from underflowing in that case
	// note that the remainder is not clamped to SCAN_TILE_SIZE; n_remainder <= SCAN_TILE_SIZE
	// means that the data end inside (or exactly at the end of) this tile

	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	// from now on, the pointer is relative to this tile

	__local union {
		THeadFlag_ReadStorage headflag_temp;
		TLocalReduceStorage_Int reduce_temp; // need reduction
		uintwarp_t p_shared_head_flags[SCAN_WORKGROUP_WARP_NUM]; // head flags ballot for each warp
	} temp;
	// the members alias each other, hence the barriers between the individual phases below

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Read_Packed_HeadFlags(p_packed_head_flags,
		n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// read up to SCAN_TILE_SIZE head flags, use the nice function

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp
	// (here the barrier comes right after the read; the code up to the reduction below
	// only works with registers and global memory)

	// each thread counts tails, which are heads shifted right + the first head from the next tile
	// this is the same as counting heads, with the exception in the first thread

	uint32_t n_my_tail_flags = n_my_head_flags;
	//n_my_tail_flags &= ~(uint32_t)((!l)? 1 : 0); // ignore the first head flag in the first thread (makes a difference in other than the first tiles)
	n_my_tail_flags >>= (!l)? 1 : 0; // maybe easier
	// the least significant bit is the first head flag (1 << 0), the rest can be interpreted as tail flags

	int n_tail_flag_num = n_popc32(n_my_tail_flags);
	// count tail flags of this thread

	int n_extra_tail_flag = 0; // shorter branch
	if((!l))//  (n_remainder <= SCAN_TILE_SIZE || (p_packed_head_flags[SCAN_TILE_SIZE_BY_WARP_SIZE] & 1) != 0)) // the first condition not lazily evaluated, the second one needs to be
		n_extra_tail_flag = (n_remainder <= SCAN_TILE_SIZE)? 1 : p_packed_head_flags[SCAN_TILE_SIZE_BY_WARP_SIZE]; // the ternary keeps the read lazy: the packed word is only read if there are data past this tile
	n_tail_flag_num += n_extra_tail_flag & 1; // shorter branch (the whole packed word was stored above, only its least significant bit counts)
	// only the first thread reads (p_packed_head_flags[SCAN_TILE_SIZE_BY_WARP_SIZE] & 1) to avoid larger than needed memory traffic

	int n_tile_tail_num = n_LocalReduce_Int(l, n_tail_flag_num, &temp.reduce_temp) /*+
		((n_remainder <= SCAN_TILE_SIZE)? 1 : // count an extra tail at the end
		(((p_packed_head_flags[SCAN_TILE_SIZE_BY_WARP_SIZE] & 1) != 0)? 1 : 0))*/; // count an extra tail if the first element of the next tile is head
	// reduce to get tile count (the extra tail is already accounted for by the first thread above)

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.reduce_temp

	{
		uintwarp_t n_wg_head_flags = n_warp_ballot(n_my_head_flags != 0); // all the threads need to contribute!
		if(!n_lane) // the first thread of each warp
			temp.p_shared_head_flags[n_warp] = n_wg_head_flags;

		barrier(CLK_LOCAL_MEM_FENCE); // wait for all warp representatives to write

		if(l < SCAN_WORKGROUP_WARP_NUM)
			n_wg_head_flags = n_warp_ballot(temp.p_shared_head_flags[l] != 0); // if there are less than 32 warps, the other threads contribute zeros
		// reduce head flags (don't really need the value - only zero / nonzero, could simply use atomic or)
		// only the value held by thread 0 is used below

		if(!l) {
			p_tail_counts[g] = n_tile_tail_num;
			p_tile_head_flags[g] = n_wg_head_flags;
		}
	}
	// this is fragile code on K40, not using n_wg_head_flags the whole way through leads to errors
	// could have been caused by n_my_head_flags declared in both internal and
	// the wrapping scope and different behavior of different driver versions
}

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS

/**
 *	@brief counts the tail flags of each tile and records whether the tile contains head flags;
 *		the head flags are read from a packed array by n_Read_PackedUninterleaved_HeadFlags()
 *
 *	Each work-group handles the tile of SCAN_TILE_SIZE elements that starts at
 *	get_group_id(0) * SCAN_TILE_SIZE. Local thread 0 writes the two results.
 *
 *	The tails are counted as heads: every head flag except for the one of the very first
 *	element of the tile counts as a tail, plus one extra tail if the data end in this tile
 *	or if the least significant bit of the packed word at index SCAN_TILE_SIZE_BY_WARP_SIZE
 *	(presumably the first word of the next tile - TODO confirm) is set.
 *
 *	@param[out] p_tail_counts is the destination for the tail counts (one element per work-group)
 *	@param[out] p_tile_head_flags is the destination for the tile head flags (one element
 *		per work-group; only zero / nonzero is of interest, see the comments below)
 *	@param[in] p_packed_head_flags is the array of packed head flags (this work-group starts
 *		reading at word (g * SCAN_TILE_SIZE) >> LOG_WARP_SIZE)
 *	@param[in] n_size is number of elements (the tile remainder is calculated from this)
 *
 *	@note The order of the statements in the final block is deliberate, see the remark
 *		about K40 at the end of the function.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_Bootstrap_PackedNatural(__global uint32_t *p_tail_counts,
	__global uint32_t *p_tile_head_flags, __global const uintwarp_t *p_packed_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// if >32, would have to use 64-bit type for n_my_head_flags_1 and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// = min(n_size, n_start + SCAN_TILE_SIZE) - n_start; // n_start is expected to be smaller than n_size (if not then we launched too many work-groups); the max() keeps the unsigned subtraction from underflowing in that case
	// note that the remainder is not clamped to SCAN_TILE_SIZE; n_remainder <= SCAN_TILE_SIZE
	// means that the data end inside (or exactly at the end of) this tile

	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	// from now on, the pointer is relative to this tile

	__local union {
		THeadFlag_ReadStorage1 headflag_temp;
		TLocalReduceStorage_Int reduce_temp; // need reduction
		uintwarp_t p_shared_head_flags[SCAN_WORKGROUP_WARP_NUM]; // head flags ballot for each warp
	} temp;
	// the members alias each other, hence the barriers between the individual phases below

	const unsigned int n_warp = l >> LOG_WARP_SIZE;
	const unsigned int n_lane = l & (WARP_SIZE - 1);

	uint32_t n_my_head_flags = n_Read_PackedUninterleaved_HeadFlags(p_packed_head_flags,
		n_remainder, l, n_warp, n_lane, &temp.headflag_temp);
	// read up to SCAN_TILE_SIZE head flags, use the nice function

	// each thread counts tails, which are heads shifted right + the first head from the next tile
	// this is the same as counting heads, with the exception in the first thread

	uint32_t n_my_tail_flags = n_my_head_flags;
	//n_my_tail_flags &= ~(uint32_t)((!l)? 1 : 0); // ignore the first head flag in the first thread (makes a difference in other than the first tiles)
	n_my_tail_flags >>= (!l)? 1 : 0; // maybe easier
	// the least significant bit is the first head flag (1 << 0), the rest can be interpreted as tail flags

	int n_tail_flag_num = n_popc32(n_my_tail_flags);
	// count tail flags of this thread

	if((!l) && (n_remainder <= SCAN_TILE_SIZE || (p_packed_head_flags[SCAN_TILE_SIZE_BY_WARP_SIZE] & 1) != 0)) // the || must stay lazy: the packed word is only read if there are data past this tile
		++ n_tail_flag_num;
	// only the first thread reads (p_packed_head_flags[SCAN_TILE_SIZE_BY_WARP_SIZE] & 1) to avoid larger than needed memory traffic

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp

	int n_tile_tail_num = n_LocalReduce_Int(l, n_tail_flag_num, &temp.reduce_temp) /*+
		((n_remainder <= SCAN_TILE_SIZE)? 1 : // count an extra tail at the end
		(((p_packed_head_flags[SCAN_TILE_SIZE_BY_WARP_SIZE] & 1) != 0)? 1 : 0))*/; // count an extra tail if the first element of the next tile is head
	// reduce to get tile count (the extra tail is already accounted for by the first thread above)

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.reduce_temp

	{
		uintwarp_t n_wg_head_flags = n_warp_ballot(n_my_head_flags != 0); // all the threads need to contribute!
		if(!n_lane) // the first thread of each warp
			temp.p_shared_head_flags[n_warp] = n_wg_head_flags;

		barrier(CLK_LOCAL_MEM_FENCE); // wait for all warp representatives to write

		if(l < SCAN_WORKGROUP_WARP_NUM)
			n_wg_head_flags = n_warp_ballot(temp.p_shared_head_flags[l] != 0); // if there are less than 32 warps, the other threads contribute zeros
		// reduce head flags (don't really need the value - only zero / nonzero, could simply use atomic or)
		// only the value held by thread 0 is used below

		if(!l) {
			p_tail_counts[g] = n_tile_tail_num;
			p_tile_head_flags[g] = n_wg_head_flags;
		}
	}
	// this is fragile code on K40, not using n_wg_head_flags the whole way through leads to errors
	// could have been caused by n_my_head_flags declared in both internal and
	// the wrapping scope and different behavior of different driver versions
}

#endif // USE_STRIDED_PACKED_HEAD_FLAGS

/**
 *	@brief segmented reduction of one tile, with one 32-bit head flag word per element
 *
 *	Each work-item processes SCAN_ITEMS_THREAD elements. The head flags are gathered into
 *	a per-thread bit mask (n_my_head_flags_1) where bit i is the head flag of the i-th
 *	element of the thread and bit (i + 1) is at the same time used as the tail flag of the
 *	i-th element. One reduction is written to p_reductions for every tail flag that is set,
 *	ordered by thread id and then by element index inside the thread.
 *
 *	@param[out] p_reductions is the destination for the segment reductions; it is indexed
 *		from zero regardless of the group id (the per-tile offset is commented out below)
 *	@param[in] p_data is the array of values to reduce; the group starts reading at element
 *		get_group_id(0) * SCAN_TILE_SIZE
 *	@param[in] p_head_flags is the array of head flags, offset the same way as p_data
 *	@param[in] n_size is the number of elements (compared against the tile start to get
 *		the number of elements remaining for this group)
 *
 *	@note NOTE(review): p_reductions is not offset per work-group, so this presumably
 *		expects to be launched with a single work-group (as "Single" suggests) - confirm.
 *	@note The local reduction result (sum) is produced by t_LocalSegScan_CalcDist() but
 *		is not stored anywhere; the code writing the tile sums is commented out.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduceSingle(
	__global _TyScalar *p_reductions, /*__global _TyScalar *p_tile_sums,
	__global const uint32_t *p_tail_counts_scan,*/ __global const _TyScalar *p_data, // p_tail_counts_scan is exclusive and does not need to be one larger
	__global const uint32_t *p_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// bit SCAN_ITEMS_THREAD of n_my_head_flags_1 is used below, so the items must fit in 31 bits
	// if >32, would have to use 64-bit type for n_my_head_flags_1 (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// = min(n_size, n_start + SCAN_TILE_SIZE) - n_start;
	// number of elements from the start of this tile to the end of the data; n_start is expected
	// to be below n_size (if not then we launched too many work-groups), max() makes the result
	// zero instead of underflowing in that case. note that the value is not clamped to
	// SCAN_TILE_SIZE, n_remainder <= SCAN_TILE_SIZE identifies the tile containing the end of the data

	p_data += n_start;
	p_head_flags += n_start;
	//p_reductions += p_tail_counts_scan[g]; // need that much later

	typedef union {
		THeadFlag_DecodeStorage_OneMore headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails
	} TOtherData;
	// a copy of the members of the local union below (without p_shared_dest), only used to measure their size

	enum {
		n_other_data_size = sizeof(TOtherData) / sizeof(_TyScalar), // how many scalars fit in the storage that is needed anyway
		n_coop_write_min_size = (SCAN_TILE_SIZE + 1) / 2,
		n_max_coop_write_size = (n_other_data_size > n_coop_write_min_size)? n_other_data_size : n_coop_write_min_size // the larger of the two
	};

	__local union {
		THeadFlag_DecodeStorage_OneMore headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails

		_TyScalar p_shared_dest[n_max_coop_write_size]; // needed to cooperatively write out the scans
		// this is likely the biggest array in the pack (it is not, headflag_temp and p_shared_data are just as big, unless those are unused)
	} temp;
	// all the members share the same local memory, the barriers below separate the individual uses

	uint32_t n_my_head_flags_1 = n_Decode_HeadFlags_OneMore(p_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// use a modified function (presumably yields one more flag than there are items per thread,
	// the head flag of the first item of the next thread - TODO confirm against its definition)

	if((l == SCAN_LOCAL_WORK_SIZE - 1) & (n_remainder <= SCAN_TILE_SIZE /*|| p_head_flags[SCAN_TILE_SIZE] != 0*/)) // the first condition not lazily evaluated, the second one needs to be
		n_my_head_flags_1 |= 1 << SCAN_ITEMS_THREAD; // or with a constant
	// the data end in this tile: force a tail flag after the last item of the last thread
	// item with index SCAN_TILE_SIZE maps to the last thread // no need to read p_head_flags[SCAN_TILE_SIZE], that would already have been done in n_Decode_HeadFlags_OneMore()

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.headflag_temp ----------

	const int n_my_tail_num = n_popc32(n_my_head_flags_1 >> 1);
	// count tail flags to see how much do I write (the shift drops the head flag of the first item, which is not a tail of any item of this thread)

	int n_workgroup_write_num;
	int n_my_write_pos = n_LocalExScan_Int(l, n_my_tail_num,
		&n_workgroup_write_num, &temp.int_scan_temp);
	// calculate where each thread writes (n_workgroup_write_num receives the number of
	// reductions written by the whole work-group, it is used as such below)

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.int_scan_temp ----------

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (presumably the items past n_remainder
	// are replaced by REDUCTION_IDENTITY - TODO confirm against the macro)

	_TyScalar sum;
	/*if(!n_workgroup_write_num) { // convergent branch
		// use a simple tile reduce, there are no segment boundaries here (fast path for long segments)

		_TyScalar partial = t_ReductionElemOp(p_my_data[0]);
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		// perform thread-local reduction

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		sum = t_LocalReduce(l, partial, &temp.reduce_temp);
	} else*/ {
		_TyScalar partial = t_ReductionElemOp(p_my_data[0]), p_my_scan[SCAN_ITEMS_THREAD]; // ignore the first head flag since there is no carry-in
		p_my_scan[0] = partial;
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
			if(n_my_head_flags_1 & (1 << i)) // can't access temp.headflag_temp anymore
				partial = REDUCTION_IDENTITY; // a new segment starts at item i
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
			p_my_scan[i] = partial;
		}
		// perform thread-local (inclusive, segmented) scan
		// partial holds the reduction of the last segment in this thread

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		_TyScalar scan = t_LocalSegScan_CalcDist(l, (n_my_head_flags_1 &
			n_Mask_32(SCAN_ITEMS_THREAD)) != 0, partial, &temp.seg_scan_temp, &sum);
		// use segmented tile reduce; the flag passed tells whether this thread has any head
		// flag on its own items (the extra bit SCAN_ITEMS_THREAD is masked out)
		// scan is used below as the carry-in for the items of this thread

		if(n_my_head_flags_1 & 1) // can't access temp.headflag_temp anymore
			scan = REDUCTION_IDENTITY;
		// zero the scan if the first head flag is set

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.seg_scan_temp ----------

		if(n_workgroup_write_num <= n_max_coop_write_size) { // convergent branch
			// write it to p_shared_dest first, then cooperatively write that to global memory

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]); // apply the carry-in

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					temp.p_shared_dest[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY; // the carry-in does not extend past the tail
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all

			barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to fill temp.p_shared_dest

			for(unsigned int i = l; i < n_workgroup_write_num; i += SCAN_LOCAL_WORK_SIZE)
				p_reductions[i] = temp.p_shared_dest[i];
			// write it to global memory cooperatively (loops not unrolled though, the number
			// of writes depends on the values on the flags and is relatively unbounded; could
			// likely use Duff's device)
		} else {
			// too much data to hold on local memory, write it to global directly (likely not the
			// worst waste as the writes would be quite dense and so almost coalesced)

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]); // apply the carry-in

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					p_reductions[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY; // the carry-in does not extend past the tail
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all, write it to global memory immediately (causes uncoalesced writes)
		}
	}
	// reduce, write reductions under the tail flags

	//if(!l)
	//	p_tile_sums[g] = sum;
	// write tile sum always
}

/**
 *	@brief segmented reduction of one tile, with the head flags packed in uintwarp_t words
 *
 *	Each work-item processes SCAN_ITEMS_THREAD elements. The head flags are read into
 *	a per-thread bit mask (n_my_head_flags_1) where bit i is the head flag of the i-th
 *	element of the thread and bit (i + 1) is at the same time used as the tail flag of the
 *	i-th element. One reduction is written to p_reductions for every tail flag that is set,
 *	ordered by thread id and then by element index inside the thread.
 *
 *	@param[out] p_reductions is the destination for the segment reductions; it is indexed
 *		from zero regardless of the group id (the per-tile offset is commented out below)
 *	@param[in] p_data is the array of values to reduce; the group starts reading at element
 *		get_group_id(0) * SCAN_TILE_SIZE
 *	@param[in] p_packed_head_flags is the array of packed head flags; the group starts
 *		reading at word (get_group_id(0) * SCAN_TILE_SIZE) >> LOG_WARP_SIZE (presumably
 *		one bit per element - TODO confirm against n_Read_Packed_HeadFlags_OneMore())
 *	@param[in] n_size is the number of elements (compared against the tile start to get
 *		the number of elements remaining for this group)
 *
 *	@note NOTE(review): p_reductions is not offset per work-group, so this presumably
 *		expects to be launched with a single work-group (as "Single" suggests) - confirm.
 *	@note The local reduction result (sum) is produced by t_LocalSegScan_CalcDist() but
 *		is not stored anywhere; the code writing the tile sums is commented out.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduceSingle_Packed(
	__global _TyScalar *p_reductions, /*__global _TyScalar *p_tile_sums,
	__global const uint32_t *p_tail_counts_scan,*/ __global const _TyScalar *p_data, // p_tail_counts_scan is exclusive and does not need to be one larger
	__global const uintwarp_t *p_packed_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// bit SCAN_ITEMS_THREAD of n_my_head_flags_1 is used below, so the items must fit in 31 bits
	// if >32, would have to use 64-bit type for n_my_head_flags_1 (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// = min(n_size, n_start + SCAN_TILE_SIZE) - n_start;
	// number of elements from the start of this tile to the end of the data; n_start is expected
	// to be below n_size (if not then we launched too many work-groups), max() makes the result
	// zero instead of underflowing in that case. note that the value is not clamped to
	// SCAN_TILE_SIZE, n_remainder <= SCAN_TILE_SIZE identifies the tile containing the end of the data

	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE; // the flags are addressed in units of uintwarp_t words
	//p_reductions += p_tail_counts_scan[g]; // need that much later

	typedef union {
		THeadFlag_ReadStorage_OneMore headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails
	} TOtherData;
	// a copy of the members of the local union below (without p_shared_dest), only used to measure their size

	enum {
		n_other_data_size = sizeof(TOtherData) / sizeof(_TyScalar), // how many scalars fit in the storage that is needed anyway
		n_coop_write_min_size = (SCAN_TILE_SIZE + 1) / 2,
		n_max_coop_write_size = (n_other_data_size > n_coop_write_min_size)? n_other_data_size : n_coop_write_min_size // the larger of the two
	};

	__local union {
		THeadFlag_ReadStorage_OneMore headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails

		_TyScalar p_shared_dest[n_max_coop_write_size]; // needed to cooperatively write out the scans
		// this is likely the biggest array in the pack (it is not, headflag_temp and p_shared_data are just as big, unless those are unused)
	} temp;
	// all the members share the same local memory, the barriers below separate the individual uses

	uint32_t n_my_head_flags_1 = n_Read_Packed_HeadFlags_OneMore(p_packed_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// use a modified function (presumably yields one more flag than there are items per thread,
	// the head flag of the first item of the next thread - TODO confirm against its definition)

	if((l == SCAN_LOCAL_WORK_SIZE - 1) && (n_remainder <= SCAN_TILE_SIZE /*||
	   (p_packed_head_flags[SCAN_WORKGROUP_WARP_NUM] & 1) != 0*/)) // note that && is used here so the second condition is evaluated lazily
		n_my_head_flags_1 |= 1 << SCAN_ITEMS_THREAD; // or with a constant
	// the data end in this tile: force a tail flag after the last item of the last thread
	// item with index SCAN_TILE_SIZE maps to the last thread // no need to read (p_packed_head_flags[SCAN_WORKGROUP_WARP_NUM] & 1) != 0, that would already have been done in n_Read_Packed_HeadFlags_OneMore()

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.headflag_temp ----------

	const int n_my_tail_num = n_popc32(n_my_head_flags_1 >> 1);
	// count tail flags to see how much do I write (the shift drops the head flag of the first item, which is not a tail of any item of this thread)

	int n_workgroup_write_num;
	int n_my_write_pos = n_LocalExScan_Int(l, n_my_tail_num,
		&n_workgroup_write_num, &temp.int_scan_temp);
	// calculate where each thread writes (n_workgroup_write_num receives the number of
	// reductions written by the whole work-group, it is used as such below)

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.int_scan_temp ----------

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (presumably the items past n_remainder
	// are replaced by REDUCTION_IDENTITY - TODO confirm against the macro)

	_TyScalar sum;
	/*if(!n_workgroup_write_num) { // convergent branch
		// use a simple tile reduce, there are no segment boundaries here (fast path for long segments)

		_TyScalar partial = t_ReductionElemOp(p_my_data[0]);
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		// perform thread-local reduction

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		sum = t_LocalReduce(l, partial, &temp.reduce_temp);
	} else*/ {
		_TyScalar partial = t_ReductionElemOp(p_my_data[0]), p_my_scan[SCAN_ITEMS_THREAD]; // ignore the first head flag since there is no carry-in
		p_my_scan[0] = partial;
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
			if(n_my_head_flags_1 & (1 << i)) // can't access temp.headflag_temp anymore
				partial = REDUCTION_IDENTITY; // a new segment starts at item i
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
			p_my_scan[i] = partial;
		}
		// perform thread-local (inclusive, segmented) scan
		// partial holds the reduction of the last segment in this thread

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		_TyScalar scan = t_LocalSegScan_CalcDist(l, (n_my_head_flags_1 &
			n_Mask_32(SCAN_ITEMS_THREAD)) != 0, partial, &temp.seg_scan_temp, &sum);
		// use segmented tile reduce; the flag passed tells whether this thread has any head
		// flag on its own items (the extra bit SCAN_ITEMS_THREAD is masked out)
		// scan is used below as the carry-in for the items of this thread

		if(n_my_head_flags_1 & 1) // can't access temp.headflag_temp anymore
			scan = REDUCTION_IDENTITY;
		// zero the scan if the first head flag is set

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.seg_scan_temp ----------

		if(n_workgroup_write_num <= n_max_coop_write_size) { // convergent branch
			// write it to p_shared_dest first, then cooperatively write that to global memory

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]); // apply the carry-in

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					temp.p_shared_dest[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY; // the carry-in does not extend past the tail
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all

			barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to fill temp.p_shared_dest

			for(unsigned int i = l; i < n_workgroup_write_num; i += SCAN_LOCAL_WORK_SIZE)
				p_reductions[i] = temp.p_shared_dest[i];
			// write it to global memory cooperatively (loops not unrolled though, the number
			// of writes depends on the values on the flags and is relatively unbounded; could
			// likely use Duff's device)
		} else {
			// too much data to hold on local memory, write it to global directly (likely not the
			// worst waste as the writes would be quite dense and so almost coalesced)

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]); // apply the carry-in

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					p_reductions[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY; // the carry-in does not extend past the tail
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all, write it to global memory immediately (causes uncoalesced writes)
		}
	}
	// reduce, write reductions under the tail flags

	//if(!l)
	//	p_tile_sums[g] = sum;
	// write tile sum always
}

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS

/**
 *	@brief segmented reduction of one tile, with the packed head flags read
 *		by n_Read_PackedUninterleaved_HeadFlags_OneMore()
 *
 *	Each work-item processes SCAN_ITEMS_THREAD elements. The head flags are read into
 *	a per-thread bit mask (n_my_head_flags_1) where bit i is the head flag of the i-th
 *	element of the thread and bit (i + 1) is at the same time used as the tail flag of the
 *	i-th element. One reduction is written to p_reductions for every tail flag that is set,
 *	ordered by thread id and then by element index inside the thread.
 *
 *	Unlike in the other TileSegReduceSingle kernels, the size of the local staging buffer
 *	is fixed to (SCAN_TILE_SIZE + 1) / 2 scalars here.
 *
 *	@param[out] p_reductions is the destination for the segment reductions; it is indexed
 *		from zero regardless of the group id (the per-tile offset is commented out below)
 *	@param[in] p_data is the array of values to reduce; the group starts reading at element
 *		get_group_id(0) * SCAN_TILE_SIZE
 *	@param[in] p_packed_head_flags is the array of packed head flags; the group starts
 *		reading at word (get_group_id(0) * SCAN_TILE_SIZE) >> LOG_WARP_SIZE (presumably
 *		one bit per element, in natural order - TODO confirm against the read function)
 *	@param[in] n_size is the number of elements (compared against the tile start to get
 *		the number of elements remaining for this group)
 *
 *	@note This kernel is only compiled if USE_STRIDED_PACKED_HEAD_FLAGS is defined.
 *	@note NOTE(review): p_reductions is not offset per work-group, so this presumably
 *		expects to be launched with a single work-group (as "Single" suggests) - confirm.
 *	@note The local reduction result (sum) is produced by t_LocalSegScan_CalcDist() but
 *		is not stored anywhere; the code writing the tile sums is commented out.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduceSingle_PackedNatural(
	__global _TyScalar *p_reductions, /*__global _TyScalar *p_tile_sums,
	__global const uint32_t *p_tail_counts_scan,*/ __global const _TyScalar *p_data, // p_tail_counts_scan is exclusive and does not need to be one larger
	__global const uintwarp_t *p_packed_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD < 32, SEGMENTED_REDUCE_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// bit SCAN_ITEMS_THREAD of n_my_head_flags_1 is used below, so the items must fit in 31 bits
	// if >32, would have to use 64-bit type for n_my_head_flags_1 (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;// = min(n_size, n_start + SCAN_TILE_SIZE) - n_start;
	// number of elements from the start of this tile to the end of the data; n_start is expected
	// to be below n_size (if not then we launched too many work-groups), max() makes the result
	// zero instead of underflowing in that case. note that the value is not clamped to
	// SCAN_TILE_SIZE, n_remainder <= SCAN_TILE_SIZE identifies the tile containing the end of the data

	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE; // the flags are addressed in units of uintwarp_t words
	//p_reductions += p_tail_counts_scan[g]; // need that much later

	__local union {
		THeadFlag_ReadStorage1 headflag_temp;
		TLocalScanStorage_Int int_scan_temp; // need int scan to recalculate where each value goes
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // todo - CFI only needed if SCAN_ITEMS_THREAD is even (and not 1 but that is implied by being even)
		TLocalReduceStorage reduce_temp; // need scalar reduction to calculate the values if there are no tails
		TLocalSegScanStorage seg_scan_temp; // need scalar seg scan to calculate the values if there are tails

		_TyScalar p_shared_dest[(SCAN_TILE_SIZE + 1) / 2]; // needed to cooperatively write out the scans
		// this is likely the biggest array in the pack (it is not, headflag_temp and p_shared_data are just as big, unless those are unused)
	} temp;
	// all the members share the same local memory, the barriers below separate the individual uses

	enum {
		n_max_coop_write_size = sizeof(temp.p_shared_dest) / sizeof(temp.p_shared_dest[0]) // so that we can change the size of the array above
	};

	uint32_t n_my_head_flags_1 = n_Read_PackedUninterleaved_HeadFlags_OneMore(p_packed_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// use a modified function (presumably yields one more flag than there are items per thread,
	// the head flag of the first item of the next thread - TODO confirm against its definition)

	if((l == SCAN_LOCAL_WORK_SIZE - 1) & (n_remainder <= SCAN_TILE_SIZE /*|| (p_packed_head_flags[SCAN_WORKGROUP_WARP_NUM] & 1) != 0*/)) // the first condition not lazily evaluated, the second one needs to be
		n_my_head_flags_1 |= 1 << SCAN_ITEMS_THREAD; // or with a constant
	// the data end in this tile: force a tail flag after the last item of the last thread
	// item with index SCAN_TILE_SIZE maps to the last thread // no need to read (p_packed_head_flags[SCAN_WORKGROUP_WARP_NUM] & 1) != 0, that would already have been done in n_Read_PackedUninterleaved_HeadFlags_OneMore()

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.headflag_temp ----------

	const int n_my_tail_num = n_popc32(n_my_head_flags_1 >> 1);
	// count tail flags to see how much do I write (the shift drops the head flag of the first item, which is not a tail of any item of this thread)

	int n_workgroup_write_num;
	int n_my_write_pos = n_LocalExScan_Int(l, n_my_tail_num,
		&n_workgroup_write_num, &temp.int_scan_temp);
	// calculate where each thread writes (n_workgroup_write_num receives the number of
	// reductions written by the whole work-group, it is used as such below)

	barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.int_scan_temp ----------

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (presumably the items past n_remainder
	// are replaced by REDUCTION_IDENTITY - TODO confirm against the macro)

	_TyScalar sum;
	/*if(!n_workgroup_write_num) { // convergent branch
		// use a simple tile reduce, there are no segment boundaries here (fast path for long segments)

		_TyScalar partial = t_ReductionElemOp(p_my_data[0]);
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i)
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		// perform thread-local reduction

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		sum = t_LocalReduce(l, partial, &temp.reduce_temp);
	} else*/ {
		_TyScalar partial = t_ReductionElemOp(p_my_data[0]), p_my_scan[SCAN_ITEMS_THREAD]; // ignore the first head flag since there is no carry-in
		p_my_scan[0] = partial;
		#pragma unroll
		for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
			if(n_my_head_flags_1 & (1 << i)) // can't access temp.headflag_temp anymore
				partial = REDUCTION_IDENTITY; // a new segment starts at item i
			partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
			p_my_scan[i] = partial;
		}
		// perform thread-local (inclusive, segmented) scan
		// partial holds the reduction of the last segment in this thread

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.p_shared_data ----------

		_TyScalar scan = t_LocalSegScan_CalcDist(l, (n_my_head_flags_1 &
			n_Mask_32(SCAN_ITEMS_THREAD)) != 0, partial, &temp.seg_scan_temp, &sum);
		// use segmented tile reduce; the flag passed tells whether this thread has any head
		// flag on its own items (the extra bit SCAN_ITEMS_THREAD is masked out)
		// scan is used below as the carry-in for the items of this thread

		if(n_my_head_flags_1 & 1) // can't access temp.headflag_temp anymore
			scan = REDUCTION_IDENTITY;
		// zero the scan if the first head flag is set

		barrier(CLK_LOCAL_MEM_FENCE); // ---------- wait for everyone to stop using temp.seg_scan_temp ----------

		if(n_workgroup_write_num <= n_max_coop_write_size) { // convergent branch
			// write it to p_shared_dest first, then cooperatively write that to global memory

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]); // apply the carry-in

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					temp.p_shared_dest[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY; // the carry-in does not extend past the tail
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all

			barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to fill temp.p_shared_dest

			for(unsigned int i = l; i < n_workgroup_write_num; i += SCAN_LOCAL_WORK_SIZE)
				p_reductions[i] = temp.p_shared_dest[i];
			// write it to global memory cooperatively (loops not unrolled though, the number
			// of writes depends on the values on the flags and is relatively unbounded; could
			// likely use Duff's device)
		} else {
			// too much data to hold on local memory, write it to global directly (likely not the
			// worst waste as the writes would be quite dense and so almost coalesced)

			// scan zeroed above if the first head flag is set
			#pragma unroll
			for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
				p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]); // apply the carry-in

				if(n_my_head_flags_1 & (1 << (i + 1))) { // and with compile-time const
					p_reductions[n_my_write_pos] = p_my_scan[i];
					++ n_my_write_pos;
					scan = REDUCTION_IDENTITY; // the carry-in does not extend past the tail
				}
				// tail flag? write it out // could also compact it in registers first and then
				// write the first n registers (might be difficult for the compiler to unroll but
				// could use sort of reversed duff's device)
			}
			// reduce all, write it to global memory immediately (causes uncoalesced writes)
		}
	}
	// reduce, write reductions under the tail flags

	//if(!l)
	//	p_tile_sums[g] = sum;
	// write tile sum always
}

#endif // USE_STRIDED_PACKED_HEAD_FLAGS

#endif // BUILD_SEG_REDUCE_KERNELS

#ifdef BUILD_SEG_REDUCE_SPINE_ADJUST_KERNELS

/**
 *	@brief spine adjustment pass of the segmented reduction; carries the partial
 *		reductions over the tile boundaries into the already written reductions
 *
 *	The kernel performs an inclusive segmented scan of p_data (guided by p_head_flags)
 *	inside the work-group. Then, for each scanned item with index n_idx, the first
 *	reduction of the following reduction tile (n_idx + 1) is combined with the scanned
 *	value, provided that the following tile is either the last one or contains tails,
 *	and that the lowest bit of the first head flag word of that tile is not set.
 *
 *	@param[in,out] p_reductions is the array of reductions to fix up, indexed
 *		by the values read from p_tail_counts_scan
 *	@param[in] p_tail_counts_scan is the array of scanned tail counts, one per
 *		reduction tile (same granularity as n_size)
 *	@param[in] p_reduction_head_flags is the array of head flags of the reduced elements,
 *		only the first word of each reduction tile is read and only its lowest bit is tested
 *	@param[in] n_reduction_tile_size is the stride (in uint32_t words) between
 *		the first head flag words of the successive reduction tiles
 *	@param[in] p_data is the array of values to scan (presumably the partial
 *		reductions of the reduction tiles - TODO confirm against the caller)
 *	@param[in] p_head_flags is the array of head flags for p_data (one 32-bit word per item)
 *	@param[in] n_size is the number of items in p_data (to this kernel, it is the number of tiles)
 *
 *	@note NOTE(review): the STATIC_ASSERT below permits SCAN_ITEMS_THREAD == 32 while the
 *		name of its message says "up to 31"; one of them is presumably stale - confirm.
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_SpineAdjust(
	__global _TyScalar *p_reductions, __global const uint32_t *p_tail_counts_scan, // p_tail_counts_scan on the same granularity as n_size (per reduction tile)
	__global const uint32_t *p_reduction_head_flags, const unsigned int n_reduction_tile_size, // p_reduction_head_flags on finer granularity than n_size (per reduction element, tiled with n_reduction_tile_size)
	//const int b_packed_reduction, // always set
	__global const _TyScalar *p_data, __global const uint32_t *p_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // need head and tail flags both in a single var ... maybe
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE;
	//const unsigned int n_remainder = min(max(n_data_size, n_start), n_start + SCAN_TILE_SIZE) - n_start; // avoid underflow ... complicated - t_odo
	const unsigned int n_remainder = max(n_size, n_start) - n_start;//min(n_size, n_start + SCAN_TILE_SIZE) - n_start;
	// number of items from the start of this tile to the end of the data; n_start is expected
	// to be below n_size (if not then we launched too many work-groups), max() makes the result
	// zero instead of underflowing in that case. note that the value is not clamped to SCAN_TILE_SIZE

	p_data += n_start;
	p_head_flags += n_start;
	//p_reductions += n_start; // nope, indexed by p_tail_counts_scan
	p_tail_counts_scan += n_start;
	p_reduction_head_flags += n_start * n_reduction_tile_size;
	// make all the per-item arrays relative to the first item of this work-group

	__local union {
		THeadFlag_DecodeStorage headflag_temp;
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)];
		TLocalSegScanStorage seg_scan_temp; // need seg scan
		//uint32_t p_shared_head_flags[SCAN_WORKGROUP_WARP_NUM]; // tail flags ballot for each warp
	} temp;
	// all the members share the same local memory, the barriers below separate the individual uses

	uint32_t n_my_head_flags = n_Decode_HeadFlags(p_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// collect head flags (bit i is tested below as the head flag of the i-th item of this thread)

	barrier(CLK_LOCAL_MEM_FENCE); // wait for everyone to stop using temp.headflag_temp

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values (presumably the items past n_remainder
	// are replaced by REDUCTION_IDENTITY - TODO confirm against the macro)

	//uint32_t n_my_tail_flags = 0;
	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in
	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

	p_my_scan[0] = partial;
    #pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // can't access temp.headflag_temp anymore
			partial = REDUCTION_IDENTITY; // a new segment starts at item i
        partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local (inclusive, segmented) scan

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish the cooperative load through temp.p_shared_data before reusing the local storage

	_TyScalar carry_out; // receives an output of the seg scan, not used any further in this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan; scan is used below as the carry-in for the items of this thread

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // can't access temp.headflag_temp anymore
			scan = REDUCTION_IDENTITY; // the carry-in ends at the first head flag
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	// could load p_tail_counts_scan to shared memory, now it is accessed poorly, will suffer horrible performance if there are many reductions
	//		not true. even if the average segment length is 1, this only fixes the first segment in each tile which is a relatively small cost!

	// need to figure out how to sample the head flags or how to send it from the bootstrap kernel (maybe better)
	// note that the head flags passed to this kernel do not coincide with the sampled flags at all

	// todo - use tail counts rather than scan of it, will halve the bandwidth for very little extra memory!
	//		  not exactly true, will still need to read p_tail_counts_scan[n_idx + 1] to know which reduction to fix
	//		- it is unlikely that the number of tiles of reductions would surpass 4 GB on current hardware (then, assuming tile size of 1024, there would be 4 TB of reduced data which doesn't even fit main memory of most systems nowadays)
	//		  so it would be possible to encode tail count as the high bit in tail counts scan (would require a custom intscan kernel)
	//		- or just read it into shared memory for all threads to easily access (eats a lot of shared memory)
	//		- or on NVIDIA platforms read directly to registers and shuffle up

	//if(b_packed_reduction) {
		#pragma unroll
		for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
			unsigned int n_idx = i + l * SCAN_ITEMS_THREAD; // index of the item in this work-group (each item corresponds to one reduction tile)
			const unsigned int n_tile_num = n_remainder; // to this kernel, size is tile num

			if(n_idx + 2 <= n_tile_num) { // there is a tile following this one (the fixup applies to the tile n_idx + 1)
				unsigned int n_tc_1 = p_tail_counts_scan[n_idx + 1]; // index of the first reduction of the following tile
				if((n_idx + 2 == n_tile_num || n_tc_1 < p_tail_counts_scan[n_idx + 2]) && // the tile has tails in it or it is the last one (implicit tail); the || makes sure that p_tail_counts_scan[n_idx + 2] is not read for the last tile
				   (p_reduction_head_flags[(n_idx + 1) * n_reduction_tile_size] & 1) == 0) { // and there is no head flag on the first element
					p_reductions[n_tc_1] = t_ReductionReduceOp(p_reductions[n_tc_1], p_my_scan[i]);
					// fixup this tail index
				}
			}
		}
	/*} else {
		#pragma unroll
		for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
			unsigned int n_idx = i + l * SCAN_ITEMS_THREAD; // global tile index
			const unsigned int n_tile_num = n_remainder; // to this kernel, size is tile num

			if(n_idx + 2 <= n_tile_num) { // we are not past the end of the tile
				unsigned int n_tc_1 = p_tail_counts_scan[n_idx + 1];
				if((n_idx + 2 == n_tile_num || n_tc_1 < p_tail_counts_scan[n_idx + 2]) && // the tile has tails in it or it is the last one (implicit tail)
				   p_reduction_head_flags[(n_idx + 1) * n_reduction_tile_size] == 0) { // and there is no head flag on the first element
					p_reductions[n_tc_1] = t_ReductionReduceOp(p_reductions[n_tc_1], p_my_scan[i]);
					// fixup this tail index
				}
			}
		}
	}*/
}

// t_odo this could use packed p_reduction_head_flags as well (the complexity of interleaved / natural does not matter as the first flag is always on the same place)
/**
 *	@brief spine adjustment kernel of the segmented reduction; the head flags
 *		are read using n_Read_Packed_HeadFlags()
 *
 *	Each work-group handles SCAN_TILE_SIZE consecutive elements of p_data, starting
 *	at get_group_id(0) * SCAN_TILE_SIZE, with SCAN_ITEMS_THREAD elements per thread.
 *	The kernel calculates an inclusive segmented scan of those elements (restarted
 *	at the head flags) in three steps: a scan inside each thread, a work-group wide
 *	t_LocalSegScan_CalcDist() call over the per-thread totals and a final pass which
 *	combines the value returned by that call with the per-thread scan. No carry-in
 *	from the other work-groups is read by this kernel.
 *
 *	The scan value of element n_idx is then folded into
 *	p_reductions[p_tail_counts_scan[n_idx + 1]], provided that:
 *		- n_idx + 2 <= n_remainder (the elements left from the start of this work-group),
 *		- either n_idx + 2 == n_remainder, or p_tail_counts_scan[n_idx + 1] is smaller
 *		  than p_tail_counts_scan[n_idx + 2] and
 *		- bit 0 of p_reduction_head_flags[(n_idx + 1) * n_reduction_tile_size] is zero.
 *
 *	@param[in,out] p_reductions is array of reductions to be fixed up in place
 *		(indexed by the values read from p_tail_counts_scan)
 *	@param[in] p_tail_counts_scan is scan of tail counts, on the same granularity
 *		as n_size (per reduction tile)
 *	@param[in] p_reduction_head_flags is array of head flags on finer granularity than
 *		n_size (per reduction element, tiled with n_reduction_tile_size); only bit 0
 *		of the first word of each tile is tested here
 *	@param[in] n_reduction_tile_size is stride between the tested entries of p_reduction_head_flags
 *	@param[in] p_data is array of values to be scanned
 *	@param[in] p_packed_head_flags is array of packed head flags for p_data
 *		(this work-group starts reading at word n_start >> LOG_WARP_SIZE)
 *	@param[in] n_size is number of elements of p_data (to this kernel, that is
 *		the number of reduction tiles)
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_SpineAdjust_Packed(
	__global _TyScalar *p_reductions, __global const uint32_t *p_tail_counts_scan, // p_tail_counts_scan on the same granularity as n_size (per reduction tile)
	__global const uint32_t *p_reduction_head_flags, const unsigned int n_reduction_tile_size, // p_reduction_head_flags on finer granularity than n_size (per reduction element, tiled with n_reduction_tile_size)
	__global const _TyScalar *p_data, __global const uintwarp_t *p_packed_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // the head flags of one thread must fit in a single 32-bit variable
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)
	// NOTE(review): the condition admits 32 items per thread while the message says 31 - confirm which one is intended

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // the first element handled by this work-group
	const unsigned int n_remainder = max(n_size, n_start) - n_start; // number of elements from n_start to the end of the data; max() avoids unsigned underflow if n_start is greater than n_size (which would mean too many work-groups were launched)
	// note that this is not clamped to SCAN_TILE_SIZE, so it can be greater than the tile size in all but the last work-group

	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	// p_reductions is not offset, it is indexed by the values of p_tail_counts_scan
	p_tail_counts_scan += n_start;
	p_reduction_head_flags += n_start * n_reduction_tile_size;
	// make all the per-element pointers relative to the first element of this work-group

	__local union {
		THeadFlag_ReadStorage headflag_temp; // scratch for reading the head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // scratch for loading the data
		TLocalSegScanStorage seg_scan_temp; // scratch for the work-group segmented scan
	} temp;
	// the three scratch areas overlap (union), the barriers below separate their uses

	uint32_t n_my_head_flags = n_Read_Packed_HeadFlags(p_packed_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// collect head flags; bit i is tested below as the head flag of the i-th item of this thread

	barrier(CLK_LOCAL_MEM_FENCE); // presumably so that all the threads are done with temp.headflag_temp before temp.p_shared_data is written - confirm against n_Read_Packed_HeadFlags()

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values
	// NOTE(review): REDUCTION_IDENTITY is presumably the value substituted for the elements past n_remainder - confirm against the macro definition

	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in
	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

	p_my_scan[0] = partial;
    #pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // a head flag on the i-th item starts a new segment
			partial = REDUCTION_IDENTITY;
        partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local (per-thread) inclusive segmented scan; partial ends up being the total of the last segment of this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish loading the data through temp.p_shared_data before reusing the local storage as temp.seg_scan_temp

	_TyScalar carry_out; // written by t_LocalSegScan_CalcDist() but not read anywhere in this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan over the per-thread totals; the second argument tells whether this thread has any head flag
	// NOTE(review): scan is presumably the carry-in from the preceding threads of the work-group - confirm against t_LocalSegScan_CalcDist()

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // the carry-in only applies up to the first head flag of this thread (here the flag of item 0 counts as well)
			scan = REDUCTION_IDENTITY;
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all: combine the carry-in with the per-thread scan

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	// could load p_tail_counts_scan to shared memory, now it is accessed poorly, will suffer horrible performance if there are many reductions
	//		not true. even if the average segment length is 1, this only fixes the first segment in each tile which is a relatively small cost!

	// need to figure out how to sample the head flags or how to send it from the bootstrap kernel (maybe better)
	// note that the head flags passed to this kernel do not coincide with the sampled flags at all

	// todo - use tail counts rather than scan of it, will halve the bandwidth for very little extra memory!
	//		  not exactly true, will still need to read p_tail_counts_scan[n_idx + 1] to know which reduction to fix
	//		- it is unlikely that the number of tiles of reductions would surpass 4 GB on current hardware (then, assuming tile size of 1024, there would be 4 TB of reduced data which doesn't even fit main memory of most systems nowadays)
	//		  so it would be possible to encode tail count as the high bit in tail counts scan (would require a custom intscan kernel)
	//		- or just read it into shared memory for all threads to easily access (eats a lot of shared memory)
	//		- or on NVIDIA platforms read directly to registers and shuffle up

		// the reduction head flags are always handled as packed here: only bit 0 of the flag word is tested
		// (an older, disabled variant of this loop compared the whole word to zero instead)
		#pragma unroll
		for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
			unsigned int n_idx = i + l * SCAN_ITEMS_THREAD; // index of the reduction tile, relative to the start of this work-group
			const unsigned int n_tile_num = n_remainder; // to this kernel, size is tile num

			if(n_idx + 2 <= n_tile_num) { // there is a next tile (n_idx + 1) to fix up
				unsigned int n_tc_1 = p_tail_counts_scan[n_idx + 1]; // index of the reduction to fix up
				if((n_idx + 2 == n_tile_num || n_tc_1 < p_tail_counts_scan[n_idx + 2]) && // the tile has tails in it or it is the last one (implicit tail)
				   (p_reduction_head_flags[(n_idx + 1) * n_reduction_tile_size] & 1) == 0) { // and there is no head flag on the first element
					p_reductions[n_tc_1] = t_ReductionReduceOp(p_reductions[n_tc_1], p_my_scan[i]);
					// fixup this tail index
				}
			}
		}
}

#ifdef USE_STRIDED_PACKED_HEAD_FLAGS

/**
 *	@brief spine adjustment kernel of the segmented reduction; the head flags
 *		are read using n_Read_PackedUninterleaved_HeadFlags()
 *
 *	Only compiled if USE_STRIDED_PACKED_HEAD_FLAGS is defined. Apart from the head
 *	flag reading function and its storage type (THeadFlag_ReadStorage1), the body
 *	is the same as that of TileSegReduce_SpineAdjust_Packed().
 *
 *	Each work-group handles SCAN_TILE_SIZE consecutive elements of p_data, starting
 *	at get_group_id(0) * SCAN_TILE_SIZE, with SCAN_ITEMS_THREAD elements per thread.
 *	The kernel calculates an inclusive segmented scan of those elements (restarted
 *	at the head flags) in three steps: a scan inside each thread, a work-group wide
 *	t_LocalSegScan_CalcDist() call over the per-thread totals and a final pass which
 *	combines the value returned by that call with the per-thread scan. No carry-in
 *	from the other work-groups is read by this kernel.
 *
 *	The scan value of element n_idx is then folded into
 *	p_reductions[p_tail_counts_scan[n_idx + 1]], provided that:
 *		- n_idx + 2 <= n_remainder (the elements left from the start of this work-group),
 *		- either n_idx + 2 == n_remainder, or p_tail_counts_scan[n_idx + 1] is smaller
 *		  than p_tail_counts_scan[n_idx + 2] and
 *		- bit 0 of p_reduction_head_flags[(n_idx + 1) * n_reduction_tile_size] is zero.
 *
 *	@param[in,out] p_reductions is array of reductions to be fixed up in place
 *		(indexed by the values read from p_tail_counts_scan)
 *	@param[in] p_tail_counts_scan is scan of tail counts, on the same granularity
 *		as n_size (per reduction tile)
 *	@param[in] p_reduction_head_flags is array of head flags on finer granularity than
 *		n_size (per reduction element, tiled with n_reduction_tile_size); only bit 0
 *		of the first word of each tile is tested here
 *	@param[in] n_reduction_tile_size is stride between the tested entries of p_reduction_head_flags
 *	@param[in] p_data is array of values to be scanned
 *	@param[in] p_packed_head_flags is array of packed head flags for p_data
 *		(this work-group starts reading at word n_start >> LOG_WARP_SIZE)
 *	@param[in] n_size is number of elements of p_data (to this kernel, that is
 *		the number of reduction tiles)
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_SpineAdjust_PackedNatural(
	__global _TyScalar *p_reductions, __global const uint32_t *p_tail_counts_scan, // p_tail_counts_scan on the same granularity as n_size (per reduction tile)
	__global const uint32_t *p_reduction_head_flags, const unsigned int n_reduction_tile_size, // p_reduction_head_flags on finer granularity than n_size (per reduction element, tiled with n_reduction_tile_size)
	__global const _TyScalar *p_data, __global const uintwarp_t *p_packed_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // the head flags of one thread must fit in a single 32-bit variable
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)
	// NOTE(review): the condition admits 32 items per thread while the message says 31 - confirm which one is intended

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // the first element handled by this work-group
	const unsigned int n_remainder = max(n_size, n_start) - n_start; // number of elements from n_start to the end of the data; max() avoids unsigned underflow if n_start is greater than n_size (which would mean too many work-groups were launched)
	// note that this is not clamped to SCAN_TILE_SIZE, so it can be greater than the tile size in all but the last work-group

	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	// p_reductions is not offset, it is indexed by the values of p_tail_counts_scan
	p_tail_counts_scan += n_start;
	p_reduction_head_flags += n_start * n_reduction_tile_size;
	// make all the per-element pointers relative to the first element of this work-group

	__local union {
		THeadFlag_ReadStorage1 headflag_temp; // scratch for reading the head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // scratch for loading the data
		TLocalSegScanStorage seg_scan_temp; // scratch for the work-group segmented scan
	} temp;
	// the three scratch areas overlap (union), the barriers below separate their uses

	uint32_t n_my_head_flags = n_Read_PackedUninterleaved_HeadFlags(p_packed_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// collect head flags; bit i is tested below as the head flag of the i-th item of this thread

	barrier(CLK_LOCAL_MEM_FENCE); // presumably so that all the threads are done with temp.headflag_temp before temp.p_shared_data is written - confirm against n_Read_PackedUninterleaved_HeadFlags()

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values
	// NOTE(review): REDUCTION_IDENTITY is presumably the value substituted for the elements past n_remainder - confirm against the macro definition

	_TyScalar partial = t_ReductionElemOp(p_my_data[0]); // ignore the first head flag since there is no carry-in
	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

	p_my_scan[0] = partial;
    #pragma unroll
	for(int i = 1; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // a head flag on the i-th item starts a new segment
			partial = REDUCTION_IDENTITY;
        partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local (per-thread) inclusive segmented scan; partial ends up being the total of the last segment of this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish loading the data through temp.p_shared_data before reusing the local storage as temp.seg_scan_temp

	_TyScalar carry_out; // written by t_LocalSegScan_CalcDist() but not read anywhere in this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan over the per-thread totals; the second argument tells whether this thread has any head flag
	// NOTE(review): scan is presumably the carry-in from the preceding threads of the work-group - confirm against t_LocalSegScan_CalcDist()

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // the carry-in only applies up to the first head flag of this thread (here the flag of item 0 counts as well)
			scan = REDUCTION_IDENTITY;
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all: combine the carry-in with the per-thread scan

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	// could load p_tail_counts_scan to shared memory, now it is accessed poorly, will suffer horrible performance if there are many reductions
	//		not true. even if the average segment length is 1, this only fixes the first segment in each tile which is a relatively small cost!

	// need to figure out how to sample the head flags or how to send it from the bootstrap kernel (maybe better)
	// note that the head flags passed to this kernel do not coincide with the sampled flags at all

	// todo - use tail counts rather than scan of it, will halve the bandwidth for very little extra memory!
	//		  not exactly true, will still need to read p_tail_counts_scan[n_idx + 1] to know which reduction to fix
	//		- it is unlikely that the number of tiles of reductions would surpass 4 GB on current hardware (then, assuming tile size of 1024, there would be 4 TB of reduced data which doesn't even fit main memory of most systems nowadays)
	//		  so it would be possible to encode tail count as the high bit in tail counts scan (would require a custom intscan kernel)
	//		- or just read it into shared memory for all threads to easily access (eats a lot of shared memory)
	//		- or on NVIDIA platforms read directly to registers and shuffle up

		// the reduction head flags are always handled as packed here: only bit 0 of the flag word is tested
		// (an older, disabled variant of this loop compared the whole word to zero instead)
		#pragma unroll
		for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
			unsigned int n_idx = i + l * SCAN_ITEMS_THREAD; // index of the reduction tile, relative to the start of this work-group
			const unsigned int n_tile_num = n_remainder; // to this kernel, size is tile num

			if(n_idx + 2 <= n_tile_num) { // there is a next tile (n_idx + 1) to fix up
				unsigned int n_tc_1 = p_tail_counts_scan[n_idx + 1]; // index of the reduction to fix up
				if((n_idx + 2 == n_tile_num || n_tc_1 < p_tail_counts_scan[n_idx + 2]) && // the tile has tails in it or it is the last one (implicit tail)
				   (p_reduction_head_flags[(n_idx + 1) * n_reduction_tile_size] & 1) == 0) { // and there is no head flag on the first element
					p_reductions[n_tc_1] = t_ReductionReduceOp(p_reductions[n_tc_1], p_my_scan[i]);
					// fixup this tail index
				}
			}
		}
}

#endif // USE_STRIDED_PACKED_HEAD_FLAGS

/**
 *	@brief spine adjustment kernel of the segmented reduction with carry-in from
 *		the preceding work-groups; the head flags are read using n_Read_Packed_HeadFlags()
 *
 *	Each work-group handles SCAN_TILE_SIZE consecutive elements of p_data, starting
 *	at get_group_id(0) * SCAN_TILE_SIZE, with SCAN_ITEMS_THREAD elements per thread.
 *	The first thread of every work-group except for the first one seeds its scan
 *	with p_tile_carry_in[get_group_id(0) - 1]; all the other threads start with
 *	REDUCTION_IDENTITY. The kernel then calculates an inclusive segmented scan
 *	(restarted at the head flags, including the flag of the first item of each
 *	thread) in three steps: a scan inside each thread, a work-group wide
 *	t_LocalSegScan_CalcDist() call over the per-thread totals and a final pass which
 *	combines the value returned by that call with the per-thread scan.
 *
 *	The scan value of element n_idx is then folded into
 *	p_reductions[p_tail_counts_scan[n_idx + 1]], provided that:
 *		- n_idx + 2 <= n_remainder (the elements left from the start of this work-group),
 *		- either n_idx + 2 == n_remainder, or p_tail_counts_scan[n_idx + 1] is smaller
 *		  than p_tail_counts_scan[n_idx + 2] and
 *		- bit 0 of p_reduction_head_flags[(n_idx + 1) * n_reduction_tile_size] is zero.
 *
 *	@param[in,out] p_reductions is array of reductions to be fixed up in place
 *		(indexed by the values read from p_tail_counts_scan)
 *	@param[in] p_tail_counts_scan is scan of tail counts, on the same granularity
 *		as n_size (per reduction tile)
 *	@param[in] p_reduction_head_flags is array of head flags on finer granularity than
 *		n_size (per reduction element, tiled with n_reduction_tile_size); only bit 0
 *		of the first word of each tile is tested here
 *	@param[in] n_reduction_tile_size is stride between the tested entries of p_reduction_head_flags
 *	@param[in] p_data is array of values to be scanned
 *	@param[in] p_tile_carry_in is array of carry-in values, one per work-group
 *		(work-group g reads the entry g - 1)
 *	@param[in] p_packed_head_flags is array of packed head flags for p_data
 *		(this work-group starts reading at word n_start >> LOG_WARP_SIZE)
 *	@param[in] n_size is number of elements of p_data (to this kernel, that is
 *		the number of reduction tiles)
 */
__kernel REQUIRE_WG_SIZE(SCAN_LOCAL_WORK_SIZE) void TileSegReduce_SpineAdjust_Downsweep_Packed(
	__global _TyScalar *p_reductions, __global const uint32_t *p_tail_counts_scan, // p_tail_counts_scan on the same granularity as n_size (per reduction tile)
	__global const uint32_t *p_reduction_head_flags, const unsigned int n_reduction_tile_size, // p_reduction_head_flags on finer granularity than n_size (per reduction element, tiled with n_reduction_tile_size)
	__global const _TyScalar *p_data, __global const _TyScalar *p_tile_carry_in,
	__global const uintwarp_t *p_packed_head_flags, const unsigned int n_size)
{
	STATIC_ASSERT(SCAN_ITEMS_THREAD <= 32, SEGMENTED_SCAN_ONLY_WORKS_WITH_UP_TO_31_ELEMENTS_PER_THREAD); // the head flags of one thread must fit in a single 32-bit variable
	// if >32, would have to use 64-bit type for n_my_head_flags and p_tile_head_flags (but would likely be killed by the lack of local memory before reaching 32 so no point in handling that)
	// NOTE(review): the condition admits 32 items per thread while the message says 31 - confirm which one is intended

	const unsigned int l = get_local_id(0), g = get_group_id(0);

	const unsigned int n_start = g * SCAN_TILE_SIZE; // the first element handled by this work-group
	const unsigned int n_remainder = max(n_size, n_start) - n_start; // number of elements from n_start to the end of the data; max() avoids unsigned underflow if n_start is greater than n_size (which would mean too many work-groups were launched)
	// note that this is not clamped to SCAN_TILE_SIZE, so it can be greater than the tile size in all but the last work-group

	p_data += n_start;
	p_packed_head_flags += n_start >> LOG_WARP_SIZE;
	// p_reductions is not offset, it is indexed by the values of p_tail_counts_scan
	p_tail_counts_scan += n_start;
	p_reduction_head_flags += n_start * n_reduction_tile_size;
	// make all the per-element pointers relative to the first element of this work-group

	__local union {
		THeadFlag_ReadStorage headflag_temp; // scratch for reading the head flags
		_TyScalar p_shared_data[ORDERED_LOAD_TEMP_SIZE(CFI_FLAG, SCAN_ITEMS_THREAD, SCAN_TILE_SIZE)]; // scratch for loading the data
		TLocalSegScanStorage seg_scan_temp; // scratch for the work-group segmented scan
	} temp;
	// the three scratch areas overlap (union), the barriers below separate their uses

	uint32_t n_my_head_flags = n_Read_Packed_HeadFlags(p_packed_head_flags,
		n_remainder, l, l >> LOG_WARP_SIZE, l & (WARP_SIZE - 1), &temp.headflag_temp);
	// collect head flags; bit i is tested below as the head flag of the i-th item of this thread

	barrier(CLK_LOCAL_MEM_FENCE); // presumably so that all the threads are done with temp.headflag_temp before temp.p_shared_data is written - confirm against n_Read_Packed_HeadFlags()

	_TyScalar p_my_data[SCAN_ITEMS_THREAD]; // all accesses unrolled, can reside in regs
	GLOBAL_TO_REGISTER_ORDERED_CFI_COND(CFI_FLAG, p_my_data, l, SCAN_LOCAL_WORK_SIZE,
		SCAN_ITEMS_THREAD, p_data, n_remainder, REDUCTION_IDENTITY, temp.p_shared_data);
	// cooperatively read all the values
	// NOTE(review): REDUCTION_IDENTITY is presumably the value substituted for the elements past n_remainder - confirm against the macro definition

	_TyScalar partial = REDUCTION_IDENTITY;
	if((!l) & (g != 0))
		partial = p_tile_carry_in[g - 1]; // convert carry ins to exclusive scan, sort of (we only care about the elems which are nonzero and those are correct, the zero ones will be zeroed out below anyway)
	// the first thread of each workgroup reads partial, except for the first workgroup

	_TyScalar p_my_scan[SCAN_ITEMS_THREAD]; // todo - see if doing this inplace reduces the number of registers

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // a head flag on the i-th item starts a new segment (and discards the carry-in if it is on item 0)
			partial = REDUCTION_IDENTITY;
        partial = t_ReductionReduceOp(partial, t_ReductionElemOp(p_my_data[i]));
		p_my_scan[i] = partial;
	}
	// perform local (per-thread) inclusive segmented scan; partial ends up being the total of the last segment of this thread

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all threads to finish loading the data through temp.p_shared_data before reusing the local storage as temp.seg_scan_temp

	_TyScalar carry_out; // written by t_LocalSegScan_CalcDist() but not read anywhere in this kernel
	_TyScalar scan = t_LocalSegScan_CalcDist(l, n_my_head_flags != 0, partial, &temp.seg_scan_temp, &carry_out);
	// spine scan over the per-thread totals; the second argument tells whether this thread has any head flag
	// NOTE(review): scan is presumably the carry-in from the preceding threads of the work-group - confirm against t_LocalSegScan_CalcDist()

    #pragma unroll
	for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
		if(n_my_head_flags & (1 << i)) // the carry-in only applies up to the first head flag of this thread
			scan = REDUCTION_IDENTITY;
		p_my_scan[i] = t_ReductionReduceOp(scan, p_my_scan[i]);
	}
	// reduce all: combine the carry-in with the per-thread scan

	barrier(CLK_LOCAL_MEM_FENCE); // need to wait for all the threads to stop using temp.seg_scan_temp

	// could load p_tail_counts_scan to shared memory, now it is accessed poorly, will suffer horrible performance if there are many reductions
	//		not true. even if the average segment length is 1, this only fixes the first segment in each tile which is a relatively small cost!

	// need to figure out how to sample the head flags or how to send it from the bootstrap kernel (maybe better)
	// note that the head flags passed to this kernel do not coincide with the sampled flags at all

	// todo - use tail counts rather than scan of it, will halve the bandwidth for very little extra memory!
	//		  not exactly true, will still need to read p_tail_counts_scan[n_idx + 1] to know which reduction to fix
	//		- it is unlikely that the number of tiles of reductions would surpass 4 GB on current hardware (then, assuming tile size of 1024, there would be 4 TB of reduced data which doesn't even fit main memory of most systems nowadays)
	//		  so it would be possible to encode tail count as the high bit in tail counts scan (would require a custom intscan kernel)
	//		- or just read it into shared memory for all threads to easily access (eats a lot of shared memory)
	//		- or on NVIDIA platforms read directly to registers and shuffle up

		// the reduction head flags are always handled as packed here: only bit 0 of the flag word is tested
		// (an older, disabled variant of this loop compared the whole word to zero instead)
		#pragma unroll
		for(int i = 0; i < SCAN_ITEMS_THREAD; ++ i) {
			unsigned int n_idx = i + l * SCAN_ITEMS_THREAD; // index of the reduction tile, relative to the start of this work-group
			const unsigned int n_tile_num = n_remainder; // to this kernel, size is tile num

			if(n_idx + 2 <= n_tile_num) { // there is a next tile (n_idx + 1) to fix up
				unsigned int n_tc_1 = p_tail_counts_scan[n_idx + 1]; // index of the reduction to fix up
				if((n_idx + 2 == n_tile_num || n_tc_1 < p_tail_counts_scan[n_idx + 2]) && // the tile has tails in it or it is the last one (implicit tail)
				   (p_reduction_head_flags[(n_idx + 1) * n_reduction_tile_size] & 1) == 0) { // and there is no head flag on the first element
					p_reductions[n_tc_1] = t_ReductionReduceOp(p_reductions[n_tc_1], p_my_scan[i]);
					// fixup this tail index
				}
			}
		}
}

#endif // BUILD_SEG_REDUCE_SPINE_ADJUST_KERNELS

/*
 *	end-of-file
 */
