#ifndef __BLOCKED_LOAD_STORE_MACROS_INCLUDED
#define __BLOCKED_LOAD_STORE_MACROS_INCLUDED

/**
 *	@file gpgpu/kernel_utils/LoadStore.h
 *	@brief load / store primitives for tiled computing
 *	@date 2016
 *	@author -tHE SWINe-
 *
 *	This adds load and store templates for tiled kernels (each workgroup
 *	processes a single tile of data) where each thread may process one or
 *	more elements.
 *
 *	Assume there is the following array in global memory:
 *	@code
 *	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d]
 *	@endcode
 *
 *	Also assume that each thread wants to load 4 values.
 *
 *	There are the following primitives:
 *
 *	\ref GLOBAL_TO_REGISTER
 *	@code
 *	[0, 4, 8, c] // thread 0
 *	[1, 5, 9, d] // thread 1
 *	[2, 6, a, X] // thread 2
 *	[3, 7, b, X] // thread 3
 *	@endcode
 *
 *	Where <tt>X</tt> is a specified value (via the <tt>t_padding_value</tt>
 *	argument). \ref REGISTER_TO_GLOBAL is the reverse function.
 *
 *	\ref GLOBAL_TO_LOCAL
 *	@code
 *	[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, a, b, c, d, X, X] // in local memory
 *	(^  ^  ^  ^ thread 0)   (^  ^  ^  ^ thread 2)
 *	            (^  ^  ^  ^ thread 1)   (^  ^  ^  ^ thread 3)
 *	@endcode
 *	This copies global memory to local memory without changing the order.
 *	This can cause bank conflicts if the number of values per thread is even
 *	(e.g. if the number of local memory banks is 8, then there are two-way
 *	conflicts in the above example). \ref LOCAL_TO_GLOBAL is the reverse function.
 *
 *	\ref GLOBAL_TO_LOCAL_CFI
 *	@code
 *	[0, 1, 2, 3, 4, 5, 6, 7, P, 8, 9, a, b, c, d, X, X] // in local memory, note padding P
 *	(^  ^  ^  ^ thread 0)      (^  ^  ^  ^ thread 2)
 *	            (^  ^  ^  ^ thread 1)      (^  ^  ^  ^ thread 3)
 *                          (^ unused element for conflict-free indexing (assuming 8 local memory banks))
 *	@endcode
 *	The same as \ref GLOBAL_TO_LOCAL but inserts some unused elements in the
 *	local array, in order to avoid the bank conflicts. \ref LOCAL_CFI_TO_GLOBAL
 *	is the reverse function.
 *
 *	Note that in the above examples, the size of the data in global memory is not
 *	an exact multiple of the data in local memory or in registers. These functions
 *	shield the user from having to deal with the array bounds.
 *
 *	The below functions which work with local memory always assume the size of data
 *	to be exact multiple of the number of threads times the number of values per thread.
 *
 *	\ref UNINTERLEAVE_LOCAL_TO_REGISTER gets data in local memory (e.g. loaded
 *	using \ref GLOBAL_TO_LOCAL) and exchanges it between the threads so that each
 *	thread gets consecutive elements in registers.
 *	@code
 *	[0, 1, 2, 3] // thread 0
 *	[4, 5, 6, 7] // thread 1
 *	[8, 9, a, b] // thread 2
 *	[c, d, X, X] // thread 3
 *	@endcode
 *	\ref UNINTERLEAVE_LOCAL_CFI_TO_REGISTER works with \ref GLOBAL_TO_LOCAL_CFI.
 *	\ref INTERLEAVE_REGISTER_TO_LOCAL and \ref INTERLEAVE_REGISTER_TO_LOCAL_CFI
 *	are the respective reverse functions.
 *
 *	There are two basic usage patterns. If one does not care about the order
 *	of the inputs (e.g. in histogram calculation or in reduction), one uses
 *	\ref GLOBAL_TO_REGISTER (or \ref REGISTER_TO_GLOBAL for writing the results).
 *	If the order matters, one first loads the data to local memory using one of
 *	\ref GLOBAL_TO_LOCAL or \ref GLOBAL_TO_LOCAL_CFI and then exchanges the values
 *	using \ref UNINTERLEAVE_LOCAL_TO_REGISTER or \ref UNINTERLEAVE_LOCAL_CFI_TO_REGISTER.
 *	The sequence for writing is reverse, i.e. \ref INTERLEAVE_REGISTER_TO_LOCAL or
 *	\ref INTERLEAVE_REGISTER_TO_LOCAL_CFI followed by \ref LOCAL_TO_GLOBAL or
 *	\ref LOCAL_CFI_TO_GLOBAL, respectively.
 *
 *	Also note that the functions involving local memory require synchronization
 *	which is not part of those functions, for performance reasons.
 */

/**
 *	@def GLOBAL_TO_REGISTER
 *
 *	@brief loads one tile of global memory into a per-thread register array, in strided manner
 *
 *	Element <tt>i</tt> of <tt>p_reg</tt> receives <tt>p_data[n_thread_num * i + n_tid]</tt>,
 *	so threads with consecutive ids read consecutive addresses (coalesced reads). The
 *	price is that each thread holds values which are <tt>n_thread_num</tt> elements apart
 *	in the original array rather than contiguous values.
 *
 *	If the buffer holds fewer than <tt>n_thread_num * n_value_per_thread_num</tt> values,
 *	every read is bounds-checked and the elements which lie past the end of the buffer
 *	are set to <tt>t_padding_value</tt>.
 *
 *	@param[out] p_reg is pointer to register array (n_value_per_thread_num elements)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The names <tt>i</tt> and <tt>n_src</tt> are declared inside
 *		and must not appear in the arguments.
 */
#define GLOBAL_TO_REGISTER(p_reg,n_tid,n_thread_num,n_value_per_thread_num,p_data,n_data_num,t_padding_value) \
	do { \
		if((n_data_num) < (n_thread_num) * (n_value_per_thread_num)) { /* partial tile: every read is bounds-checked */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_src = (n_thread_num) * i + (n_tid); /* computed ahead of the branch so that the branch stays short */ \
				(p_reg)[i] = (t_padding_value); /* unconditional default, overwritten below if the source is valid */ \
				if(n_src < (n_data_num)) \
					(p_reg)[i] = (p_data)[n_src]; \
			} \
		} else { /* full tile: all the reads are in bounds */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) \
				(p_reg)[i] = (p_data)[(n_thread_num) * i + (n_tid)]; \
		} \
	} while(0)

/**
 *	@def GLOBAL_TO_REGISTER_OVERLAP
 *
 *	@brief copies a block of global memory to registers, in strided manner, with overlap
 *
 *	Element <tt>i</tt> of <tt>p_reg</tt> receives <tt>p_data[n_thread_num * i + n_tid]</tt>
 *	for <tt>i</tt> in <tt>[0, n_value_per_thread_num + n_overlap_value_num)</tt>, so
 *	threads with consecutive ids read consecutive addresses (coalesced reads). The
 *	disadvantage is that each thread has values far apart in the original array rather
 *	than contiguous values.
 *
 *	The unchecked fast path is only taken if the buffer holds all of the
 *	<tt>n_thread_num * (n_value_per_thread_num + n_overlap_value_num)</tt> values that
 *	the threads may touch; otherwise every read is bounds-checked and the elements past
 *	the end of the buffer are set to <tt>t_padding_value</tt>.
 *
 *	@param[out] p_reg is pointer to register array (n_value_per_thread_num + n_overlap_value_num elements)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_overlap_value_num is number of extra values per thread which are read past the regular ones
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (but only up to
 *		n_thread_num * (n_value_per_thread_num + n_overlap_value_num) are actually copied)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 *
 *	@note The overlapping values are re-read from the global memory, possibly through
 *		cache, rather than being exchanged through local memory or shuffles. This is
 *		only efficient up to a small number of overlapping values.
 *	@note NOTE(review): with this strided indexing the extra values of a thread are
 *		the ones at <tt>n_thread_num * (n_value_per_thread_num + j) + n_tid</tt>, i.e.
 *		they lie past the regular tile rather than at the start of the next thread's
 *		values - confirm that this is the intended meaning of "overlap".
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The names <tt>i</tt> and <tt>n_src</tt> are declared inside
 *		and must not appear in the arguments.
 */
#define GLOBAL_TO_REGISTER_OVERLAP(p_reg,n_tid,n_thread_num,n_value_per_thread_num,n_overlap_value_num,p_data,n_data_num,t_padding_value) \
	do { \
		/* the largest index read below is n_thread_num * (n_value_per_thread_num + n_overlap_value_num) - 1 */ \
		if((n_data_num) >= (n_thread_num) * ((n_value_per_thread_num) + (n_overlap_value_num))) { \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num) + (n_overlap_value_num); ++ i) \
				(p_reg)[i] = (p_data)[(n_thread_num) * i + (n_tid)]; \
		} else { \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num) + (n_overlap_value_num); ++ i) { \
				unsigned int n_src = (n_thread_num) * i + (n_tid); /* keep this out to make the shortest branch possible */ \
				(p_reg)[i] = (t_padding_value); /* keep this out to make the shortest branch possible */ \
				if(n_src < (n_data_num)) \
					(p_reg)[i] = (p_data)[n_src]; \
			} \
		} \
	} while(0)

/**
 *	@def REGISTER_TO_GLOBAL
 *
 *	@brief stores a per-thread register array to global memory, in strided manner (reverse of \ref GLOBAL_TO_REGISTER)
 *
 *	Element <tt>i</tt> of <tt>p_reg</tt> is written to
 *	<tt>p_dest[n_thread_num * i + n_tid]</tt>. If the destination is shorter than
 *	<tt>n_thread_num * n_value_per_thread_num</tt>, every store is bounds-checked and
 *	the elements which would land at or past <tt>n_dest_len</tt> are dropped.
 *
 *	@param[out] p_dest is pointer to output values
 *	@param[in] n_dest_len is number of output values to be written (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_reg is pointer to register array
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The names <tt>i</tt> and <tt>n_dest</tt> are declared inside
 *		and must not appear in the arguments.
 */
#define REGISTER_TO_GLOBAL(p_dest,n_dest_len,n_tid,n_thread_num,n_value_per_thread_num,p_reg) \
	do { \
		if((n_dest_len) < (n_thread_num) * (n_value_per_thread_num)) { /* partial tile: every store is bounds-checked */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_dest = (n_thread_num) * i + (n_tid); /* computed ahead of the branch so that the branch stays short */ \
				if(n_dest < (n_dest_len)) \
					(p_dest)[n_dest] = (p_reg)[i]; \
			} \
		} else { /* full tile: all the stores are in bounds */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) \
				(p_dest)[(n_thread_num) * i + (n_tid)] = (p_reg)[i]; \
		} \
	} while(0)

/**
 *	@def GLOBAL_TO_LOCAL
 *
 *	@brief copies one tile of global memory to local memory, in strided manner (without changing the element order)
 *
 *	Each thread copies the elements at indices <tt>n_thread_num * i + n_tid</tt> from
 *	<tt>p_data</tt> to the same indices of <tt>p_local</tt>, so threads with consecutive
 *	ids read consecutive addresses (coalesced reads) and the order of the values in
 *	local memory is unchanged. The disadvantage is that even numbers of values per
 *	thread cause multiple bank conflicts.
 *
 *	If the buffer holds fewer than <tt>n_thread_num * n_value_per_thread_num</tt> values,
 *	every read is bounds-checked and the rest of the tile is filled with
 *	<tt>t_padding_value</tt>.
 *
 *	@param[out] p_local is pointer to local memory (must be allocated to n_thread_num * n_value_per_thread_num)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The names <tt>i</tt> and <tt>n_idx</tt> are declared inside
 *		and must not appear in the arguments. No synchronization is performed here.
 */
#define GLOBAL_TO_LOCAL(p_local,n_tid,n_thread_num,n_value_per_thread_num,p_data,n_data_num,t_padding_value) \
	do { \
		if((n_data_num) < (n_thread_num) * (n_value_per_thread_num)) { /* partial tile: every read is bounds-checked */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* computed ahead of the branch so that the branch stays short */ \
				(p_local)[n_idx] = (t_padding_value); /* unconditional default, overwritten below if the source is valid */ \
				if(n_idx < (n_data_num)) \
					(p_local)[n_idx] = (p_data)[n_idx]; \
			} \
		} else { /* full tile: all the reads are in bounds */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* a separate statement, to make sure the loop unrolls */ \
				(p_local)[n_idx] = (p_data)[n_idx]; \
			} \
		} \
	} while(0)

/**
 *	@def GLOBAL_TO_LOCAL_N_MORE
 *
 *	@brief copies a block of global memory plus a few extra trailing values to local memory,
 *		in strided manner (without changing the element order)
 *
 *	Copies global memory to local memory, employing coalesced reads. The regular
 *	<tt>n_thread_num * n_value_per_thread_num</tt> values are copied the same way as in
 *	\ref GLOBAL_TO_LOCAL; after that, <tt>n_extra_value_num</tt> more values which
 *	immediately follow the tile are copied (thread <tt>n_tid</tt> handles the extra values
 *	<tt>n_tid</tt>, <tt>n_tid + n_thread_num</tt>, ...). The advantage is having the values
 *	in unchanged order in local memory, the disadvantage is that even numbers of values
 *	per thread cause multiple bank conflicts.
 *
 *	Any element whose index is at or past <tt>n_data_num</tt> is set to
 *	<tt>t_padding_value</tt> in local memory.
 *
 *	@param[out] p_local is pointer to local memory (must be allocated to at least
 *		n_thread_num * n_value_per_thread_num + n_extra_value_num)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_extra_value_num is number of extra values to load at the end
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (but only up to
 *		n_thread_num * n_value_per_thread_num + n_extra_value_num are actually copied)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The names <tt>i</tt> and <tt>n_idx</tt> are declared inside
 *		and must not appear in the arguments. No synchronization is performed here.
 */
#define GLOBAL_TO_LOCAL_N_MORE(p_local,n_tid,n_thread_num,n_value_per_thread_num,n_extra_value_num,p_data,n_data_num,t_padding_value) \
	do { \
		if((n_data_num) >= (n_thread_num) * (n_value_per_thread_num) + (n_extra_value_num)) { /* all values fit */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* keep this out of the for() statement to make sure it unrolls */ \
				(p_local)[n_idx] = (p_data)[n_idx]; \
			} \
			if((n_extra_value_num) == 0) { \
			} else if((n_extra_value_num) == 1) { /* a single extra value is loaded by thread 0 alone */ \
				unsigned int n_idx = (n_thread_num) * (n_value_per_thread_num); \
				if(!(n_tid)) /* the argument is parenthesized so that expressions such as a * b negate as a whole */ \
					(p_local)[n_idx] = (p_data)[n_idx]; \
			} else { \
				for(unsigned int i = (n_tid); i < (n_extra_value_num); i += (n_thread_num)) { \
					unsigned int n_idx = (n_thread_num) * (n_value_per_thread_num) + i; \
					(p_local)[n_idx] = (p_data)[n_idx]; \
				} \
			} \
			break; /* all of the values loaded; this leaves the enclosing do { } while(0) */ \
		} else if((n_data_num) >= (n_thread_num) * (n_value_per_thread_num)) { /* all "regular" values fit, some of the extra values do not */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* keep this out of the for() statement to make sure it unrolls */ \
				(p_local)[n_idx] = (p_data)[n_idx]; \
			} \
		} else { /* not all "regular" values fit */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* keep this out to make the shortest branch possible */ \
				(p_local)[n_idx] = (t_padding_value); /* keep this out to make the shortest branch possible */ \
				if(n_idx < (n_data_num)) \
					(p_local)[n_idx] = (p_data)[n_idx]; \
			} \
		} \
		/* now handle the extra values, some of which do not fit */ \
		if((n_extra_value_num) == 0) { \
		} else if((n_extra_value_num) == 1) { \
			unsigned int n_idx = (n_thread_num) * (n_value_per_thread_num); \
			if(!(n_tid)) \
				(p_local)[n_idx] = (t_padding_value); \
			/* the single extra value can not be in bounds here, the first branch would have been taken */ \
		} else { \
			for(unsigned int i = (n_tid); i < (n_extra_value_num); i += (n_thread_num)) { \
				unsigned int n_idx = (n_thread_num) * (n_value_per_thread_num) + i; \
				(p_local)[n_idx] = (t_padding_value); /* shorter branch */ \
				if(n_idx < (n_data_num)) \
					(p_local)[n_idx] = (p_data)[n_idx]; \
			} \
		} \
	} while(0)

/**
 *	@def GLOBAL_TO_LOCAL_CFI
 *
 *	@brief copies one tile of global memory to local memory, in strided manner and using conflict-free indexing
 *
 *	Each thread reads the elements at indices <tt>n_thread_num * i + n_tid</tt> from
 *	<tt>p_data</tt> (coalesced reads) and stores each of them at the position given by
 *	<tt>CONFLICT_FREE_INDEX()</tt> of the same index in <tt>p_local</tt>. The advantage is
 *	having the values in unchanged order in local memory, the disadvantage is that the
 *	conflict-free indexing requires slightly more local memory.
 *
 *	If the buffer holds fewer than <tt>n_thread_num * n_value_per_thread_num</tt> values,
 *	every read is bounds-checked and the rest of the tile is filled with
 *	<tt>t_padding_value</tt>.
 *
 *	@param[out] p_local is pointer to local memory (must be allocated to
 *		<tt>n_thread_num * n_value_per_thread_num + CONFLICT_FREE_OFFSET(n_thread_num * n_value_per_thread_num - 1)</tt>)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The names <tt>i</tt> and <tt>n_idx</tt> are declared inside
 *		and must not appear in the arguments. No synchronization is performed here.
 */
#define GLOBAL_TO_LOCAL_CFI(p_local,n_tid,n_thread_num,n_value_per_thread_num,p_data,n_data_num,t_padding_value) \
	do { \
		if((n_data_num) < (n_thread_num) * (n_value_per_thread_num)) { /* partial tile: every read is bounds-checked */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* computed ahead of the branch so that the branch stays short */ \
				(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (t_padding_value); /* unconditional default, overwritten below if the source is valid */ \
				if(n_idx < (n_data_num)) \
					(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (p_data)[n_idx]; \
			} \
		} else { /* full tile: all the reads are in bounds */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* a separate statement, to make sure the loop unrolls */ \
				(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (p_data)[n_idx]; \
			} \
		} \
	} while(0)

/**
 *	@def GLOBAL_TO_LOCAL_N_MORE_CFI
 *
 *	@brief copies a block of global memory plus a few extra trailing values to local memory,
 *		in strided manner and using conflict-free indexing
 *
 *	Copies global memory to local memory, employing coalesced reads. The regular
 *	<tt>n_thread_num * n_value_per_thread_num</tt> values are copied the same way as in
 *	\ref GLOBAL_TO_LOCAL_CFI; after that, <tt>n_extra_value_num</tt> more values which
 *	immediately follow the tile are copied (thread <tt>n_tid</tt> handles the extra values
 *	<tt>n_tid</tt>, <tt>n_tid + n_thread_num</tt>, ...). Every store goes to the position
 *	given by <tt>CONFLICT_FREE_INDEX()</tt> of the source index. The advantage is having
 *	the values in unchanged order in local memory, the disadvantage is that the
 *	conflict-free indexing requires slightly more local memory.
 *
 *	Any element whose index is at or past <tt>n_data_num</tt> is set to
 *	<tt>t_padding_value</tt> in local memory.
 *
 *	@param[out] p_local is pointer to local memory; the largest index written is
 *		<tt>CONFLICT_FREE_INDEX(n_thread_num * n_value_per_thread_num + n_extra_value_num - 1)</tt>
 *		(NOTE(review): presumably this means an allocation of
 *		<tt>n_thread_num * n_value_per_thread_num + n_extra_value_num +
 *		CONFLICT_FREE_OFFSET(n_thread_num * n_value_per_thread_num + n_extra_value_num - 1)</tt>
 *		elements - confirm against the definition of CONFLICT_FREE_INDEX)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_extra_value_num is number of extra values to load at the end
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (but only up to
 *		n_thread_num * n_value_per_thread_num + n_extra_value_num are actually copied)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The names <tt>i</tt> and <tt>n_idx</tt> are declared inside
 *		and must not appear in the arguments. No synchronization is performed here.
 */
#define GLOBAL_TO_LOCAL_N_MORE_CFI(p_local,n_tid,n_thread_num,n_value_per_thread_num,n_extra_value_num,p_data,n_data_num,t_padding_value) \
	do { \
		if((n_data_num) >= (n_thread_num) * (n_value_per_thread_num) + (n_extra_value_num)) { /* all values fit */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* keep this out of the for() statement to make sure it unrolls */ \
				(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (p_data)[n_idx]; \
			} \
			if((n_extra_value_num) == 0) { \
			} else if((n_extra_value_num) == 1) { /* a single extra value is loaded by thread 0 alone */ \
				unsigned int n_idx = (n_thread_num) * (n_value_per_thread_num); \
				if(!(n_tid)) /* the argument is parenthesized so that expressions such as a * b negate as a whole */ \
					(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (p_data)[n_idx]; \
			} else { \
				for(unsigned int i = (n_tid); i < (n_extra_value_num); i += (n_thread_num)) { \
					unsigned int n_idx = (n_thread_num) * (n_value_per_thread_num) + i; \
					(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (p_data)[n_idx]; \
				} \
			} \
			break; /* all of the values loaded; this leaves the enclosing do { } while(0) */ \
		} else if((n_data_num) >= (n_thread_num) * (n_value_per_thread_num)) { /* all "regular" values fit, some of the extra values do not */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* keep this out of the for() statement to make sure it unrolls */ \
				(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (p_data)[n_idx]; \
			} \
		} else { /* not all "regular" values fit */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* keep this out to make the shortest branch possible */ \
				(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (t_padding_value); /* keep this out to make the shortest branch possible */ \
				if(n_idx < (n_data_num)) \
					(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (p_data)[n_idx]; \
			} \
		} \
		/* now handle the extra values, some of which do not fit */ \
		if((n_extra_value_num) == 0) { \
		} else if((n_extra_value_num) == 1) { \
			unsigned int n_idx = (n_thread_num) * (n_value_per_thread_num); \
			if(!(n_tid)) \
				(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (t_padding_value); \
			/* the single extra value can not be in bounds here, the first branch would have been taken */ \
		} else { \
			for(unsigned int i = (n_tid); i < (n_extra_value_num); i += (n_thread_num)) { \
				unsigned int n_idx = (n_thread_num) * (n_value_per_thread_num) + i; \
				(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (t_padding_value); /* shorter branch */ \
				if(n_idx < (n_data_num)) \
					(p_local)[CONFLICT_FREE_INDEX(n_idx)] = (p_data)[n_idx]; \
			} \
		} \
	} while(0)

/**
 *	@def LOCAL_TO_GLOBAL
 *
 *	@brief copies one tile of local memory to global memory, in strided manner (without changing the element order)
 *
 *	Each thread copies the elements at indices <tt>n_thread_num * i + n_tid</tt> from
 *	<tt>p_local</tt> to the same indices of <tt>p_dest</tt>. If the destination is
 *	shorter than <tt>n_thread_num * n_value_per_thread_num</tt>, every store is
 *	bounds-checked and the elements at or past <tt>n_dest_len</tt> are dropped.
 *
 *	@param[out] p_dest is pointer to output values
 *	@param[in] n_dest_len is number of output values to be written (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_local is pointer to local memory (must be allocated to n_thread_num * n_value_per_thread_num)
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The names <tt>i</tt> and <tt>n_idx</tt> are declared inside
 *		and must not appear in the arguments. No synchronization is performed here.
 */
#define LOCAL_TO_GLOBAL(p_dest,n_dest_len,n_tid,n_thread_num,n_value_per_thread_num,p_local) \
	do { \
		if((n_dest_len) < (n_thread_num) * (n_value_per_thread_num)) { /* partial tile: every store is bounds-checked */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* computed ahead of the branch so that the branch stays short */ \
				if(n_idx < (n_dest_len)) \
					(p_dest)[n_idx] = (p_local)[n_idx]; \
			} \
		} else { /* full tile: all the stores are in bounds */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* a separate statement, to make sure the loop unrolls */ \
				(p_dest)[n_idx] = (p_local)[n_idx]; \
			} \
		} \
	} while(0)

/**
 *	@def LOCAL_CFI_TO_GLOBAL
 *
 *	@brief copies one tile of local memory to global memory, in strided manner and using conflict-free indexing
 *
 *	Each thread writes the elements at indices <tt>n_thread_num * i + n_tid</tt> of
 *	<tt>p_dest</tt>, reading each of them from the position given by
 *	<tt>CONFLICT_FREE_INDEX()</tt> of the same index in <tt>p_local</tt>. If the
 *	destination is shorter than <tt>n_thread_num * n_value_per_thread_num</tt>, every
 *	store is bounds-checked and the elements at or past <tt>n_dest_len</tt> are dropped.
 *
 *	@param[out] p_dest is pointer to output values
 *	@param[in] n_dest_len is number of output values to be written (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_local is pointer to local memory (must be allocated to
 *		<tt>n_thread_num * n_value_per_thread_num + CONFLICT_FREE_OFFSET(n_thread_num * n_value_per_thread_num - 1)</tt>)
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The names <tt>i</tt> and <tt>n_idx</tt> are declared inside
 *		and must not appear in the arguments. No synchronization is performed here.
 */
#define LOCAL_CFI_TO_GLOBAL(p_dest,n_dest_len,n_tid,n_thread_num,n_value_per_thread_num,p_local) \
	do { \
		if((n_dest_len) < (n_thread_num) * (n_value_per_thread_num)) { /* partial tile: every store is bounds-checked */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* computed ahead of the branch so that the branch stays short */ \
				if(n_idx < (n_dest_len)) \
					(p_dest)[n_idx] = (p_local)[CONFLICT_FREE_INDEX(n_idx)]; \
			} \
		} else { /* full tile: all the stores are in bounds */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int n_idx = (n_thread_num) * i + (n_tid); /* a separate statement, to make sure the loop unrolls */ \
				(p_dest)[n_idx] = (p_local)[CONFLICT_FREE_INDEX(n_idx)]; \
			} \
		} \
	} while(0)

// copies a block from local memory to registers, so that the threads have the values in the original order
// calling GLOBAL_TO_LOCAL() to load data to local memory leaves us with
//	0 1 2 3 4 5 6 7 8 9 ... n
// we want to get
//  0 1 2 3 for the first thread ~ reg[i] = shared[i + n_tid * n_value_per_thread_num]
//  4 5 6 7 for the second thread
// which means bank conflicts unless doing some wrapping (the first LOCAL_MEMORY_BANK_NUM / n_value_per_thread_num threads copy with offset 0, the next with offset 1 and so on; this requires integer division though, not nice)
// trying differently ... copying first GLOBAL_TO_REGISTER yields (for 32 threads)
//  0 32 64 96 128 for the first thread
//  1 33 65 97 129 for the second thread
// putting these to local memory yields via local[n_thread_num * i + n_tid] = reg[i]
//  0 1 2 3 .... n, not what we want
#if 0
// this works on paper but causes the register accesses to be unpredictable so the array which was supposed to be in registers spills to cache

// disabled variant (sits in the #if 0 branch): copies n_value_per_thread_num consecutive
// elements, starting at n_tid * n_value_per_thread_num, from p_local to p_reg; for even
// counts the elements are visited in a rotated order which depends on the thread id
#define UNINTERLEAVE_LOCAL_TO_REGISTER(p_reg,n_tid,n_value_per_thread_num,p_local) \
	do { \
		if((n_value_per_thread_num) & 1) { /* odd number of values per thread (this test selects odd counts, not even ones): plain in-order copy, presumably free of bank conflicts */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) \
				p_reg[i] = p_local[i + (n_tid) * (n_value_per_thread_num)]; \
			/* each thread gets n_value_per_thread_num consecutive elements */ \
		} else { /* even number of values per thread: avoids bank conflicts for powers of two or more items per thread than banks and reduces bank conflicts for other even numbers to a few of two-way conflicts */ \
			enum { n_bank_stride = ((n_value_per_thread_num) < LOCAL_MEMORY_BANK_NUM)? LOCAL_MEMORY_BANK_NUM / (n_value_per_thread_num) : 1 }; \
			const unsigned int n_bank = ((n_tid) / n_bank_stride); /* rotation of this thread; a division in case n_bank_stride is not a power of two, otherwise presumably a shift */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int j = (i + n_bank) % (n_value_per_thread_num); /* rotated element index; depends on n_tid, which is what makes the register index unpredictable */ \
				p_reg[j] = p_local[j + (n_tid) * (n_value_per_thread_num)]; \
				/* each thread gets n_value_per_thread_num consecutive elements; using j avoids bank conflicts */ \
			} \
		} \
	} while(0)

// disabled variant (sits in the #if 0 branch): copies n_value_per_thread_num elements
// from p_reg to consecutive positions of p_local, starting at n_tid * n_value_per_thread_num;
// for even counts the elements are visited in a rotated order which depends on the thread id
#define INTERLEAVE_REGISTER_TO_LOCAL(p_local,n_tid,n_value_per_thread_num,p_reg) \
	do { \
		if((n_value_per_thread_num) & 1) { /* odd number of values per thread (this test selects odd counts, not even ones): plain in-order copy, presumably free of bank conflicts */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) \
				p_local[i + (n_tid) * (n_value_per_thread_num)] = p_reg[i]; \
			/* each thread writes n_value_per_thread_num consecutive elements */ \
		} else { /* even number of values per thread: rotate the element order per thread */ \
			enum { n_bank_stride = ((n_value_per_thread_num) < LOCAL_MEMORY_BANK_NUM)? LOCAL_MEMORY_BANK_NUM / (n_value_per_thread_num) : 1 }; \
			const unsigned int n_bank = ((n_tid) / n_bank_stride); /* rotation of this thread; a division in case n_bank_stride is not a power of two, otherwise presumably a shift */ \
			_Pragma("unroll") \
			for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) { \
				unsigned int j = (i + n_bank) % (n_value_per_thread_num); /* rotated element index; depends on n_tid, which is what makes the register index unpredictable */ \
				p_local[j + (n_tid) * (n_value_per_thread_num)] = p_reg[j]; \
				/* each thread writes n_value_per_thread_num consecutive elements; using j avoids bank conflicts */ \
			} \
		} \
	} while(0)
#else // 0

/**
 *	@def UNINTERLEAVE_LOCAL_TO_REGISTER
 *
 *	@brief copies a block of local memory to registers, in contiguous manner
 *
 *	Copies local memory to registers, so that each thread gets several consecutive
 *	values: element <tt>i</tt> of <tt>p_reg</tt> receives
 *	<tt>p_local[n_tid * n_value_per_thread_num + i]</tt>. The disadvantage is the
 *	presence of bank conflicts if the number of values per thread is even. This assumes
 *	that the correct number of threads enter and that the size of the data in local
 *	memory is not smaller than the product of values per thread and the number of threads.
 *
 *	@param[out] p_reg is pointer to register array (may be any pointer expression)
 *	@param[in] n_tid is thread id
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_local is pointer to input values (may be any pointer expression)
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The name <tt>i</tt> is declared inside and must not appear
 *		in the arguments. No synchronization is performed here.
 */
#define UNINTERLEAVE_LOCAL_TO_REGISTER(p_reg,n_tid,n_value_per_thread_num,p_local) \
	do { \
		_Pragma("unroll") \
		for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) \
			(p_reg)[i] = (p_local)[i + (n_tid) * (n_value_per_thread_num)]; /* pointers parenthesized so that e.g. p_base + n_offset indexes correctly */ \
		/* each thread gets n_value_per_thread_num consecutive elements */ \
	} while(0)

/**
 *	@def UNINTERLEAVE_LOCAL_TO_REGISTER_OVERLAP
 *
 *	@brief copies a block of local memory to registers, in contiguous manner, with overlap
 *
 *	Copies local memory to registers, so that each thread gets several consecutive
 *	values: element <tt>i</tt> of <tt>p_reg</tt> receives
 *	<tt>p_local[n_tid * n_value_per_thread_num + i]</tt> for <tt>i</tt> in
 *	<tt>[0, n_value_per_thread_num + n_overlap_value_num)</tt>, so the last
 *	<tt>n_overlap_value_num</tt> values of a thread are the same elements as the first
 *	values of the thread(s) that follow. The disadvantage is the presence of bank
 *	conflicts if the number of values per thread is even. This assumes that the correct
 *	number of threads enter and that the local memory holds at least the product of
 *	values per thread and the number of threads plus <tt>n_overlap_value_num</tt>
 *	elements (the last thread reads that far).
 *
 *	@param[out] p_reg is pointer to register array (n_value_per_thread_num +
 *		n_overlap_value_num elements; may be any pointer expression)
 *	@param[in] n_tid is thread id
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_overlap_value_num is number of extra values per thread which overlap the first values of the next thread
 *	@param[in] p_local is pointer to input values (may be any pointer expression)
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The name <tt>i</tt> is declared inside and must not appear
 *		in the arguments. No synchronization is performed here.
 */
#define UNINTERLEAVE_LOCAL_TO_REGISTER_OVERLAP(p_reg,n_tid,n_value_per_thread_num,n_overlap_value_num,p_local) \
	do { \
		_Pragma("unroll") \
		for(unsigned int i = 0; i < (n_value_per_thread_num) + (n_overlap_value_num); ++ i) \
			(p_reg)[i] = (p_local)[i + (n_tid) * (n_value_per_thread_num)]; /* pointers parenthesized so that e.g. p_base + n_offset indexes correctly */ \
		/* each thread gets n_value_per_thread_num + n_overlap_value_num consecutive elements */ \
	} while(0)

/**
 *	@def INTERLEAVE_REGISTER_TO_LOCAL
 *
 *	@brief copies data from registers to a block of local memory, in contiguous manner
 *
 *	Copies registers to local memory, so that the elements of each thread end up in
 *	consecutive addresses: element <tt>i</tt> of <tt>p_reg</tt> is written to
 *	<tt>p_local[n_tid * n_value_per_thread_num + i]</tt>. The disadvantage is the
 *	presence of bank conflicts if the number of values per thread is even. This assumes
 *	that the correct number of threads enter and that the size of the data in local
 *	memory is not smaller than the product of values per thread and the number of threads.
 *
 *	@param[out] p_local is pointer to local memory (may be any pointer expression)
 *	@param[in] n_tid is thread id
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_reg is pointer to input values (may be any pointer expression)
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The name <tt>i</tt> is declared inside and must not appear
 *		in the arguments. No synchronization is performed here.
 */
#define INTERLEAVE_REGISTER_TO_LOCAL(p_local,n_tid,n_value_per_thread_num,p_reg) \
	do { \
		_Pragma("unroll") \
		for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) \
			(p_local)[i + (n_tid) * (n_value_per_thread_num)] = (p_reg)[i]; /* pointers parenthesized so that e.g. p_base + n_offset indexes correctly */ \
		/* each thread writes n_value_per_thread_num consecutive elements */ \
	} while(0)
#endif // 0

/**
 *	@def UNINTERLEAVE_LOCAL_CFI_TO_REGISTER
 *
 *	@brief copies a block of local memory to registers, in contiguous manner and using conflict-free indexing
 *
 *	Copies local memory to registers, so that each thread gets several consecutive
 *	values: element <tt>i</tt> of <tt>p_reg</tt> receives
 *	<tt>p_local[CONFLICT_FREE_INDEX(n_tid * n_value_per_thread_num + i)]</tt>. The
 *	disadvantage is the extra local memory needed for padding for conflict-free
 *	indexing. This assumes that the correct number of threads enter and that the size
 *	of the data in local memory is not smaller than the product of values per thread
 *	and the number of threads.
 *
 *	@param[out] p_reg is pointer to register array (may be any pointer expression)
 *	@param[in] n_tid is thread id
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_local is pointer to input values (may be any pointer expression)
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The name <tt>i</tt> is declared inside and must not appear
 *		in the arguments. No synchronization is performed here.
 */
#define UNINTERLEAVE_LOCAL_CFI_TO_REGISTER(p_reg,n_tid,n_value_per_thread_num,p_local) \
	do { \
		_Pragma("unroll") \
		for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) \
			(p_reg)[i] = (p_local)[CONFLICT_FREE_INDEX(i + (n_tid) * (n_value_per_thread_num))]; /* pointers parenthesized so that e.g. p_base + n_offset indexes correctly */ \
		/* each thread gets n_value_per_thread_num consecutive elements */ \
	} while(0)

/**
 *	@def UNINTERLEAVE_LOCAL_CFI_TO_REGISTER_OVERLAP
 *
 *	@brief copies a block of local memory to registers, in contiguous manner and using
 *		conflict-free indexing, with overlap
 *
 *	Copies local memory to registers, so that each thread gets several consecutive
 *	values: element <tt>i</tt> of <tt>p_reg</tt> receives
 *	<tt>p_local[CONFLICT_FREE_INDEX(n_tid * n_value_per_thread_num + i)]</tt> for
 *	<tt>i</tt> in <tt>[0, n_value_per_thread_num + n_overlap_value_num)</tt>, so the last
 *	<tt>n_overlap_value_num</tt> values of a thread are the same elements as the first
 *	values of the thread(s) that follow. The disadvantage is the extra local memory
 *	needed for padding for conflict-free indexing. This assumes that the correct number
 *	of threads enter and that the local memory holds the data up to logical index
 *	<tt>n_thread_num * n_value_per_thread_num + n_overlap_value_num - 1</tt> (the last
 *	thread reads that far).
 *
 *	@param[out] p_reg is pointer to register array (n_value_per_thread_num +
 *		n_overlap_value_num elements; may be any pointer expression)
 *	@param[in] n_tid is thread id
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_overlap_value_num is number of extra values per thread which overlap the first values of the next thread
 *	@param[in] p_local is pointer to input values (may be any pointer expression)
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The name <tt>i</tt> is declared inside and must not appear
 *		in the arguments. No synchronization is performed here.
 */
#define UNINTERLEAVE_LOCAL_CFI_TO_REGISTER_OVERLAP(p_reg,n_tid,n_value_per_thread_num,n_overlap_value_num,p_local) \
	do { \
		_Pragma("unroll") \
		for(unsigned int i = 0; i < (n_value_per_thread_num) + (n_overlap_value_num); ++ i) \
			(p_reg)[i] = (p_local)[CONFLICT_FREE_INDEX(i + (n_tid) * (n_value_per_thread_num))]; /* pointers parenthesized so that e.g. p_base + n_offset indexes correctly */ \
		/* each thread gets n_value_per_thread_num + n_overlap_value_num consecutive elements */ \
	} while(0)

/**
 *	@def INTERLEAVE_REGISTER_TO_LOCAL_CFI
 *
 *	@brief copies data from registers to a block of local memory, in contiguous manner and using conflict-free indexing
 *
 *	Copies registers to local memory, so that the elements of each thread end up in
 *	consecutive logical positions: element <tt>i</tt> of <tt>p_reg</tt> is written to
 *	<tt>p_local[CONFLICT_FREE_INDEX(n_tid * n_value_per_thread_num + i)]</tt>. The
 *	disadvantage is the extra local memory needed for padding for conflict-free
 *	indexing. This assumes that the correct number of threads enter and that the size
 *	of the data in local memory is not smaller than the product of values per thread
 *	and the number of threads.
 *
 *	@param[out] p_local is pointer to local memory (may be any pointer expression)
 *	@param[in] n_tid is thread id
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_reg is pointer to input values (may be any pointer expression)
 *
 *	@note This is a macro: the arguments are evaluated repeatedly and should be free
 *		of side effects. The name <tt>i</tt> is declared inside and must not appear
 *		in the arguments. No synchronization is performed here.
 */
#define INTERLEAVE_REGISTER_TO_LOCAL_CFI(p_local,n_tid,n_value_per_thread_num,p_reg) \
	do { \
		_Pragma("unroll") \
		for(unsigned int i = 0; i < (n_value_per_thread_num); ++ i) \
			(p_local)[CONFLICT_FREE_INDEX(i + (n_tid) * (n_value_per_thread_num))] = (p_reg)[i]; /* pointers parenthesized so that e.g. p_base + n_offset indexes correctly */ \
		/* each thread writes n_value_per_thread_num consecutive elements */ \
	} while(0)

/**
 *	@def GLOBAL_TO_LOCAL_CFI_COND
 *
 *	@brief copies a block of global memory to local memory, with conflict-free indexing being optional
 *
 *	Expands to \ref GLOBAL_TO_LOCAL_CFI if <tt>b_use_CFI</tt> is nonzero and to
 *	\ref GLOBAL_TO_LOCAL otherwise; the remaining arguments are forwarded unchanged.
 *	Either way the values keep their original order in local memory. With conflict-free
 *	indexing enabled, unused padding elements are inserted in the local array, which
 *	requires slightly more local memory.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[out] p_local is pointer to local memory (if CFI is used, it must be allocated to
 *		<tt>n_thread_num * n_value_per_thread_num + CONFLICT_FREE_OFFSET(n_thread_num * n_value_per_thread_num - 1)</tt>)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 */
#define GLOBAL_TO_LOCAL_CFI_COND(b_use_CFI,p_local,n_tid,n_thread_num,n_value_per_thread_num,p_data,n_data_num,t_padding_value) \
	do { \
		if(b_use_CFI) { \
			/* padded (conflict-free) layout */ \
			GLOBAL_TO_LOCAL_CFI((p_local), (n_tid), (n_thread_num), (n_value_per_thread_num), (p_data), (n_data_num), (t_padding_value)); \
		} else { \
			/* plain layout */ \
			GLOBAL_TO_LOCAL((p_local), (n_tid), (n_thread_num), (n_value_per_thread_num), (p_data), (n_data_num), (t_padding_value)); \
		} \
	} while(0)

/**
 *	@def GLOBAL_TO_LOCAL_N_MORE_CFI_COND
 *
 *	@brief copies a block of global memory and a few extra trailing values to local memory,
 *		with conflict-free indexing being optional
 *
 *	Expands to <tt>GLOBAL_TO_LOCAL_N_MORE_CFI</tt> if <tt>b_use_CFI</tt> is nonzero and to
 *	<tt>GLOBAL_TO_LOCAL_N_MORE</tt> otherwise; the remaining arguments are forwarded unchanged.
 *	With conflict-free indexing enabled, the values are stored using padded indices, which
 *	requires slightly more local memory.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[out] p_local is pointer to local memory; it needs to hold the block of
 *		<tt>n_thread_num * n_value_per_thread_num</tt> values and the extra values, including
 *		the conflict-free padding if CFI is used (NOTE(review): the exact size requirement is
 *		given by the underlying macros - confirm there)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_extra_value_num is number of extra values to load at the end
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (presumably only up to
 *		<tt>n_thread_num * n_value_per_thread_num + n_extra_value_num</tt> are actually
 *		copied - verify against the underlying macros)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 */
#define GLOBAL_TO_LOCAL_N_MORE_CFI_COND(b_use_CFI,p_local,n_tid,n_thread_num,n_value_per_thread_num,n_extra_value_num,p_data,n_data_num,t_padding_value) \
	do { \
		if(b_use_CFI) { \
			/* padded (conflict-free) layout */ \
			GLOBAL_TO_LOCAL_N_MORE_CFI((p_local), (n_tid), (n_thread_num), (n_value_per_thread_num), (n_extra_value_num), (p_data), (n_data_num), (t_padding_value)); \
		} else { \
			/* plain layout */ \
			GLOBAL_TO_LOCAL_N_MORE((p_local), (n_tid), (n_thread_num), (n_value_per_thread_num), (n_extra_value_num), (p_data), (n_data_num), (t_padding_value)); \
		} \
	} while(0)

/**
 *	@def LOCAL_CFI_COND_TO_GLOBAL
 *
 *	@brief copies a block of local memory to global memory, with conflict-free indexing being optional
 *
 *	Expands to \ref LOCAL_CFI_TO_GLOBAL if <tt>b_use_CFI</tt> is nonzero and to
 *	\ref LOCAL_TO_GLOBAL otherwise; the remaining arguments are forwarded unchanged.
 *	The flag needs to match the layout in which the local memory was filled.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[out] p_dest is pointer to output values
 *	@param[in] n_dest_len is number of output values to be written (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_local is pointer to local memory holding the values to be written (if CFI is used, it must be allocated to
 *		<tt>n_thread_num * n_value_per_thread_num + CONFLICT_FREE_OFFSET(n_thread_num * n_value_per_thread_num - 1)</tt>)
 */
#define LOCAL_CFI_COND_TO_GLOBAL(b_use_CFI,p_dest,n_dest_len,n_tid,n_thread_num,n_value_per_thread_num,p_local) \
	do { \
		if(b_use_CFI) { \
			/* local memory uses the padded (conflict-free) layout */ \
			LOCAL_CFI_TO_GLOBAL((p_dest), (n_dest_len), (n_tid), (n_thread_num), (n_value_per_thread_num), (p_local)); \
		} else { \
			/* local memory uses the plain layout */ \
			LOCAL_TO_GLOBAL((p_dest), (n_dest_len), (n_tid), (n_thread_num), (n_value_per_thread_num), (p_local)); \
		} \
	} while(0)

/**
 *	@def UNINTERLEAVE_LOCAL_CFI_COND_TO_REGISTER
 *
 *	@brief copies a block of local memory to registers, in contiguous manner,
 *		with conflict-free indexing being optional
 *
 *	Expands to <tt>UNINTERLEAVE_LOCAL_CFI_TO_REGISTER</tt> if <tt>b_use_CFI</tt> is nonzero
 *	and to <tt>UNINTERLEAVE_LOCAL_TO_REGISTER</tt> otherwise; the remaining arguments are
 *	forwarded unchanged. Each thread gets several consecutive values. The flag needs to match
 *	the layout in which the local memory was filled. This assumes that the correct number of
 *	threads enter and that the size of the data in local memory is not smaller than the
 *	product of values per thread and the number of threads.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[out] p_reg is pointer to register array
 *	@param[in] n_tid is thread id
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_local is pointer to input values
 */
#define UNINTERLEAVE_LOCAL_CFI_COND_TO_REGISTER(b_use_CFI,p_reg,n_tid,n_value_per_thread_num,p_local) \
	do { \
		if(b_use_CFI) { \
			/* local memory uses the padded (conflict-free) layout */ \
			UNINTERLEAVE_LOCAL_CFI_TO_REGISTER((p_reg), (n_tid), (n_value_per_thread_num), (p_local)); \
		} else { \
			/* local memory uses the plain layout */ \
			UNINTERLEAVE_LOCAL_TO_REGISTER((p_reg), (n_tid), (n_value_per_thread_num), (p_local)); \
		} \
	} while(0)

/**
 *	@def UNINTERLEAVE_LOCAL_CFI_COND_TO_REGISTER_OVERLAP
 *
 *	@brief copies a block of local memory to registers, in contiguous manner and with overlap
 *		between the neighboring threads, with conflict-free indexing being optional
 *
 *	Expands to \ref UNINTERLEAVE_LOCAL_CFI_TO_REGISTER_OVERLAP if <tt>b_use_CFI</tt> is nonzero
 *	and to <tt>UNINTERLEAVE_LOCAL_TO_REGISTER_OVERLAP</tt> otherwise; the remaining arguments
 *	are forwarded unchanged. Each thread gets several consecutive values, followed by
 *	<tt>n_overlap_value_num</tt> values which overlap the first values of the next thread.
 *	The flag needs to match the layout in which the local memory was filled. This assumes
 *	that the correct number of threads enter and that the local memory also holds the
 *	overlap values read by the last thread.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[out] p_reg is pointer to register array
 *	@param[in] n_tid is thread id
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_overlap_value_num is number of extra values per thread which overlap the first values of the next thread
 *	@param[in] p_local is pointer to input values
 */
#define UNINTERLEAVE_LOCAL_CFI_COND_TO_REGISTER_OVERLAP(b_use_CFI,p_reg,n_tid,n_value_per_thread_num,n_overlap_value_num,p_local) \
	do { \
		if(b_use_CFI) { \
			/* local memory uses the padded (conflict-free) layout */ \
			UNINTERLEAVE_LOCAL_CFI_TO_REGISTER_OVERLAP((p_reg), (n_tid), (n_value_per_thread_num), (n_overlap_value_num), (p_local)); \
		} else { \
			/* local memory uses the plain layout */ \
			UNINTERLEAVE_LOCAL_TO_REGISTER_OVERLAP((p_reg), (n_tid), (n_value_per_thread_num), (n_overlap_value_num), (p_local)); \
		} \
	} while(0)

/**
 *	@def INTERLEAVE_REGISTER_TO_LOCAL_CFI_COND
 *
 *	@brief copies data from registers to a block of local memory, in contiguous manner,
 *		with conflict-free indexing being optional
 *
 *	Expands to \ref INTERLEAVE_REGISTER_TO_LOCAL_CFI if <tt>b_use_CFI</tt> is nonzero and to
 *	<tt>INTERLEAVE_REGISTER_TO_LOCAL</tt> otherwise; the remaining arguments are forwarded
 *	unchanged. The elements of each thread end up in consecutive (logical) addresses. This
 *	assumes that the correct number of threads enter and that the size of the data in local
 *	memory is not smaller than the product of values per thread and the number of threads
 *	(plus the padding for conflict-free indexing, if it is used).
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[out] p_local is pointer to local memory
 *	@param[in] n_tid is thread id
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_reg is pointer to input values
 */
#define INTERLEAVE_REGISTER_TO_LOCAL_CFI_COND(b_use_CFI,p_local,n_tid,n_value_per_thread_num,p_reg) \
	do { \
		if(b_use_CFI) { \
			/* write using the padded (conflict-free) layout */ \
			INTERLEAVE_REGISTER_TO_LOCAL_CFI((p_local), (n_tid), (n_value_per_thread_num), (p_reg)); \
		} else { \
			/* write using the plain layout */ \
			INTERLEAVE_REGISTER_TO_LOCAL((p_local), (n_tid), (n_value_per_thread_num), (p_reg)); \
		} \
	} while(0)

/**
 *	@def ORDERED_LOAD_TEMP_SIZE
 *
 *	@brief calculates the number of elements of the shared temporary array for the ordered loads / stores
 *
 *	If there is only a single value per thread, this evaluates to 1 (in that case
 *	\ref GLOBAL_TO_REGISTER_ORDERED_CFI_COND and \ref REGISTER_TO_GLOBAL_ORDERED_CFI_COND
 *	do not use the temporary array; presumably 1 rather than 0 is used so that an array
 *	of this size can still be declared). Otherwise this is <tt>n_block_size</tt>, enlarged
 *	using <tt>CONFLICT_FREE_SIZE()</tt> if CFI is used.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_block_size is size of the data block (the number of threads
 *		times n_value_per_thread_num), possibly with extra space for additional data
 */
#define ORDERED_LOAD_TEMP_SIZE(b_use_CFI,n_value_per_thread_num,n_block_size) \
	(((n_value_per_thread_num) != 1)? \
		((b_use_CFI)? CONFLICT_FREE_SIZE(n_block_size) : (n_block_size)) : \
		1)

/**
 *	@def ORDERED_LOAD_OVERLAP_TEMP_SIZE
 *
 *	@brief calculates the number of elements of the shared temporary array for the ordered loads with overlap
 *
 *	If there is a single value per thread and a single overlap value, this evaluates
 *	to 1 (in that case \ref GLOBAL_TO_REGISTER_ORDERED_OVERLAP_CFI_COND does not use the
 *	temporary array). Otherwise this is <tt>n_block_size + n_overlap_value_num</tt>,
 *	enlarged using <tt>CONFLICT_FREE_SIZE()</tt> if CFI is used.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_overlap_value_num is number of extra values per thread which overlap the first values of the next thread
 *	@param[in] n_block_size is size of the data block (the number of threads
 *		times n_value_per_thread_num), possibly with extra space for additional data (not including n_overlap_value_num)
 */
#define ORDERED_LOAD_OVERLAP_TEMP_SIZE(b_use_CFI,n_value_per_thread_num,n_overlap_value_num,n_block_size) \
	(((n_value_per_thread_num) != 1 || (n_overlap_value_num) != 1)? \
		((b_use_CFI)? CONFLICT_FREE_SIZE((n_block_size) + (n_overlap_value_num)) : ((n_block_size) + (n_overlap_value_num))) : \
		1)

/**
 *	@def GLOBAL_TO_REGISTER_ORDERED_CFI_COND
 *
 *	@brief copies a block of global memory to registers, in ordered manner, using local memory for exchange
 *
 *	If there is more than a single value per thread, the block is first copied to
 *	<tt>p_shared_temp</tt> using \ref GLOBAL_TO_LOCAL_CFI_COND, the threads then synchronize
 *	using a local memory barrier and finally each thread picks up its values using
 *	\ref UNINTERLEAVE_LOCAL_CFI_COND_TO_REGISTER, so that each thread gets several
 *	consecutive values. The disadvantage is the use of local memory and of the barrier.
 *	If there is only a single value per thread, this is the same as \ref GLOBAL_TO_REGISTER
 *	and <tt>p_shared_temp</tt> is not accessed.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[out] p_reg is pointer to register array
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 *	@param[in,out] p_shared_temp is shared temporary array of size at least \ref ORDERED_LOAD_TEMP_SIZE <tt>(b_use_CFI,
 *		n_value_per_thread_num, n_value_per_thread_num * n_thread_num)</tt>
 *
 *	@note This contains a barrier (unless <tt>n_value_per_thread_num</tt> is 1), so it
 *		must be reached by all the threads of the workgroup.
 *	@note There is no barrier before <tt>p_shared_temp</tt> is written; if the array
 *		was in use before, the caller needs to synchronize first.
 */
#define GLOBAL_TO_REGISTER_ORDERED_CFI_COND(b_use_CFI,p_reg,n_tid,n_thread_num,n_value_per_thread_num,p_data,n_data_num,t_padding_value,p_shared_temp) \
	do { \
		if((n_value_per_thread_num) != 1) { \
			/* stage the block in local memory */ \
			GLOBAL_TO_LOCAL_CFI_COND((b_use_CFI), (p_shared_temp), (n_tid), (n_thread_num), (n_value_per_thread_num), (p_data), (n_data_num), (t_padding_value)); \
			/* the values read below were written by other threads */ \
			barrier(CLK_LOCAL_MEM_FENCE); \
			UNINTERLEAVE_LOCAL_CFI_COND_TO_REGISTER((b_use_CFI), (p_reg), (n_tid), (n_value_per_thread_num), (p_shared_temp)); \
		} else { \
			/* a single value per thread needs no exchange through local memory */ \
			GLOBAL_TO_REGISTER((p_reg), (n_tid), (n_thread_num), (n_value_per_thread_num), (p_data), (n_data_num), (t_padding_value)); \
		} \
	} while(0)

/**
 *	@def GLOBAL_TO_REGISTER_ORDERED_OVERLAP_CFI_COND
 *
 *	@brief copies a block of global memory to registers, in ordered manner and with overlap
 *		between the neighboring threads, using local memory for exchange
 *
 *	Unless both <tt>n_value_per_thread_num</tt> and <tt>n_overlap_value_num</tt> equal 1,
 *	the block and <tt>n_overlap_value_num</tt> extra values are first copied to
 *	<tt>p_shared_temp</tt> using \ref GLOBAL_TO_LOCAL_N_MORE_CFI_COND, the threads then
 *	synchronize using a local memory barrier and finally each thread picks up its values using
 *	\ref UNINTERLEAVE_LOCAL_CFI_COND_TO_REGISTER_OVERLAP, so that each thread gets several
 *	consecutive values, followed by the first values of the next thread. If both
 *	<tt>n_value_per_thread_num</tt> and <tt>n_overlap_value_num</tt> equal 1, this is the
 *	same as <tt>GLOBAL_TO_REGISTER_OVERLAP</tt> and <tt>p_shared_temp</tt> is not accessed.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[out] p_reg is pointer to register array (receives the values of the thread
 *		followed by the overlap values)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] n_overlap_value_num is number of extra values per thread which overlap the first values of the next thread
 *	@param[in] p_data is pointer to input values
 *	@param[in] n_data_num is number of input values in the buffer (presumably only up to
 *		<tt>n_thread_num * n_value_per_thread_num + n_overlap_value_num</tt> are actually
 *		copied - verify against the underlying macros)
 *	@param[in] t_padding_value is value to pad the array of the last block with
 *	@param[in,out] p_shared_temp is shared temporary array of size at least \ref ORDERED_LOAD_OVERLAP_TEMP_SIZE
 *		<tt>(b_use_CFI, n_value_per_thread_num, n_overlap_value_num, n_value_per_thread_num * n_thread_num)</tt>
 *		(note that \ref ORDERED_LOAD_TEMP_SIZE does not account for the overlap values)
 *
 *	@note This contains a barrier (unless both <tt>n_value_per_thread_num</tt> and
 *		<tt>n_overlap_value_num</tt> are 1), so it must be reached by all the threads
 *		of the workgroup.
 *	@note There is no barrier before <tt>p_shared_temp</tt> is written; if the array
 *		was in use before, the caller needs to synchronize first.
 */
#define GLOBAL_TO_REGISTER_ORDERED_OVERLAP_CFI_COND(b_use_CFI,p_reg,n_tid,n_thread_num,n_value_per_thread_num,n_overlap_value_num,p_data,n_data_num,t_padding_value,p_shared_temp) \
	do { \
		if((n_value_per_thread_num) != 1 || (n_overlap_value_num) != 1) { \
			/* stage the block and the trailing overlap values in local memory */ \
			GLOBAL_TO_LOCAL_N_MORE_CFI_COND((b_use_CFI), (p_shared_temp), (n_tid), (n_thread_num), (n_value_per_thread_num), (n_overlap_value_num), (p_data), (n_data_num), (t_padding_value)); \
			/* the values read below were written by other threads */ \
			barrier(CLK_LOCAL_MEM_FENCE); \
			UNINTERLEAVE_LOCAL_CFI_COND_TO_REGISTER_OVERLAP((b_use_CFI), (p_reg), (n_tid), (n_value_per_thread_num), (n_overlap_value_num), (p_shared_temp)); \
		} else { \
			/* one value and one overlap value per thread: load directly, without local memory */ \
			GLOBAL_TO_REGISTER_OVERLAP((p_reg), (n_tid), (n_thread_num), (n_value_per_thread_num), (n_overlap_value_num), (p_data), (n_data_num), (t_padding_value)); \
		} \
	} while(0)

/**
 *	@def REGISTER_TO_GLOBAL_ORDERED_CFI_COND
 *
 *	@brief copies a block from registers to global memory, in ordered manner,
 *		using local memory for exchange (reverse of \ref GLOBAL_TO_REGISTER_ORDERED_CFI_COND)
 *
 *	If there is more than a single value per thread, the threads synchronize, each thread
 *	stores its values to <tt>p_shared_temp</tt> using \ref INTERLEAVE_REGISTER_TO_LOCAL_CFI_COND,
 *	the threads synchronize again and the block is written out using
 *	\ref LOCAL_CFI_COND_TO_GLOBAL. If there is only a single value per thread, this is
 *	the same as \ref REGISTER_TO_GLOBAL and <tt>p_shared_temp</tt> is not accessed.
 *
 *	@param[in] b_use_CFI is CFI usage flag (should be a compile-time constant)
 *	@param[out] p_dest is pointer to output values
 *	@param[in] n_dest_len is number of output values to be written (but only up to n_thread_num * n_value_per_thread_num are actually copied)
 *	@param[in] n_tid is thread id
 *	@param[in] n_thread_num is number of threads
 *	@param[in] n_value_per_thread_num is number of values per thread
 *	@param[in] p_reg is pointer to register array
 *	@param[in,out] p_shared_temp is shared temporary array of size at least \ref ORDERED_LOAD_TEMP_SIZE <tt>(b_use_CFI,
 *		n_value_per_thread_num, n_value_per_thread_num * n_thread_num)</tt>
 *
 *	@note This contains barriers (unless <tt>n_value_per_thread_num</tt> is 1), so it
 *		must be reached by all the threads of the workgroup.
 */
#define REGISTER_TO_GLOBAL_ORDERED_CFI_COND(b_use_CFI,p_dest,n_dest_len,n_tid,n_thread_num,n_value_per_thread_num,p_reg,p_shared_temp) \
	do { \
		if((n_value_per_thread_num) != 1) { \
			/* the shared storage may still be read by the other threads, wait before overwriting it */ \
			barrier(CLK_LOCAL_MEM_FENCE); \
			INTERLEAVE_REGISTER_TO_LOCAL_CFI_COND((b_use_CFI), (p_shared_temp), (n_tid), (n_value_per_thread_num), (p_reg)); \
			/* the values read below were written by other threads */ \
			barrier(CLK_LOCAL_MEM_FENCE); \
			LOCAL_CFI_COND_TO_GLOBAL((b_use_CFI), (p_dest), (n_dest_len), (n_tid), (n_thread_num), (n_value_per_thread_num), (p_shared_temp)); \
		} else { \
			/* a single value per thread needs no exchange through local memory */ \
			REGISTER_TO_GLOBAL((p_dest), (n_dest_len), (n_tid), (n_thread_num), (n_value_per_thread_num), (p_reg)); \
		} \
	} while(0)

#endif // !__BLOCKED_LOAD_STORE_MACROS_INCLUDED
