#ifndef __CL_KERNEL_NVIDIA_CC20_SPECIFIC_EMULATION_INCLUDED
#define __CL_KERNEL_NVIDIA_CC20_SPECIFIC_EMULATION_INCLUDED

/**
 *	@file gpgpu/kernel_utils/NV20Emu.h
 *	@brief emulation of voting functionality of NVIDIA devices
 *		(implemented since compute capability 1.2, ballot since 2.0)
 */

#include "Integer.h"
#include "NVIDIA.h" // intwarp_t, WARP_SIZE

#if WARP_SIZE <= 32
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable
// the extended atomics are required for atom_or() on a 32-bit __local variable

typedef uint32_t _TyVoteInt; // one bit per lane, 32 bits are enough for warps of up to 32 threads
#elif WARP_SIZE <= 64
#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
// the extended atomics are required for atom_or() on a 64-bit variable

typedef uint64_t _TyVoteInt; // one bit per lane, 64 bits are needed for warps of 33 to 64 threads
#else // WARP_SIZE > 64
#error "WARP_SIZE must not be more than 64"
#endif // WARP_SIZE <= 32, WARP_SIZE <= 64
// the votes of a whole warp need to fit in a single integer

/**
 *	@brief capacity limits of the ballot emulation
 */
enum {
	n_max_warps_at_a_time = 32, /**< @brief number of warps that can vote at the same time (size of the voting array in n_warp_ballot()) */
	n_max_threads_at_a_time = n_max_warps_at_a_time * WARP_SIZE /**< @brief the same limit, expressed in threads */
};

// ballot is available since CC 2.0

inline uintwarp_t n_warp_ballot(bool b_flag)
{
	const unsigned int ls = get_local_size(0);
	const unsigned int li = get_local_id(0);

	STATIC_ASSERT(sizeof(uintwarp_t) >= sizeof(_TyVoteInt), WAPR_BALLOT_RESULTS_TRUNCATED); // shouldn't be smaller

	__local _TyVoteInt p_vote[n_max_warps_at_a_time]; // adds 128B to shared memory usage
	// this is only good up to 32 * WARP_SIZE threads which is enough on NVIDIA but may not be enough elsewhere

	if(ls <= n_max_warps_at_a_time) { // most of the time
		const unsigned int n_warp = li >> LOG_WARP_SIZE, n_lane = li & (WARP_SIZE - 1); // 0 .. n_max_warps_at_a_time-1

		if(!n_lane) // each first thread zeroes its voting variable
			p_vote[n_warp] = 0;

		mem_fence(CLK_LOCAL_MEM_FENCE); // warp synchronous

		atom_or(&p_vote[n_warp], ((_TyVoteInt)b_flag) << n_lane);

		mem_fence(CLK_LOCAL_MEM_FENCE); // warp synchronous

		return p_vote[n_warp];
	} else {
		const unsigned int n_warp = li >> LOG_WARP_SIZE, n_lane = li & (WARP_SIZE - 1);
		// make the branch in the loop shorter

		_TyVoteInt n_my_result;
		for(unsigned int n_first = 0; n_first < ls; n_first += n_max_threads_at_a_time) {
			barrier(CLK_LOCAL_MEM_FENCE);
			// otherwise threads from other warps skip ahead and enter the condition

			if((li >= n_first) & (li < n_first + n_max_threads_at_a_time)) {
				const unsigned int n_tid = li - n_first;
				const unsigned int n_warp = n_tid >> LOG_WARP_SIZE, n_lane = li & (WARP_SIZE - 1); // 0 .. n_max_warps_at_a_time-1

				if(!n_lane) // each first thread zeroes its voting variable
					p_vote[n_warp] = 0;

				mem_fence(CLK_LOCAL_MEM_FENCE); // warp synchronous

				atom_or(&p_vote[n_warp], ((_TyVoteInt)b_flag) << n_lane);

				mem_fence(CLK_LOCAL_MEM_FENCE); // warp synchronous

				n_my_result = p_vote[n_warp];
				// this could be return to make an early exit, then the condition of this branch could be only < because >= would be taken care of, there would be no such threads anymore
			}
		}
		// if there are more threads than n_max_threads_at_a_time then they need to take turns

		return n_my_result;
	}
}

// voting is actually available since CC 1.2, but those GPUs are mostly historical today

inline bool b_warp_vote_all(bool b_flag)
{
	return n_warp_ballot(b_flag) == (uintwarp_t)-1; // all ones?
}

inline bool b_warp_vote_none(bool b_flag)
{
	return n_warp_ballot(b_flag) == 0; // all zeros?
}

inline bool b_warp_vote_any(bool b_flag)
{
	return n_warp_ballot(b_flag) != 0; // at least some ones?
}

inline bool b_warp_vote_not_all(bool b_flag)
{
	return n_warp_ballot(b_flag) != (uintwarp_t)-1; // at least some zeros?
}

/**
 *	@brief tests whether the ballot of the caller's warp is uniform
 *
 *	@param[in] b_flag is the predicate of the calling thread
 *
 *	@return Returns true if n_warp_ballot(b_flag) is either zero or (uintwarp_t)-1
 *		(all zeros or all ones), otherwise returns false.
 */
inline bool b_warp_vote_uni(bool b_flag)
{
	const uintwarp_t n_ballot = n_warp_ballot(b_flag);
	if(n_ballot == 0)
		return true; // all zeros
	if(n_ballot == (uintwarp_t)-1)
		return true; // all ones
	return false; // a mix of zeros and ones
}

// bit counting, since CC 2.0

/**
 *	@brief 32-bit population count; forwards to n_SetBit_Num()
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the result of n_SetBit_Num(n_x) (presumably the number of bits
 *		set in n_x; the helper is defined elsewhere).
 */
inline uint32_t n_popc32(uint32_t n_x)
{
	const uint32_t n_set_bit_num = n_SetBit_Num(n_x);
	return n_set_bit_num;
}

/**
 *	@brief 64-bit population count; forwards to n_SetBit_Num_64()
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the result of n_SetBit_Num_64(n_x) (presumably the number of bits
 *		set in n_x; the helper is defined elsewhere).
 */
inline uint32_t n_popc64(uint64_t n_x)
{
	const uint32_t n_set_bit_num = n_SetBit_Num_64(n_x);
	return n_set_bit_num;
}

/**
 *	@brief 32-bit leading zero count; forwards to n_LeadingZero_Num()
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the result of n_LeadingZero_Num(n_x) (presumably the number of
 *		leading zero bits in n_x; the helper is defined elsewhere).
 */
inline uint32_t n_lzcnt32(uint32_t n_x)
{
	const uint32_t n_leading_zero_num = n_LeadingZero_Num(n_x);
	return n_leading_zero_num;
}

/**
 *	@brief 64-bit leading zero count; forwards to n_LeadingZero_Num_64()
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the result of n_LeadingZero_Num_64(n_x) (presumably the number of
 *		leading zero bits in n_x; the helper is defined elsewhere).
 */
inline uint32_t n_lzcnt64(uint64_t n_x)
{
	const uint32_t n_leading_zero_num = n_LeadingZero_Num_64(n_x);
	return n_leading_zero_num;
}

/**
 *	@brief emulates bfind for 32-bit unsigned integers
 *
 *	The input is passed through n_RightFill_32() (presumably sets all the bits below
 *	the most significant set bit), the bits of the result are counted using
 *	n_SetBit_Num() and one is subtracted to get a zero-based position.
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns zero-based position of the most significant set bit of n_x.
 *		If the bit count is zero, the subtraction wraps around to 0xffffffff,
 *		which is what bfind returns if no non-sign bit is found.
 */
inline uint32_t n_bfind_uint32_t(uint32_t n_x)
{
	const uint32_t n_filled = n_RightFill_32(n_x);
	const uint32_t n_bit_num = n_SetBit_Num(n_filled);
	return n_bit_num - 1; // zero-based position
}

/**
 *	@brief emulates bfind for 64-bit unsigned integers
 *
 *	The input is passed through n_RightFill_64() (presumably sets all the bits below
 *	the most significant set bit), the bits of the result are counted using
 *	n_SetBit_Num_64() and one is subtracted to get a zero-based position.
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns zero-based position of the most significant set bit of n_x.
 *		If the bit count is zero, the subtraction wraps around to 0xffffffff
 *		(the result is 32-bit even for 64-bit inputs).
 */
inline uint32_t n_bfind_uint64_t(uint64_t n_x)
{
	const uint64_t n_filled = n_RightFill_64(n_x);
	const uint32_t n_bit_num = n_SetBit_Num_64(n_filled);
	return n_bit_num - 1; // zero-based position
}

/**
 *	@brief emulates bfind for 32-bit signed integers
 *
 *	Negative inputs are complemented first, so that the search finds the most
 *	significant bit that differs from the sign bit.
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the result of n_bfind_uint32_t() on the (possibly complemented)
 *		input; that is 0xffffffff for inputs 0 and -1, where no non-sign bit exists.
 */
inline uint32_t n_bfind_int32_t(int32_t n_x)
{
	if(n_x < 0)
		n_x = ~n_x; // flip the sign bits
	return n_bfind_uint32_t((uint32_t)n_x); // unsigned version of the same width (the 64-bit one gives the same result but needs 64-bit arithmetic)
}

/**
 *	@brief emulates bfind for 64-bit signed integers
 *
 *	Negative inputs are complemented first, so that the search finds the most
 *	significant bit that differs from the sign bit.
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the result of n_bfind_uint64_t() on the (possibly complemented) input.
 */
inline uint32_t n_bfind_int64_t(int64_t n_x)
{
	const int64_t n_no_sign = (n_x < 0)? ~n_x : n_x; // flip the sign bits of negative inputs
	return n_bfind_uint64_t((uint64_t)n_no_sign); // unsigned version
}

/**
 *	@brief emulates bfind.shiftamt for 32-bit unsigned integers
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the amount by which n_x needs to be shifted left in order to move
 *		its most significant set bit to bit 31, or 0xffffffff if n_bfind_uint32_t()
 *		reports that no bit was found (PTX leaves the "not found" value unchanged
 *		in the shiftamt mode).
 */
inline uint32_t n_bfind_shiftamt_uint32_t(uint32_t n_x)
{
	const uint32_t n_position = n_bfind_uint32_t(n_x); // not n_bfind_shiftamt_uint32_t(), that would recurse forever
	if(n_position == (uint32_t)-1)
		return n_position; // nothing found, keep the sentinel
	return 31 - n_position;
}

/**
 *	@brief emulates bfind.shiftamt for 64-bit unsigned integers
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the amount by which n_x needs to be shifted left in order to move
 *		its most significant set bit to bit 63, or 0xffffffff if n_bfind_uint64_t()
 *		reports that no bit was found (PTX leaves the "not found" value unchanged
 *		in the shiftamt mode).
 */
inline uint32_t n_bfind_shiftamt_uint64_t(uint64_t n_x)
{
	const uint32_t n_position = n_bfind_uint64_t(n_x); // not n_bfind_shiftamt_uint64_t(), that would recurse forever
	if(n_position == (uint32_t)-1)
		return n_position; // nothing found, keep the sentinel
	return 63 - n_position;
}

/**
 *	@brief emulates bfind.shiftamt for 32-bit signed integers
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the amount by which n_x needs to be shifted left in order to move
 *		the bit found by n_bfind_int32_t() to bit 31, or 0xffffffff if
 *		n_bfind_int32_t() reports that no bit was found (PTX leaves the
 *		"not found" value unchanged in the shiftamt mode).
 */
inline uint32_t n_bfind_shiftamt_int32_t(int32_t n_x)
{
	const uint32_t n_position = n_bfind_int32_t(n_x); // not n_bfind_shiftamt_int32_t(), that would recurse forever
	if(n_position == (uint32_t)-1)
		return n_position; // nothing found, keep the sentinel
	return 31 - n_position;
}

/**
 *	@brief emulates bfind.shiftamt for 64-bit signed integers
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns the amount by which n_x needs to be shifted left in order to move
 *		the bit found by n_bfind_int64_t() to bit 63, or 0xffffffff if
 *		n_bfind_int64_t() reports that no bit was found (PTX leaves the
 *		"not found" value unchanged in the shiftamt mode).
 */
inline uint32_t n_bfind_shiftamt_int64_t(int64_t n_x)
{
	const uint32_t n_position = n_bfind_int64_t(n_x); // not n_bfind_shiftamt_int64_t(), that would recurse forever
	if(n_position == (uint32_t)-1)
		return n_position; // nothing found, keep the sentinel
	return 63 - n_position;
}

/**
 *	@brief reverses the order of bits in a 32-bit integer
 *
 *	The reversal is done by swapping neighboring groups of bits; the groups
 *	double in size in every step (1, 2, 4, 8 and finally 16 bits).
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns n_x with bit 0 exchanged with bit 31, bit 1 with bit 30 and so on.
 */
inline uint32_t n_brev32(uint32_t n_x)
{
	const uint32_t n_bits = ((n_x & 0xaaaaaaaaU) >> 1) | ((n_x & 0x55555555U) << 1); // odd bits go down, even bits go up
	const uint32_t n_pairs = ((n_bits & 0xccccccccU) >> 2) | ((n_bits & 0x33333333U) << 2); // exchange neighboring bit pairs
	const uint32_t n_nibbles = ((n_pairs & 0xf0f0f0f0U) >> 4) | ((n_pairs & 0x0f0f0f0fU) << 4); // exchange nibbles in each byte
	const uint32_t n_bytes = ((n_nibbles & 0xff00ff00U) >> 8) | ((n_nibbles & 0x00ff00ffU) << 8); // exchange bytes in each half
	return (n_bytes << 16) | (n_bytes >> 16); // exchange the two halves
}

/**
 *	@brief reverses the order of bits in a 64-bit integer
 *
 *	The reversal is done by swapping neighboring groups of bits; the groups
 *	double in size in every step (1, 2, 4, 8, 16 and finally 32 bits).
 *
 *	@param[in] n_x is the input value
 *
 *	@return Returns n_x with bit 0 exchanged with bit 63, bit 1 with bit 62 and so on.
 */
inline uint64_t n_brev64(uint64_t n_x)
{
	n_x = ((n_x >> 1) & 0x5555555555555555UL) | ((n_x & 0x5555555555555555UL) << 1); // swap odd and even bits
	n_x = ((n_x >> 2) & 0x3333333333333333UL) | ((n_x & 0x3333333333333333UL) << 2); // swap consecutive pairs
	n_x = ((n_x >> 4) & 0x0f0f0f0f0f0f0f0fUL) | ((n_x & 0x0f0f0f0f0f0f0f0fUL) << 4); // swap nibbles ...
	n_x = ((n_x >> 8) & 0x00ff00ff00ff00ffUL) | ((n_x & 0x00ff00ff00ff00ffUL) << 8); // swap bytes
	n_x = ((n_x >> 16) & 0x0000ffff0000ffffUL) | ((n_x & 0x0000ffff0000ffffUL) << 16); // swap 2-byte long pairs
	n_x = (n_x >> 32) | (n_x << 32); // swap 4-byte long pairs (both shifts need to be by 32, half of the width)
	return n_x;
}

// todo - unit-test those on an nvidia platform and see if they give the results they are supposed to

#endif // !__CL_KERNEL_NVIDIA_CC20_SPECIFIC_EMULATION_INCLUDED
