#ifndef __CL_KERNEL_NVIDIA_CC30_SPECIFIC_INCLUDED
#define __CL_KERNEL_NVIDIA_CC30_SPECIFIC_INCLUDED

/**
 *	@file gpgpu/kernel_utils/NV30.h
 *	@brief functionality for devices with compute capability 3.0 and higher
 *
 *	To implement some sort of overloading, one can:
 *
 *	@code
 *	inline SCALAR_TYPE shfl_add(SCALAR_TYPE x, int offset, int width) // "overloaded" shfl_add function for the current type
 *	{
 *		return CONCAT(shfl_add, CONCAT(_, EXPAND(SCALAR_TYPE)))(x, offset, width);
 *	}
 *	@endcode
 *
 *	Where <tt>SCALAR_TYPE</tt> is one of the standard types.
 */

#include "Integer.h"

inline uint32_t shfl_add_uint32_t(uint32_t x, int offset, int width)
{
	uint32_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .u32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p add.u32 r0, r0, %4;"
		"mov.u32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

inline int32_t shfl_add_int32_t(int32_t x, int offset, int width)
{
	int32_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .s32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p add.s32 r0, r0, %4;"
		"mov.s32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

/*inline uint64_t shfl_add_uint64_t(uint32_t x, int offset, int width)
{
	uint64_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .u64 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p add.u64 r0, r0, %4;"
		"mov.u64 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

inline int64_t shfl_add_int64_t(int32_t x, int offset, int width)
{
	int64_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .s64 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p add.s64 r0, r0, %4;"
		"mov.s64 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}*/

inline float shfl_add_float(float x, int offset, int width)
{
	float result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .f32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p add.f32 r0, r0, %4;"
		"mov.f32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

/*inline double shfl_add_double(double x, int offset, int width)
{
	double result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .f64 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p add.f64 r0, r0, %4;"
		"mov.f64 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}*/


inline uint32_t shfl_sub_uint32_t(uint32_t x, int offset, int width)
{
	uint32_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .u32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p sub.u32 r0, r0, %4;"
		"mov.u32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

inline int32_t shfl_sub_int32_t(int32_t x, int offset, int width)
{
	int32_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .s32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p sub.s32 r0, r0, %4;"
		"mov.s32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

/*inline uint64_t shfl_sub_uint64_t(uint32_t x, int offset, int width)
{
	uint64_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .u64 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p sub.u64 r0, r0, %4;"
		"mov.u64 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

inline int64_t shfl_sub_int64_t(int32_t x, int offset, int width)
{
	int64_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .s64 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p sub.s64 r0, r0, %4;"
		"mov.s64 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}*/

inline float shfl_sub_float(float x, int offset, int width)
{
	float result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .f32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p sub.f32 r0, r0, %4;"
		"mov.f32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

/*inline double shfl_sub_double(double x, int offset, int width)
{
	double result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .f64 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p sub.f64 r0, r0, %4;"
		"mov.f64 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}*/

inline uint32_t shfl_mul_uint32_t(uint32_t x, int offset, int width)
{
	uint32_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .u32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p mul.u32 r0, r0, %4;"
		"mov.u32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

inline int32_t shfl_mul_int32_t(int32_t x, int offset, int width)
{
	int32_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .s32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p mul.s32 r0, r0, %4;"
		"mov.s32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

/*inline uint64_t shfl_mul_uint64_t(uint32_t x, int offset, int width)
{
	uint64_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .u64 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p mul.u64 r0, r0, %4;"
		"mov.u64 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

inline int64_t shfl_mul_int64_t(int32_t x, int offset, int width)
{
	int64_t result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .s64 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p mul.s64 r0, r0, %4;"
		"mov.s64 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}*/

inline float shfl_mul_float(float x, int offset, int width)
{
	float result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .f32 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p mul.f32 r0, r0, %4;"
		"mov.f32 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}

/*inline double shfl_mul_double(double x, int offset, int width)
{
	double result = 0;
	int mask = (WARP_SIZE - width) << 8;
	asm("{.reg .f64 r0;"
		".reg .pred p;"
		"shfl.up.b32 r0|p, %1, %2, %3;"
		"@p mul.f64 r0, r0, %4;"
		"mov.f64 %0, r0; }"
		: "=r"(result) : "r"(x), "r"(offset), "r"(mask), "r"(x));
	return result;
}*/

#endif // !__CL_KERNEL_NVIDIA_CC30_SPECIFIC_INCLUDED
