/**
 *	@file CLKernel.c
 *	@author -tHE SWINe-
 *	@brief OpenCL kernels for random indexing
 *	@date 2010-08-13
 *
 *	Functions marked with _JIT suffix rely on the following values being defined when compiling this file:
 *
 *	- JIT_VECTOR_LENGTH is term vector length
 *	- JIT_WINDOW_SIZE is half-window size
 *	- JIT_SEED_LENGTH is seed vector length
 *	- JIT_MAX_SLICE_LENGTH is maximal slice length
 *	- JIT_OCCURENCE_SLICE is length of occurence slice
 *
 *	There are also some optimization-specific defines:
 *
 *	- __FORCE_WORK_BOUNDS_CHECK__ forces each thread to check its global id against the task size (threads with id beyond the task size exit immediately)
 *	- __SEPARATE_LOOPS__ separates term window into two loops, for the left and for the right part of the window. it tends to be slower that way.
 *	- __ENABLE_DUFFS_DEVICE__ enables Duff's device in build term vectors step (otherwise there's a simple loop)
 */

/**
 *	@brief seed vector elements type (scalar)
 *
 *	Seed vector is a sparse vector, containing +1's, -1's and 0's (dummy vector).
 *	TSeed contains pair of values: value and index. Value is one of -1, 0 or +1,
 *	index is (zero-based) offset of this value in the vector.
 *
 *	Both values are packed in a single 16-bit word: bit 15 is the sign bit (seed_SignBit),
 *	bit 14 is the value bit (seed_ValueBit) and the low 14 bits hold the index (seed_IndexMask).
 *	Use n_Seed_Value() and n_Seed_Index() to unpack the element.
 */
typedef unsigned short TSeed;

/**
 *	@brief bit layout of a packed seed element (see TSeed)
 */
enum {
	seed_ValueShift = 14, /**< @brief position of the value bit */
	seed_ValueBit = 1 << seed_ValueShift, /**< @brief the value bit itself */
	seed_SignBit = seed_ValueBit << 1, /**< @brief the sign bit (right above the value bit) */
	seed_IndexMask = seed_ValueBit - 1 /**< @brief mask of the index bits (all the bits below the value bit) */
};

/**
 *	@brief extracts the value part of a seed element
 *
 *	The element is reinterpreted as a signed 16-bit number and shifted right so that
 *	only the sign bit and the value bit remain (the shift replicates the sign bit).
 *
 *	@param[in] s is seed element (generated by n_RandomSeed() or n_DummySeed())
 *
 *	@return Returns seed value (-1, 0 or +1).
 */
#define n_Seed_Value(s) ((int)((short)(s) >> seed_ValueShift))

/**
 *	@brief extracts the index part of a seed element
 *
 *	The sign bit and the value bit are masked away, leaving only the index bits.
 *
 *	@param[in] s is seed element (generated by n_RandomSeed() or n_DummySeed())
 *
 *	@return Returns (zero-based) index to term vector.
 */
#define n_Seed_Index(s) ((int)((s) & seed_IndexMask))

/**
 *	@brief equivalent of "C" stdlib memset() with zero fill value
 *
 *	@param[out] p_dest is pointer to the destination memory
 *	@param[in] n_size is size of erased array, in 32-bit integers (it is only read if
 *		__FORCE_WORK_BOUNDS_CHECK__ is defined; otherwise the global work size must not exceed the array size)
 *
 *	@remarks This kernel is launched with each thread writing to a single memory location (todo - for longer buffers, it would be probably better to have less threads, each writing small block of memory).
 */
__kernel void ZeroMemory32(__global unsigned int *p_dest, const unsigned int n_size)
{
	const size_t i = get_global_id(0);
	// keep the id in size_t (the type returned by get_global_id()); storing it in a signed
	// int would turn ids above 2^31 - 1 into negative array indices

#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(i >= n_size)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// threads beyond the end of the array have nothing to do

	p_dest[i] = 0;
	// each thread clears a single element
}

/**
 *	@brief loop unrolling timing stats and code
 *
 *	This function does nothing, its whole body is disabled using "#if 0". It only serves as
 *	a record of the variants of the "add seed to the vector" loop that were benchmarked,
 *	each variant is followed by a comment with the measured time.
 *
 *	@note We're able to save over one second (5% of computation time) with Duff's device on the "King James' Bible" corpus.
 *	@note The disabled code refers to variables that are not declared here (p_seed, p_vector,
 *		n_seed_length, n_rotation, n_vector_length_mask) and the variants redeclare n_repeat_num
 *		and s, it is not meant to be compiled as a whole.
 */
void LoopUnrollingSpeedComparison()
{
#if 0
	unsigned int n_repeat_num = (n_seed_length + 15) >> 4;
	TSeed s;
	switch(n_seed_length & 15) {
		do {
	case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 15: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 14: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 13: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 12: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 11: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 10: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 9: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 8: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		} while(-- n_repeat_num);
	}
	// large Duff's device, unrolled 16 times (the fastest: 27.57 sec ~ 0.1% speedup over smaller Duff's device below)

	unsigned int n_repeat_num = (n_seed_length + 7) >> 3;
	TSeed s;
	switch(n_seed_length & 7) {
		do {
	case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		} while(-- n_repeat_num);
	}
	// Duff's device, unrolled 8 times (almost equally fast: 27.59 sec; this is preferred)

	unsigned int n_repeat_num = n_seed_length >> 3;
	TSeed s;
	for(; n_repeat_num; -- n_repeat_num) {
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	}
	switch(n_seed_length & 7) {
	case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	}
	// decoupled Duff's device, the unrolled loop is followed by a fall-through switch for the remainder (slower: 27.77 sec)

	unsigned int n_repeat_num = n_seed_length >> 3;
	TSeed s;
	for(; n_repeat_num; -- n_repeat_num) {
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	}
	unsigned int n_remainder = n_seed_length & 7;
	if(n_remainder == 7) {
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	} else if(n_remainder == 6) {
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	} else if(n_remainder == 5) {
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	} else if(n_remainder == 4) {
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	} else if(n_remainder == 3) {
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	} else if(n_remainder == 2) {
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	} else if(n_remainder == 1) {
		s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
		// note curly braces are required here (although it is a single line, it contains two statements!)
	}
	// classic nvidia example of loop unrolling, the remainder is handled by an if-else chain (equally slow: 27.77 sec)

	for(unsigned int l = 0; l < n_seed_length; ++ l) {
		TSeed s = p_seed[l];
		p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
	}
	// no unrolling (the slowest: 28.60 sec)
#endif //0
}

/**
 *	@brief adds a single rotated seed vector to a term vector
 *
 *	This is a helper for BuildTermVectors_Seed_NPOT_TrueRot(). For each seed element,
 *	its value (-1, 0 or +1) is added to the term vector element at position
 *	(seed index + rotation) modulo vector length.
 *
 *	@param[in,out] p_vector is pointer to the term vector
 *	@param[in] p_seed is pointer to the first element of the seed vector
 *	@param[in] n_seed_length is seed vector length (zero is allowed, nothing is added then)
 *	@param[in] n_rotation is rotation of the seed vector
 *	@param[in] n_vector_length is term vector length (does not need to be power of two)
 *
 *	@note If __ENABLE_DUFFS_DEVICE__ is defined, the loop is unrolled eight times
 *		using Duff's device, otherwise a simple loop is used.
 */
void AddRotatedSeed_NPOT(__global int *p_vector, __global const TSeed *p_seed,
	const unsigned int n_seed_length, const unsigned int n_rotation, const unsigned int n_vector_length)
{
#ifndef __ENABLE_DUFFS_DEVICE__
	for(unsigned int l = 0; l < n_seed_length; ++ l) {
		TSeed s = p_seed[l];
		p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
	}
#else //__ENABLE_DUFFS_DEVICE__
	if(!n_seed_length)
		return;
	// Duff's device always executes at least one element; with zero length it would enter
	// at case 0, read eight seeds past the data and then loop with wrapped-around counter

	unsigned int n_repeat_num = (n_seed_length + 7) >> 3;
	TSeed s;
	switch(n_seed_length & 7) {
		do {
	case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
	case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
	case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
	case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
	case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
	case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
	case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
	case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
		} while(-- n_repeat_num);
	}
	// the switch jumps into the middle of the unrolled loop to handle the remainder first
#endif //__ENABLE_DUFFS_DEVICE__
}

/**
 *	@brief build term vectors kernel
 *
 *	Kernel is working with seed vectors as source, vector length does not need to be power of two,
 *	co-occuring terms are rotated based on their relative position (not just shift by 1)
 *
 *	The record in the occurence list, selected by the offset, begins with term id, followed
 *	by number of occurences and by the positions of the occurences in the chunk. For each
 *	occurence, seed vectors of the terms at positions from (position - n_window_size)
 *	to (position + n_window_size), except for the position itself, are added to the term vector.
 *
 *	@param[in] n_vector_length is term vector length
 *	@param[in,out] p_vectors_gpu is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk (its elements select the seed vectors)
 *	@param[in] p_offset is pointer to offset list
 *	@param[in] n_offset_num is number of offsets to be processed by all the threads
 *		(it is only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence_list is pointer to term occurence list
 *	@param[in] n_seed_length is seed vector length
 *	@param[in] n_window_size is half-window size
 *	@param[in] p_seeds is pointer to seed vectors
 *
 *	@remarks This kernel is launched with each thread working on a single offset.
 *
 *	@note The window is not clipped to the chunk in this kernel.
 *		NOTE(review): presumably the caller guarantees that the whole window lies inside of the chunk - confirm.
 */
__kernel void BuildTermVectors_Seed_NPOT_TrueRot(const unsigned int n_vector_length, __global int *p_vectors_gpu,
	__global const unsigned int *p_chunk, __global const unsigned int *p_offset, const unsigned int n_offset_num,
	__global const unsigned int *p_occurence_list, const unsigned int n_seed_length,
	const unsigned int n_window_size, __global const TSeed *p_seeds)
{
	unsigned int i = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(i >= n_offset_num)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term in the list

	__global const unsigned int *p_occurence = p_occurence_list + p_offset[i];
	// get head of occurence list

	unsigned int n_term = *p_occurence ++;
	unsigned int n_occurence_num = *p_occurence ++;
	// get term id and number of occurences

	__global int *p_vector = p_vectors_gpu + n_vector_length * n_term;
	// get term vector (output)

	const unsigned int n_start_rotation = (n_window_size + n_vector_length - 1) -
		(n_window_size + n_vector_length - 1) % n_vector_length - n_window_size;
	// rotation of left-most term under the window: the smallest multiple of n_vector_length
	// greater or equal to n_window_size, minus n_window_size (it's -n_window_size as positive modulo)

	for(unsigned int j = 0; j < n_occurence_num; ++ j) {
		unsigned int n_position = p_occurence[j];

#ifndef __SEPARATE_LOOPS__
		{
			unsigned int n_rotation = n_start_rotation;
			const unsigned int n_max = n_position + n_window_size + 1;
			for(unsigned int k = n_position - n_window_size; k < n_max; ++ k, ++ n_rotation) {
				if(k == n_position)
					continue;
				// skip focused term (note the rotation is still incremented)

				AddRotatedSeed_NPOT(p_vector, p_seeds + n_seed_length * p_chunk[k],
					n_seed_length, n_rotation, n_vector_length);
				// add seed to the vector
			}
			// terms on both sides of focused term
		}
#else //__SEPARATE_LOOPS__
		{
			unsigned int n_rotation = n_start_rotation;
			for(unsigned int k = n_position - n_window_size; k < n_position; ++ k, ++ n_rotation) {
				AddRotatedSeed_NPOT(p_vector, p_seeds + n_seed_length * p_chunk[k],
					n_seed_length, n_rotation, n_vector_length);
				// add seed to the vector
			}
			// terms left of focused term
		}
		{
			unsigned int n_rotation = 1;
			const unsigned int n_max = n_position + n_window_size + 1;
			for(unsigned int k = n_position + 1; k < n_max; ++ k, ++ n_rotation) {
				AddRotatedSeed_NPOT(p_vector, p_seeds + n_seed_length * p_chunk[k],
					n_seed_length, n_rotation, n_vector_length);
				// add seed to the vector
			}
			// terms right of focused term
		}
#endif //__SEPARATE_LOOPS__
		// loop over window
	}
	// loop over term occurences
}

/**
 *	@brief build term vectors kernel, processing just a slice of term occurences
 *
 *	Kernel is working with seed vectors as source, vector length must be power of two,
 *	co-occuring terms are rotated based on their relative position (not just shift by 1)
 *
 *	The record in the occurence list, selected by the offset, begins with term id, followed
 *	by number of occurences and by the positions of the occurences in the chunk. Only the
 *	occurences starting at n_job_offset are processed, and at most JIT_OCCURENCE_SLICE of them.
 *	If the term has n_job_offset or less occurences, the thread does nothing.
 *
 *	JIT_VECTOR_LENGTH, JIT_WINDOW_SIZE, JIT_SEED_LENGTH and JIT_OCCURENCE_SLICE
 *	need to be defined when compiling this kernel.
 *
 *	@param[in,out] p_vector is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk (its elements select the seed vectors)
 *	@param[in] p_offset is pointer to offset list
 *	@param[in] n_offset_num is number of offsets to be processed by all the threads
 *		(it is only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] p_seeds is pointer to seed vectors
 *	@param[in] n_job_offset is offset in term occurences (slice start; slice length is given by JIT_OCCURENCE_SLICE)
 *
 *	@remarks This kernel is launched with each thread working on a single offset.
 */
__kernel void BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice(/*unsigned int n_vector_length,*/ __global int *p_vector,
	__global const unsigned int *p_chunk, __global const unsigned int *p_offset, const unsigned int n_offset_num,
	__global const unsigned int *p_occurence, /*const unsigned int n_seed_length,*/
	/*const unsigned int n_window_size,*/ __global const TSeed *p_seeds, const unsigned int n_job_offset)
{
	if((JIT_SEED_LENGTH) == 0)
		return;
	// nothing to add with empty seed vectors (compile-time constant test); Duff's device
	// below always executes at least one element and must not be entered with zero length

	{
		unsigned int i = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
		if(i >= n_offset_num)
			return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
		// each thread handles a single term in the list

		p_occurence += p_offset[i];
		// get head of occurence list
	}

	p_vector += JIT_VECTOR_LENGTH * *p_occurence ++;
	// get term vector (output)

	unsigned int n_occurence_num = *p_occurence ++;
	if(n_occurence_num <= n_job_offset)
		return;
	// no occurences left for this term (the unsigned subtraction below would wrap around)

	n_occurence_num = min(n_occurence_num - n_job_offset, (unsigned int)(JIT_OCCURENCE_SLICE));
	// get number of occurences (slice size); both arguments of min() need to be of the same type

	p_occurence += n_job_offset;
	// offset occurences

	const unsigned int n_starting_rotation = ((JIT_WINDOW_SIZE + (JIT_VECTOR_LENGTH - 1)) & ~(JIT_VECTOR_LENGTH - 1)) - JIT_WINDOW_SIZE;
	// rotation of left-most term under the window: window size rounded up to a multiple
	// of vector length, minus window size (it's -JIT_WINDOW_SIZE as positive modulo)

	for(; n_occurence_num; -- n_occurence_num, ++ p_occurence) {
		unsigned int n_position = *p_occurence;

		{
			unsigned int n_rotation = n_starting_rotation;
			const unsigned int n_max = n_position + JIT_WINDOW_SIZE;
			for(unsigned int k = n_position - JIT_WINDOW_SIZE; k <= n_max; ++ k, ++ n_rotation) {
				if(k == n_position)
					continue;
				// skip focused term (note the rotation is still incremented)

				__global const TSeed *p_seed = p_seeds + JIT_SEED_LENGTH * p_chunk[k];

				unsigned int n_repeat_num = (JIT_SEED_LENGTH + 7) >> 3;
				TSeed s;
				switch(JIT_SEED_LENGTH & 7) {
					do {
				case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
					} while(-- n_repeat_num);
				}
				// add seed to the vector (Duff's device, unrolled eight times)
			}
			// terms on both sides of focused term
		}
		// loop over window
	}
	// loop over term occurences
}

/**
 *	@brief build term vectors kernel, processing all the term occurences from a given offset on
 *
 *	Kernel is working with seed vectors as source, vector length must be power of two,
 *	co-occuring terms are rotated based on their relative position (not just shift by 1)
 *
 *	The record in the occurence list, selected by the offset, begins with term id, followed
 *	by number of occurences and by the positions of the occurences in the chunk. All the
 *	occurences starting at n_job_offset are processed. If the term has n_job_offset
 *	or less occurences, the thread does nothing.
 *
 *	JIT_VECTOR_LENGTH, JIT_WINDOW_SIZE and JIT_SEED_LENGTH need to be defined when compiling this kernel.
 *
 *	@param[in,out] p_vector is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk (its elements select the seed vectors)
 *	@param[in] p_offset is pointer to offset list
 *	@param[in] n_offset_num is number of offsets to be processed by all the threads
 *		(it is only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] p_seeds is pointer to seed vectors
 *	@param[in] n_job_offset is offset in term occurences (slice start; slice length is given by number of remaining occurences)
 *
 *	@remarks This kernel is launched with each thread working on a single offset.
 */
__kernel void BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining(/*unsigned int n_vector_length,*/ __global int *p_vector,
	__global const unsigned int *p_chunk, __global const unsigned int *p_offset, const unsigned int n_offset_num,
	__global const unsigned int *p_occurence, /*const unsigned int n_seed_length,*/
	/*const unsigned int n_window_size,*/ __global const TSeed *p_seeds, const unsigned int n_job_offset)
{
	if((JIT_SEED_LENGTH) == 0)
		return;
	// nothing to add with empty seed vectors (compile-time constant test); Duff's device
	// below always executes at least one element and must not be entered with zero length

	{
		unsigned int i = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
		if(i >= n_offset_num)
			return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
		// each thread handles a single term in the list

		p_occurence += p_offset[i];
		// get head of occurence list
	}

	p_vector += JIT_VECTOR_LENGTH * *p_occurence ++;
	// get term vector (output)

	unsigned int n_occurence_num = *p_occurence ++;
	if(n_occurence_num <= n_job_offset)
		return;
	// no occurences left for this term (the unsigned subtraction below would wrap
	// around and the loop would run far past the end of the occurence list)

	n_occurence_num -= n_job_offset;
	// get number of occurences (all remaining)

	p_occurence += n_job_offset;
	// offset occurences

	const unsigned int n_starting_rotation = ((JIT_WINDOW_SIZE + (JIT_VECTOR_LENGTH - 1)) & ~(JIT_VECTOR_LENGTH - 1)) - JIT_WINDOW_SIZE;
	// rotation of left-most term under the window: window size rounded up to a multiple
	// of vector length, minus window size (it's -JIT_WINDOW_SIZE as positive modulo)

	for(; n_occurence_num; -- n_occurence_num, ++ p_occurence) {
		unsigned int n_position = *p_occurence;

		{
			unsigned int n_rotation = n_starting_rotation;
			const unsigned int n_max = n_position + JIT_WINDOW_SIZE;
			for(unsigned int k = n_position - JIT_WINDOW_SIZE; k <= n_max; ++ k, ++ n_rotation) {
				if(k == n_position)
					continue;
				// skip focused term (note the rotation is still incremented)

				__global const TSeed *p_seed = p_seeds + JIT_SEED_LENGTH * p_chunk[k];

				unsigned int n_repeat_num = (JIT_SEED_LENGTH + 7) >> 3;
				TSeed s;
				switch(JIT_SEED_LENGTH & 7) {
					do {
				case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
					} while(-- n_repeat_num);
				}
				// add seed to the vector (Duff's device, unrolled eight times)
			}
			// terms on both sides of focused term
		}
		// loop over window
	}
	// loop over term occurences
}

/**
 *	@brief build term vectors kernel, optimized for low register usage (needs 15 registers only)
 *
 *	Kernel is working with seed vectors as source, vector length must be power of two,
 *	co-occuring terms are rotated based on their relative position (not just shift by 1)
 *
 *	The record in the occurence list, selected by the offset, begins with term id, followed
 *	by number of occurences and by the positions of the occurences in the chunk. For each
 *	occurence, seed vectors of the terms at positions from (position - JIT_WINDOW_SIZE)
 *	to (position + JIT_WINDOW_SIZE), except for the position itself, are added to the term vector.
 *
 *	JIT_VECTOR_LENGTH, JIT_WINDOW_SIZE and JIT_SEED_LENGTH need to be defined when compiling this kernel.
 *
 *	@param[in,out] p_vector is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk (its elements select the seed vectors)
 *	@param[in] p_offset is pointer to offset list
 *	@param[in] n_offset_num is number of offsets to be processed by all the threads
 *		(it is only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] p_seeds is pointer to seed vectors
 *
 *	@remarks This kernel is launched with each thread working on a single offset.
 */
__kernel void BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT(/*unsigned int n_vector_length,*/ __global int *p_vector,
	__global const unsigned int *p_chunk, __global const unsigned int *p_offset, const unsigned int n_offset_num,
	__global const unsigned int *p_occurence, /*const unsigned int n_seed_length,*/
	/*const unsigned int n_window_size,*/ __global const TSeed *p_seeds)
{
	if((JIT_SEED_LENGTH) == 0)
		return;
	// nothing to add with empty seed vectors (compile-time constant test); Duff's device
	// below always executes at least one element and must not be entered with zero length

	{
		unsigned int i = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
		if(i >= n_offset_num)
			return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
		// each thread handles a single term in the list

		p_occurence += p_offset[i];
		// get head of occurence list
	}

	p_vector += JIT_VECTOR_LENGTH * *p_occurence ++;
	// get term vector (output)

	unsigned int n_occurence_num = *p_occurence ++;
	// get term number of occurences

	const unsigned int n_starting_rotation = ((JIT_WINDOW_SIZE + (JIT_VECTOR_LENGTH - 1)) & ~(JIT_VECTOR_LENGTH - 1)) - JIT_WINDOW_SIZE;
	// rotation of left-most term under the window: window size rounded up to a multiple
	// of vector length, minus window size (it's -JIT_WINDOW_SIZE as positive modulo)

	for(; n_occurence_num; -- n_occurence_num, ++ p_occurence) {
		unsigned int n_position = *p_occurence;

		{
			unsigned int n_rotation = n_starting_rotation;
			const unsigned int n_max = n_position + JIT_WINDOW_SIZE;
			for(unsigned int k = n_position - JIT_WINDOW_SIZE; k <= n_max; ++ k, ++ n_rotation) {
				if(k == n_position)
					continue;
				// skip focused term (note the rotation is still incremented)

				__global const TSeed *p_seed = p_seeds + JIT_SEED_LENGTH * p_chunk[k];

				unsigned int n_repeat_num = (JIT_SEED_LENGTH + 7) >> 3;
				TSeed s;
				switch(JIT_SEED_LENGTH & 7) {
					do {
				case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
				case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(s);
					} while(-- n_repeat_num);
				}
				// add seed to the vector (Duff's device, unrolled eight times)
			}
			// terms on both sides of focused term
		}
		// loop over window
	}
	// loop over term occurences
}

/**
 *	@brief build term vectors kernel, optimized for low register usage (reported to need 17 registers)
 *
 *	Kernel is working with seed vectors as source, vector length must be power of two,
 *	co-occuring terms are rotated based on their relative position (not just shift by 1)
 *
 *	Each work-item reads one record of the occurence list, located at p_occurence + p_offset[id].
 *	The record is read as: term id, number of occurences, followed by that many positions
 *	(the positions are used as indices to p_chunk). For every position, seeds of all the
 *	terms within the window (except for the focused term itself) are accumulated to the
 *	term vector of the record's term.
 *
 *	To keep the number of live values low, the arguments are reused as working variables:
 *	p_occurence and p_vector are advanced in place and n_vector_length is turned into a bit mask.
 *
 *	@param[in] n_vector_length is term vector length (must be power of two)
 *	@param[in,out] p_vector is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk
 *	@param[in] p_offset is pointer to offset list
 *	@param[in] n_offset_num is number of offsets to be processed by all the threads
 *		(only used if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] n_seed_length is seed vector length (if zero, the kernel does nothing)
 *	@param[in] n_window_size is half-window size
 *	@param[in] p_seeds is pointer to seed vectors
 *
 *	@remarks This kernel is launched with each thread working on a single offset.
 *
 *	@note Term vector elements are updated using plain (non-atomic) read-modify-write;
 *		presumably no two offsets processed by a single launch refer to the same term
 *		(to be confirmed in the host code).
 *	@note The window bounds are calculated as n_position -/+ n_window_size with no range check;
 *		presumably the host only lists positions that are at least n_window_size elements
 *		away from both ends of the chunk (to be confirmed in the host code).
 */
__kernel void BuildTermVectors_Seed_POT_TrueRot_RegOpts(unsigned int n_vector_length, __global int *p_vector,
	__global const unsigned int *p_chunk, __global const unsigned int *p_offset, const unsigned int n_offset_num,
	__global const unsigned int *p_occurence, const unsigned int n_seed_length,
	const unsigned int n_window_size, __global const TSeed *p_seeds)
{
	if(!n_seed_length)
		return;
	// empty seeds add nothing to the vectors; this also protects the Duff's device below,
	// which would otherwise process 8 elements and wrap n_repeat_num around zero

	{
		unsigned int i = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
		if(i >= n_offset_num)
			return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
		// each thread handles a single term in the list

		p_occurence += p_offset[i];
		// get head of occurence list
	}
	// the work-item index is not needed past this point

	p_vector += n_vector_length * *p_occurence ++;
	// get term vector (output); the first element of the record is term id

	unsigned int n_occurence_num = *p_occurence ++;
	// get term number of occurences

	-- n_vector_length;
	// n_vector_length is power of two, now it's a mask

	const unsigned int n_starting_rotation = ((n_window_size + n_vector_length) & ~n_vector_length) - n_window_size;
	// n_window_size rounded up to multiple of vector length, minus n_window_size: rotation
	// of the left-most term under the window (equals -n_window_size modulo vector length,
	// but it is never negative)

	for(; n_occurence_num; -- n_occurence_num, ++ p_occurence) {
		unsigned int n_position = *p_occurence;

		{
			unsigned int n_rotation = n_starting_rotation;
			const unsigned int n_max = n_position + n_window_size;
			for(unsigned int k = n_position - n_window_size; k <= n_max; ++ k, ++ n_rotation) {
				if(k == n_position)
					continue;
				// skip focused term (n_rotation is still incremented by the for statement,
				// so the rotations of the terms to the right start at 1 modulo vector length)

				__global const TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				// seed vector of the term at position k

				unsigned int n_repeat_num = (n_seed_length + 7) >> 3;
				TSeed s;
				switch(n_seed_length & 7) {
					do {
				case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length] += n_Seed_Value(s);
				case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length] += n_Seed_Value(s);
				case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length] += n_Seed_Value(s);
				case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length] += n_Seed_Value(s);
				case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length] += n_Seed_Value(s);
				case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length] += n_Seed_Value(s);
				case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length] += n_Seed_Value(s);
				case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length] += n_Seed_Value(s);
					} while(-- n_repeat_num);
				}
				// add seed to the vector (Duff's device: the switch jumps into the unrolled
				// loop to handle n_seed_length % 8 elements first, the cases fall through
				// intentionally; requires n_seed_length > 0, which is guaranteed above)
			}
			// terms on both sides of focused term
		}
		// loop over window
	}
	// loop over term occurences
}

/**
 *	@brief build term vectors kernel
 *
 *	Kernel is working with seed vectors as source, vector length must be power of two,
 *	co-occuring terms are rotated based on their relative position (not just shift by 1)
 *
 *	Each work-item reads one record of the occurence list, located at p_occurence_list + p_offset[id].
 *	The record is read as: term id, number of occurences, followed by that many positions
 *	(the positions are used as indices to p_chunk). For every position, seeds of all the
 *	terms within the window (except for the focused term itself) are accumulated to the
 *	term vector of the record's term.
 *
 *	Compile-time switches: if __SEPARATE_LOOPS__ is defined, the window is processed
 *	by two loops (left and right of the focused term) instead of a single one;
 *	if __ENABLE_DUFFS_DEVICE__ is defined, seeds are added using Duff's device
 *	instead of a simple loop.
 *
 *	@param[in] n_vector_length is term vector length (must be power of two)
 *	@param[in,out] p_vectors_gpu is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk
 *	@param[in] p_offset is pointer to offset list
 *	@param[in] n_offset_num is number of offsets to be processed by all the threads
 *		(only used if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence_list is pointer to term occurence list
 *	@param[in] n_seed_length is seed vector length (if zero, the kernel does nothing)
 *	@param[in] n_window_size is half-window size
 *	@param[in] p_seeds is pointer to seed vectors
 *
 *	@remarks This kernel is launched with each thread working on a single offset.
 *
 *	@note Term vector elements are updated using plain (non-atomic) read-modify-write;
 *		presumably no two offsets processed by a single launch refer to the same term
 *		(to be confirmed in the host code).
 *	@note The window bounds are calculated as n_position -/+ n_window_size with no range check;
 *		presumably the host only lists positions that are at least n_window_size elements
 *		away from both ends of the chunk (to be confirmed in the host code).
 */
__kernel void BuildTermVectors_Seed_POT_TrueRot(const unsigned int n_vector_length, __global int *p_vectors_gpu,
	__global const unsigned int *p_chunk, __global const unsigned int *p_offset, const unsigned int n_offset_num,
	__global const unsigned int *p_occurence_list, const unsigned int n_seed_length,
	const unsigned int n_window_size, __global const TSeed *p_seeds)
{
	if(!n_seed_length)
		return;
	// empty seeds add nothing to the vectors (the simple loop would write nothing);
	// this also protects the Duff's device variant, which would otherwise process
	// 8 elements and wrap n_repeat_num around zero

	unsigned int i = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(i >= n_offset_num)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term in the list

	__global const unsigned int *p_occurence = p_occurence_list + p_offset[i];
	// get head of occurence list

	unsigned int n_term = *p_occurence ++;
	unsigned int n_occurence_num = *p_occurence ++;
	// get term id and number of occurences (p_occurence now points to the first position)

	__global int *p_vector = p_vectors_gpu + n_vector_length * n_term;
	// get term vector (output)

	const unsigned int n_vector_length_mask = n_vector_length - 1;
	// n_vector_length is power of two

	const unsigned int n_start_rotation = ((n_window_size + n_vector_length_mask) & ~n_vector_length_mask) - n_window_size;
	// offset of left-most term under the window (it's positive modulo): the smallest multiple
	// of n_vector_length greater than or equal to n_window_size, minus n_window_size

	for(unsigned int j = 0; j < n_occurence_num; ++ j) {
		unsigned int n_position = p_occurence[j];

#ifndef __SEPARATE_LOOPS__
		{
			unsigned int n_rotation = n_start_rotation;
			const unsigned int n_max = n_position + n_window_size + 1;
			for(unsigned int k = n_position - n_window_size; k < n_max; ++ k, ++ n_rotation) {
				if(k == n_position)
					continue;
				// skip focused term (n_rotation is still incremented by the for statement,
				// so the rotations of the terms to the right start at 1 modulo vector length)

				__global const TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				// seed vector of the term at position k

#ifndef __ENABLE_DUFFS_DEVICE__
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				}
#else //__ENABLE_DUFFS_DEVICE__
				unsigned int n_repeat_num = (n_seed_length + 7) >> 3;
				TSeed s;
				switch(n_seed_length & 7) {
					do {
				case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
					} while(-- n_repeat_num);
				}
				// Duff's device: the cases fall through intentionally, n_seed_length > 0 is guaranteed above
#endif //__ENABLE_DUFFS_DEVICE__
				// add seed to the vector
			}
			// terms on both sides of focused term
		}
#else //__SEPARATE_LOOPS__
		{
			unsigned int n_rotation = n_start_rotation;
			for(unsigned int k = n_position - n_window_size; k < n_position; ++ k, ++ n_rotation) {
				__global const TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				// seed vector of the term at position k

#ifndef __ENABLE_DUFFS_DEVICE__
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				}
#else //__ENABLE_DUFFS_DEVICE__
				unsigned int n_repeat_num = (n_seed_length + 7) >> 3;
				TSeed s;
				switch(n_seed_length & 7) {
					do {
				case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
					} while(-- n_repeat_num);
				}
				// Duff's device: the cases fall through intentionally, n_seed_length > 0 is guaranteed above
#endif //__ENABLE_DUFFS_DEVICE__
				// add seed to the vector
			}
			// terms left of focused term
		}
		{
			unsigned int n_rotation = 1;
			// the first term right of the focused term is rotated by one

			const unsigned int n_max = n_position + n_window_size + 1;
			for(unsigned int k = n_position + 1; k < n_max; ++ k, ++ n_rotation) {
				__global const TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				// seed vector of the term at position k

#ifndef __ENABLE_DUFFS_DEVICE__
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				}
#else //__ENABLE_DUFFS_DEVICE__
				unsigned int n_repeat_num = (n_seed_length + 7) >> 3;
				TSeed s;
				switch(n_seed_length & 7) {
					do {
				case 0: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 7: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 6: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 5: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 4: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 3: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 2: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 1: s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
					} while(-- n_repeat_num);
				}
				// Duff's device: the cases fall through intentionally, n_seed_length > 0 is guaranteed above
#endif //__ENABLE_DUFFS_DEVICE__
				// add seed to the vector
			}
			// terms right of focused term
		}
#endif //__SEPARATE_LOOPS__
		// loop over window
	}
	// loop over term occurences
}

/*
__kernel void BuildTermVectors_Seed_NPOT_TrueRot_WorkOffset(const unsigned int n_vector_length, __global int *p_vectors_gpu,
	__global const unsigned int *p_chunk, __global const unsigned int *p_offset, const unsigned int n_work_offset, const unsigned int n_offset_num,
	__global const unsigned int *p_occurence_list, const unsigned int n_seed_length,
	const unsigned int n_window_size, __global const TSeed *p_seeds)
{
	unsigned int i = get_global_id(0) + n_work_offset;
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(i >= n_offset_num)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term in the list

	__global const unsigned int *p_occurence = p_occurence_list + p_offset[i];
	// get head of occurence list

	unsigned int n_term = *p_occurence ++;
	unsigned int n_occurence_num = *p_occurence ++;
	// get term id and number of occurences

	__global int *p_vector = p_vectors_gpu + n_vector_length * n_term;
	// get term vector (output)

	const unsigned int n_nearest_greater_vector_length = (n_window_size + n_vector_length - 1) -
		(n_window_size + n_vector_length - 1) % n_vector_length;
	// smallest multiple of n_vector_length greater than or equal to n_window_size

	for(unsigned int j = 0; j < n_occurence_num; ++ j) {
		unsigned int n_position = p_occurence[j];

		{
			unsigned int n_rotation = n_nearest_greater_vector_length - n_window_size;
			unsigned int n_min = n_position - n_window_size;
			for(unsigned int k = n_min; k < n_position; ++ k, ++ n_rotation) {
				__global const TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
				}
				// add seed to the vector (todo - apply some kind of loop optimization / Duff's device)
			}
			// terms left of focused term
		}
		{
			unsigned int n_rotation = 1;
			unsigned int n_max = n_position + n_window_size + 1;
			for(unsigned int k = n_position + 1; k < n_max; ++ k, ++ n_rotation) {
				__global const TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
				}
				// add seed to the vector (todo - apply some kind of loop optimization / Duff's device)
			}
			// terms right of focused term
		}
		// loop over window
	}
	// loop over term occurences
}

__kernel void BuildTermVectors_Seed_NPOT_TrueRot_NoConst(unsigned int n_vector_length, __global int *p_vectors_gpu,
	__global unsigned int *p_chunk, __global unsigned int *p_offset, unsigned int n_offset_num,
	__global unsigned int *p_occurence_list, unsigned int n_seed_length,
	unsigned int n_window_size, __global TSeed *p_seeds)
{
	unsigned int i = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(i >= n_offset_num)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term in the list

	__global unsigned int *p_occurence = p_occurence_list + p_offset[i];
	// get head of occurence list

	unsigned int n_term = *p_occurence ++;
	unsigned int n_occurence_num = *p_occurence ++;
	// get term id and number of occurences

	__global int *p_vector = p_vectors_gpu + n_vector_length * n_term;
	// get term vector (output)

	unsigned int n_nearest_greater_vector_length = (n_window_size + n_vector_length - 1) -
		(n_window_size + n_vector_length - 1) % n_vector_length;
	// smallest multiple of n_vector_length greater than or equal to n_window_size

	for(unsigned int j = 0; j < n_occurence_num; ++ j) {
		unsigned int n_position = p_occurence[j];

		{
			unsigned int n_rotation = n_nearest_greater_vector_length - n_window_size;
			unsigned int n_min = n_position - n_window_size;
			for(unsigned int k = n_min; k < n_position; ++ k, ++ n_rotation) {
				__global TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
				}
				// add seed to the vector (todo - apply some kind of loop optimization / Duff's device)
			}
			// terms left of focused term
		}
		{
			unsigned int n_rotation = 1;
			unsigned int n_max = n_position + n_window_size + 1;
			for(unsigned int k = n_position + 1; k < n_max; ++ k, ++ n_rotation) {
				__global TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
				}
				// add seed to the vector (todo - apply some kind of loop optimization / Duff's device)
			}
			// terms right of focused term
		}
		// loop over window
	}
	// loop over term occurences
}

__kernel void BuildTermVectors_Seed_NPOT_TrueRot_WorkOffset_NoConst(unsigned int n_vector_length, __global int *p_vectors_gpu,
	__global unsigned int *p_chunk, __global unsigned int *p_offset, unsigned int n_work_offset, unsigned int n_offset_num,
	__global unsigned int *p_occurence_list, unsigned int n_seed_length,
	unsigned int n_window_size, __global TSeed *p_seeds)
{
	unsigned int i = get_global_id(0) + n_work_offset;
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(i >= n_offset_num)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term in the list

	__global unsigned int *p_occurence = p_occurence_list + p_offset[i];
	// get head of occurence list

	unsigned int n_term = *p_occurence ++;
	unsigned int n_occurence_num = *p_occurence ++;
	// get term id and number of occurences

	__global int *p_vector = p_vectors_gpu + n_vector_length * n_term;
	// get term vector (output)

	const unsigned int n_nearest_greater_vector_length = (n_window_size + n_vector_length - 1) -
		(n_window_size + n_vector_length - 1) % n_vector_length;
	// smallest multiple of n_vector_length greater than or equal to n_window_size

	for(unsigned int j = 0; j < n_occurence_num; ++ j) {
		unsigned int n_position = p_occurence[j];

		{
			unsigned int n_rotation = n_nearest_greater_vector_length - n_window_size;
			unsigned int n_min = n_position - n_window_size;
			for(unsigned int k = n_min; k < n_position; ++ k, ++ n_rotation) {
				__global TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
				}
				// add seed to the vector (todo - apply some kind of loop optimization / Duff's device)
			}
			// terms left of focused term
		}
		{
			unsigned int n_rotation = 1;
			unsigned int n_max = n_position + n_window_size + 1;
			for(unsigned int k = n_position + 1; k < n_max; ++ k, ++ n_rotation) {
				__global TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) % n_vector_length] += n_Seed_Value(s);
				}
				// add seed to the vector (todo - apply some kind of loop optimization / Duff's device)
			}
			// terms right of focused term
		}
		// loop over window
	}
	// loop over term occurences
}
*/

