/**
 *	@file CLKernel_v3.c
 *	@author -tHE SWINe-
 *	@brief OpenCL kernels for v2 random indexing (the third version)
 *	@date 2010-08-19
 *
 *	Functions marked with _JIT suffix rely on the following values being defined when compiling this file:
 *
 *	- JIT_VECTOR_LENGTH is term vector length
 *	- JIT_WINDOW_SIZE is half-window size
 *	- JIT_SEED_LENGTH is seed vector length
 *	- JIT_MAX_SLICE_LENGTH is maximal slice length
 *	- JIT_HALF_SLICE_LENGTH is half maximal slice length
 *	- JIT_QUARTER_SLICE_LENGTH is quarter maximal slice length
 *
 *	There are also some optimization-specific defines:
 *
 *	- __FORCE_WORK_BOUNDS_CHECK__ forces each thread to check its global id for exceeding task size
 *	- __SUMMATION_STEP_FORCE_WORK_BOUNDS_CHECK__ forces each summation step thread to check its global id for exceeding task size
 *	- __SUMMATION_STEP_ENABLE_DUFFS_DEVICE__ enables duff's device in summation step (otherwise there's simple loop)
 *	- __BTV_STEP_ENABLE_DUFFS_DEVICE__ enables Duff's device in build term vectors step (otherwise there's unrolled loop)
 *	- __RETRAIN__ enables compilation of retraining code and disables compilation of seed code
 *	- __NPOT__ enables compilation of NPOT code and disables compilation of POT code
 */

/**
 *	@brief term vector element type
 */
typedef int TTermScalar;

/**
 *	@brief seed vector element type (scalar)
 *
 *	Seed vector is a sparse vector, containing +1's, -1's and 0's (dummy vector).
 *	A single TSeed packs two fields: the value (one of -1, 0 or +1) in the two
 *	topmost bits and the (zero-based) offset of that value in the vector
 *	in the remaining low bits.
 */
typedef unsigned short TSeed;

/**
 *	@brief seed vector constants (bit layout of TSeed)
 */
enum {
	seed_ValueShift = 14,					/**< @brief shift of value bit */
	seed_ValueBit = 1 << seed_ValueShift,	/**< @brief location of value bit */
	seed_SignBit = seed_ValueBit << 1,		/**< @brief location of sign bit */
	seed_IndexMask = seed_ValueBit - 1		/**< @brief mask of index to term vector */
};

/**
 *	@brief extracts seed value from seed element
 *
 *	The element is reinterpreted as a signed 16-bit integer and shifted right,
 *	so the sign bit and the value bit are read as a two-bit signed number
 *	(this relies on right shift of a negative value being arithmetic).
 *
 *	@param[in] s is seed element (generated by n_RandomSeed() or n_DummySeed())
 *
 *	@return Returns seed value (-1, 0 or +1).
 */
#define n_Seed_Value(s) ((TTermScalar)(((short)(s)) >> seed_ValueShift))

/**
 *	@brief extracts index to term vector from seed element
 *
 *	@param[in] s is seed element (generated by n_RandomSeed() or n_DummySeed())
 *
 *	@return Returns index to term vector (the low bits selected by seed_IndexMask).
 */
#define n_Seed_Index(s) (seed_IndexMask & (int)(s))

/**
 *	@brief clears a range of 16-bit integers (zero-filling counterpart of "C" stdlib memset())
 *
 *	@param[out] p_dest is pointer to the destination memory
 *	@param[in] n_first is offset in erased array, in 16-bit integers
 *	@param[in] n_size is size of erased array, in 16-bit integers
 *		(only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *
 *	@remarks This kernel is launched with each thread writing to a single memory location
 *		(todo - for longer buffers, it would be probably better to have less threads,
 *		each writing small block of memory).
 */
__kernel void ZeroMemory16(__global short *p_dest, const unsigned int n_first, const unsigned int n_size)
{
	const unsigned int n_thread = get_global_id(0);
	// each thread clears the element at its own global id

#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_thread < n_size)
#endif //__FORCE_WORK_BOUNDS_CHECK__
		p_dest[n_first + n_thread] = 0;
	// without the bounds check, the write is unconditional
}

/**
 *	@brief clears a range of 32-bit integers (zero-filling counterpart of "C" stdlib memset())
 *
 *	@param[out] p_dest is pointer to the destination memory
 *	@param[in] n_first is offset in erased array, in 32-bit integers
 *	@param[in] n_size is size of erased array, in 32-bit integers
 *		(only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *
 *	@remarks This kernel is launched with each thread writing to a single memory location
 *		(todo - for longer buffers, it would be probably better to have less threads,
 *		each writing small block of memory).
 */
__kernel void ZeroMemory32(__global int *p_dest, const unsigned int n_first, const unsigned int n_size)
{
	const unsigned int n_thread = get_global_id(0);
	// each thread clears the element at its own global id

#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_thread < n_size)
#endif //__FORCE_WORK_BOUNDS_CHECK__
		p_dest[n_first + n_thread] = 0;
	// without the bounds check, the write is unconditional
}

#ifndef __RETRAIN__

/**
 *	@brief summation step kernel
 *
 *	Each thread sums one element (the one at its global id) over n_sum_length
 *	dummy vectors stored JIT_VECTOR_LENGTH elements apart, and adds the sum
 *	to the corresponding element of the destination vector.
 *
 *	@param[in,out] p_vector is pointer to term vectors
 *	@param[in] n_dest_vector_first_elem is zero-based index of first destination element (n_term_id times n_vector_length)
 *	@param[in] p_dummy_vector_banks is pointer to dummy term vector banks
 *	@param[in] n_first_bank_first_elem is zero-based index of first source element (n_dummy_term_id times n_vector_length)
 *	@param[in] n_sum_length is number of dummy vectors, participating in the sum (may be zero,
 *		in which case the destination element is left with its value)
 *
 *	@remarks This kernel is launched with each thread working on a single vector element.
 */
__kernel void SummationStep_JIT(__global TTermScalar *p_vector,
	const unsigned int n_dest_vector_first_elem, __global const TTermScalar *p_dummy_vector_banks,
	const unsigned int n_first_bank_first_elem, const unsigned int n_sum_length)
{
	unsigned int i = get_global_id(0);
#ifdef __SUMMATION_STEP_FORCE_WORK_BOUNDS_CHECK__
	if(i >= JIT_VECTOR_LENGTH)
		return;
#endif //__SUMMATION_STEP_FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term vector element

	TTermScalar n_sum = 0;
	p_dummy_vector_banks += n_first_bank_first_elem + i;
	// move to this thread's element in the first summed vector

#ifndef __SUMMATION_STEP_ENABLE_DUFFS_DEVICE__
	for(unsigned int n_bank = 0; n_bank < n_sum_length; ++ n_bank) {
		n_sum += *p_dummy_vector_banks;
		p_dummy_vector_banks += JIT_VECTOR_LENGTH;
	}
	// simple sum - it's usually relatively short (2 to 100 loops)
#else //__SUMMATION_STEP_ENABLE_DUFFS_DEVICE__
	if(n_sum_length) {
		// an empty sum must be skipped explicitly: entering the device with zero length
		// would run "case 0" and then decrement a zero repeat counter, wrapping it around

		unsigned int n_repeat_num = (n_sum_length + 7) >> 3;
		switch(n_sum_length & 7) {
			do {
		case 0: n_sum += *p_dummy_vector_banks; p_dummy_vector_banks += JIT_VECTOR_LENGTH; // fallthrough
		case 7: n_sum += *p_dummy_vector_banks; p_dummy_vector_banks += JIT_VECTOR_LENGTH; // fallthrough
		case 6: n_sum += *p_dummy_vector_banks; p_dummy_vector_banks += JIT_VECTOR_LENGTH; // fallthrough
		case 5: n_sum += *p_dummy_vector_banks; p_dummy_vector_banks += JIT_VECTOR_LENGTH; // fallthrough
		case 4: n_sum += *p_dummy_vector_banks; p_dummy_vector_banks += JIT_VECTOR_LENGTH; // fallthrough
		case 3: n_sum += *p_dummy_vector_banks; p_dummy_vector_banks += JIT_VECTOR_LENGTH; // fallthrough
		case 2: n_sum += *p_dummy_vector_banks; p_dummy_vector_banks += JIT_VECTOR_LENGTH; // fallthrough
		case 1: n_sum += *p_dummy_vector_banks; p_dummy_vector_banks += JIT_VECTOR_LENGTH;
			} while(-- n_repeat_num);
		}
	}
	// Duff's device (the first pass handles the remainder, the others run all eight steps)
#endif //__SUMMATION_STEP_ENABLE_DUFFS_DEVICE__

	p_vector[n_dest_vector_first_elem + i] += n_sum;
	// single read-modify-write of the destination element
}

#ifndef __NPOT__

/**
 *	@brief build term vectors kernel (seed vectors, power-of-two vector length)
 *
 *	For every occurence listed in the work-item, the seed vectors of the terms found
 *	at the JIT_WINDOW_SIZE chunk positions to either side of the occurence (the occurence
 *	itself is skipped) are added to the term vector of the work-item. The rotation applied
 *	to seed indices grows by one with every window position, so co-occuring terms are
 *	rotated based on their relative position (not just shift by 1). Indices wrap using
 *	bitwise and with JIT_VECTOR_LENGTH - 1, vector length therefore must be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk (its elements select seed vectors in p_seeds)
 *	@param[in] p_work_items is pointer to work-items (CDocumentSplitter::TWorkItem, packed as triplet of integers in order n_offset, n_length, n_term_id)
 *	@param[in] n_first_work_item is pass offset
 *	@param[in] n_work_item_num is pass size (only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] p_seeds is pointer to seed vectors (JIT_SEED_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single work-item.
 *	@note In case all the work-items have length of n_max_slice_length, it's better to use BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned()
 */
__kernel void BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems(__global TTermScalar *p_vector,
	__global const unsigned int *p_chunk, __global const unsigned int *p_work_items,
	const unsigned int n_first_work_item, const unsigned int n_work_item_num,
	__global const unsigned int *p_occurence, __global const TSeed *p_seeds)
{
	const unsigned int n_thread = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_thread >= n_work_item_num)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single work-item in the list

	__global const unsigned int *p_work_item = p_work_items + 3 * (n_first_work_item + n_thread);
	// work-items are triplets, and the pass starts at n_first_work_item

	p_occurence += p_work_item[0];
	// n_offset: head of the occurence list of this work-item

	const unsigned int n_slice_length = p_work_item[1];
	// n_length: number of occurences (slice size)

	p_vector += JIT_VECTOR_LENGTH * p_work_item[2];
	// n_term_id: selects the term vector (output)

	const unsigned int n_first_rotation = ((JIT_WINDOW_SIZE + (JIT_VECTOR_LENGTH - 1)) & ~(JIT_VECTOR_LENGTH - 1)) - JIT_WINDOW_SIZE;
	// JIT_WINDOW_SIZE rounded up to a multiple of vector length, minus JIT_WINDOW_SIZE

	for(unsigned int n_occ = 0; n_occ < n_slice_length; ++ n_occ) {
		const unsigned int n_focus = p_occurence[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, ++ n_rotation) {
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TSeed *p_seed = p_seeds + JIT_SEED_LENGTH * p_chunk[n_neighbor];
				// seed vector of the neighbouring term

#ifndef __BTV_STEP_ENABLE_DUFFS_DEVICE__
				#pragma unroll
				for(unsigned int n_elem = 0; n_elem < JIT_SEED_LENGTH; ++ n_elem) {
					const TSeed t_elem = p_seed[n_elem];
					p_vector[(n_Seed_Index(t_elem) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(t_elem);
				}
				// plain loop, left for the compiler to unroll
#else //__BTV_STEP_ENABLE_DUFFS_DEVICE__
#define BTV_ADD_NEXT_SEED_ELEM() \
				do { \
					const TSeed t_elem = *p_seed ++; \
					p_vector[(n_Seed_Index(t_elem) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(t_elem); \
				} while(0)

				unsigned int n_pass_num = (JIT_SEED_LENGTH + 7) >> 3;
				switch(JIT_SEED_LENGTH & 7) {
					do {
				case 0: BTV_ADD_NEXT_SEED_ELEM();
				case 7: BTV_ADD_NEXT_SEED_ELEM();
				case 6: BTV_ADD_NEXT_SEED_ELEM();
				case 5: BTV_ADD_NEXT_SEED_ELEM();
				case 4: BTV_ADD_NEXT_SEED_ELEM();
				case 3: BTV_ADD_NEXT_SEED_ELEM();
				case 2: BTV_ADD_NEXT_SEED_ELEM();
				case 1: BTV_ADD_NEXT_SEED_ELEM();
					} while(-- n_pass_num);
				}
				// Duff's device

#undef BTV_ADD_NEXT_SEED_ELEM
#endif //__BTV_STEP_ENABLE_DUFFS_DEVICE__
				// the rotated seed is now added to the term vector
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

/**
 *	@brief build term vectors kernel for passes where all the work-items have length of n_max_slice_length
 *		(seed vectors, power-of-two vector length)
 *
 *	Works as BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems(), except that the length
 *	stored in the work-item is not read; exactly JIT_MAX_SLICE_LENGTH occurences are processed.
 *	Indices wrap using bitwise and with JIT_VECTOR_LENGTH - 1, vector length therefore must be
 *	power of two. Co-occuring terms are rotated based on their relative position (not just shift by 1).
 *
 *	@param[in,out] p_vector is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk (its elements select seed vectors in p_seeds)
 *	@param[in] p_work_items is pointer to work-items (CDocumentSplitter::TWorkItem, packed as triplet of integers in order n_offset, n_length, n_term_id)
 *	@param[in] n_first_work_item is pass offset
 *	@param[in] n_work_item_num is pass size (only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] p_seeds is pointer to seed vectors (JIT_SEED_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single work-item.
 */
__kernel void BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned(__global TTermScalar *p_vector,
	__global const unsigned int *p_chunk, __global const unsigned int *p_work_items,
	const unsigned int n_first_work_item, const unsigned int n_work_item_num,
	__global const unsigned int *p_occurence, __global const TSeed *p_seeds)
{
	const unsigned int n_thread = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_thread >= n_work_item_num)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single work-item in the list

	__global const unsigned int *p_work_item = p_work_items + 3 * (n_first_work_item + n_thread);
	// work-items are triplets, and the pass starts at n_first_work_item

	p_occurence += p_work_item[0];
	// n_offset: head of the occurence list of this work-item

	p_vector += JIT_VECTOR_LENGTH * p_work_item[2];
	// n_term_id: selects the term vector (output); n_length is implied by the kernel

	const unsigned int n_first_rotation = ((JIT_WINDOW_SIZE + (JIT_VECTOR_LENGTH - 1)) & ~(JIT_VECTOR_LENGTH - 1)) - JIT_WINDOW_SIZE;
	// JIT_WINDOW_SIZE rounded up to a multiple of vector length, minus JIT_WINDOW_SIZE

	for(unsigned int n_occ = 0; n_occ < JIT_MAX_SLICE_LENGTH; ++ n_occ) {
		const unsigned int n_focus = p_occurence[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, ++ n_rotation) {
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TSeed *p_seed = p_seeds + JIT_SEED_LENGTH * p_chunk[n_neighbor];
				// seed vector of the neighbouring term

#ifndef __BTV_STEP_ENABLE_DUFFS_DEVICE__
				#pragma unroll
				for(unsigned int n_elem = 0; n_elem < JIT_SEED_LENGTH; ++ n_elem) {
					const TSeed t_elem = p_seed[n_elem];
					p_vector[(n_Seed_Index(t_elem) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(t_elem);
				}
				// plain loop, left for the compiler to unroll
#else //__BTV_STEP_ENABLE_DUFFS_DEVICE__
#define BTV_ADD_NEXT_SEED_ELEM() \
				do { \
					const TSeed t_elem = *p_seed ++; \
					p_vector[(n_Seed_Index(t_elem) + n_rotation) & (JIT_VECTOR_LENGTH - 1)] += n_Seed_Value(t_elem); \
				} while(0)

				unsigned int n_pass_num = (JIT_SEED_LENGTH + 7) >> 3;
				switch(JIT_SEED_LENGTH & 7) {
					do {
				case 0: BTV_ADD_NEXT_SEED_ELEM();
				case 7: BTV_ADD_NEXT_SEED_ELEM();
				case 6: BTV_ADD_NEXT_SEED_ELEM();
				case 5: BTV_ADD_NEXT_SEED_ELEM();
				case 4: BTV_ADD_NEXT_SEED_ELEM();
				case 3: BTV_ADD_NEXT_SEED_ELEM();
				case 2: BTV_ADD_NEXT_SEED_ELEM();
				case 1: BTV_ADD_NEXT_SEED_ELEM();
					} while(-- n_pass_num);
				}
				// Duff's device

#undef BTV_ADD_NEXT_SEED_ELEM
#endif //__BTV_STEP_ENABLE_DUFFS_DEVICE__
				// the rotated seed is now added to the term vector
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

#else //__NPOT__

/**
 *	@brief build term vectors kernel (seed vectors, arbitrary vector length)
 *
 *	For every occurence listed in the work-item, the seed vectors of the terms found
 *	at the JIT_WINDOW_SIZE chunk positions to either side of the occurence (the occurence
 *	itself is skipped) are added to the term vector of the work-item. The rotation applied
 *	to seed indices grows by one with every window position, so co-occuring terms are
 *	rotated based on their relative position (not just shift by 1). Indices wrap using
 *	modulo JIT_VECTOR_LENGTH, vector length therefore does not need to be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk (its elements select seed vectors in p_seeds)
 *	@param[in] p_work_items is pointer to work-items (CDocumentSplitter::TWorkItem, packed as triplet of integers in order n_offset, n_length, n_term_id)
 *	@param[in] n_first_work_item is pass offset
 *	@param[in] n_work_item_num is pass size (only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] p_seeds is pointer to seed vectors (JIT_SEED_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single work-item.
 *	@note In case all the work-items have length of n_max_slice_length, it's better to use BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned()
 */
__kernel void BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT_v2WorkItems(__global TTermScalar *p_vector,
	__global const unsigned int *p_chunk, __global const unsigned int *p_work_items,
	const unsigned int n_first_work_item, const unsigned int n_work_item_num,
	__global const unsigned int *p_occurence, __global const TSeed *p_seeds)
{
	const unsigned int n_thread = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_thread >= n_work_item_num)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single work-item in the list

	__global const unsigned int *p_work_item = p_work_items + 3 * (n_first_work_item + n_thread);
	// work-items are triplets, and the pass starts at n_first_work_item

	p_occurence += p_work_item[0];
	// n_offset: head of the occurence list of this work-item

	const unsigned int n_slice_length = p_work_item[1];
	// n_length: number of occurences (slice size)

	p_vector += JIT_VECTOR_LENGTH * p_work_item[2];
	// n_term_id: selects the term vector (output)

	const unsigned int n_first_rotation = (JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH - 1) -
		(JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH - 1) % JIT_VECTOR_LENGTH - JIT_WINDOW_SIZE;
	// JIT_WINDOW_SIZE rounded up to a multiple of vector length, minus JIT_WINDOW_SIZE

	for(unsigned int n_occ = 0; n_occ < n_slice_length; ++ n_occ) {
		const unsigned int n_focus = p_occurence[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, ++ n_rotation) {
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TSeed *p_seed = p_seeds + JIT_SEED_LENGTH * p_chunk[n_neighbor];
				// seed vector of the neighbouring term

#ifndef __BTV_STEP_ENABLE_DUFFS_DEVICE__
				#pragma unroll
				for(unsigned int n_elem = 0; n_elem < JIT_SEED_LENGTH; ++ n_elem) {
					const TSeed t_elem = p_seed[n_elem];
					p_vector[(n_Seed_Index(t_elem) + n_rotation) % JIT_VECTOR_LENGTH] += n_Seed_Value(t_elem);
				}
				// plain loop, left for the compiler to unroll
#else //__BTV_STEP_ENABLE_DUFFS_DEVICE__
#define BTV_ADD_NEXT_SEED_ELEM() \
				do { \
					const TSeed t_elem = *p_seed ++; \
					p_vector[(n_Seed_Index(t_elem) + n_rotation) % JIT_VECTOR_LENGTH] += n_Seed_Value(t_elem); \
				} while(0)

				unsigned int n_pass_num = (JIT_SEED_LENGTH + 7) >> 3;
				switch(JIT_SEED_LENGTH & 7) {
					do {
				case 0: BTV_ADD_NEXT_SEED_ELEM();
				case 7: BTV_ADD_NEXT_SEED_ELEM();
				case 6: BTV_ADD_NEXT_SEED_ELEM();
				case 5: BTV_ADD_NEXT_SEED_ELEM();
				case 4: BTV_ADD_NEXT_SEED_ELEM();
				case 3: BTV_ADD_NEXT_SEED_ELEM();
				case 2: BTV_ADD_NEXT_SEED_ELEM();
				case 1: BTV_ADD_NEXT_SEED_ELEM();
					} while(-- n_pass_num);
				}
				// Duff's device

#undef BTV_ADD_NEXT_SEED_ELEM
#endif //__BTV_STEP_ENABLE_DUFFS_DEVICE__
				// the rotated seed is now added to the term vector
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

/**
 *	@brief build term vectors kernel for passes where all the work-items have length of n_max_slice_length
 *		(seed vectors, arbitrary vector length)
 *
 *	Works as BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT_v2WorkItems(), except that the length
 *	stored in the work-item is not read; exactly JIT_MAX_SLICE_LENGTH occurences are processed.
 *	Indices wrap using modulo JIT_VECTOR_LENGTH, vector length therefore does not need to be
 *	power of two. Co-occuring terms are rotated based on their relative position (not just shift by 1).
 *
 *	@param[in,out] p_vector is pointer to term vectors
 *	@param[in] p_chunk is pointer to current chunk (its elements select seed vectors in p_seeds)
 *	@param[in] p_work_items is pointer to work-items (CDocumentSplitter::TWorkItem, packed as triplet of integers in order n_offset, n_length, n_term_id)
 *	@param[in] n_first_work_item is pass offset
 *	@param[in] n_work_item_num is pass size (only read if __FORCE_WORK_BOUNDS_CHECK__ is defined)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] p_seeds is pointer to seed vectors (JIT_SEED_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single work-item.
 */
__kernel void BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned(__global TTermScalar *p_vector,
	__global const unsigned int *p_chunk, __global const unsigned int *p_work_items,
	const unsigned int n_first_work_item, const unsigned int n_work_item_num,
	__global const unsigned int *p_occurence, __global const TSeed *p_seeds)
{
	const unsigned int n_thread = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_thread >= n_work_item_num)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single work-item in the list

	__global const unsigned int *p_work_item = p_work_items + 3 * (n_first_work_item + n_thread);
	// work-items are triplets, and the pass starts at n_first_work_item

	p_occurence += p_work_item[0];
	// n_offset: head of the occurence list of this work-item

	p_vector += JIT_VECTOR_LENGTH * p_work_item[2];
	// n_term_id: selects the term vector (output); n_length is implied by the kernel

	const unsigned int n_first_rotation = (JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH - 1) -
		(JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH - 1) % JIT_VECTOR_LENGTH - JIT_WINDOW_SIZE;
	// JIT_WINDOW_SIZE rounded up to a multiple of vector length, minus JIT_WINDOW_SIZE

	for(unsigned int n_occ = 0; n_occ < JIT_MAX_SLICE_LENGTH; ++ n_occ) {
		const unsigned int n_focus = p_occurence[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, ++ n_rotation) {
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TSeed *p_seed = p_seeds + JIT_SEED_LENGTH * p_chunk[n_neighbor];
				// seed vector of the neighbouring term

#ifndef __BTV_STEP_ENABLE_DUFFS_DEVICE__
				#pragma unroll
				for(unsigned int n_elem = 0; n_elem < JIT_SEED_LENGTH; ++ n_elem) {
					const TSeed t_elem = p_seed[n_elem];
					p_vector[(n_Seed_Index(t_elem) + n_rotation) % JIT_VECTOR_LENGTH] += n_Seed_Value(t_elem);
				}
				// plain loop, left for the compiler to unroll
#else //__BTV_STEP_ENABLE_DUFFS_DEVICE__
#define BTV_ADD_NEXT_SEED_ELEM() \
				do { \
					const TSeed t_elem = *p_seed ++; \
					p_vector[(n_Seed_Index(t_elem) + n_rotation) % JIT_VECTOR_LENGTH] += n_Seed_Value(t_elem); \
				} while(0)

				unsigned int n_pass_num = (JIT_SEED_LENGTH + 7) >> 3;
				switch(JIT_SEED_LENGTH & 7) {
					do {
				case 0: BTV_ADD_NEXT_SEED_ELEM();
				case 7: BTV_ADD_NEXT_SEED_ELEM();
				case 6: BTV_ADD_NEXT_SEED_ELEM();
				case 5: BTV_ADD_NEXT_SEED_ELEM();
				case 4: BTV_ADD_NEXT_SEED_ELEM();
				case 3: BTV_ADD_NEXT_SEED_ELEM();
				case 2: BTV_ADD_NEXT_SEED_ELEM();
				case 1: BTV_ADD_NEXT_SEED_ELEM();
					} while(-- n_pass_num);
				}
				// Duff's device

#undef BTV_ADD_NEXT_SEED_ELEM
#endif //__BTV_STEP_ENABLE_DUFFS_DEVICE__
				// the rotated seed is now added to the term vector
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

#endif //__NPOT__

#else //__RETRAIN__

#ifndef __NPOT__

/**
 *	@brief retraining build term vectors kernel (dense source vectors, power-of-two vector length)
 *
 *	Each thread owns one element of the destination term vector. For every occurence and
 *	every chunk position within JIT_WINDOW_SIZE of it (the occurence itself is skipped),
 *	one element of the source vector selected by p_chunk is added to the owned element.
 *	The source element index starts at JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + global id
 *	and decreases by one with every window position; it wraps using bitwise and with
 *	JIT_VECTOR_LENGTH - 1, vector length therefore must be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors (destination)
 *	@param[in] n_dest_vector_first_elem is zero-based index of first destination element
 *	@param[in] p_chunk is pointer to current chunk (its elements select vectors in p_seed_vectors)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] n_first_occurence is zero-based index of the first processed occurence
 *	@param[in] n_occurence_num is number of processed occurences
 *	@param[in] p_seed_vectors is pointer to source vectors (JIT_VECTOR_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single vector element.
 */
__kernel void BuildTermVectors_Retrain_POT_TrueRot_RegOpts_JIT(__global TTermScalar *p_vector,
	const unsigned int n_dest_vector_first_elem, __global const unsigned int *p_chunk,
	__global const unsigned int *p_occurence, const unsigned int n_first_occurence,
	unsigned int n_occurence_num, __global const TTermScalar *p_seed_vectors)
{
	const unsigned int n_element = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_element >= JIT_VECTOR_LENGTH)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term vector element

	__global TTermScalar *p_dest = p_vector + (n_dest_vector_first_elem + n_element);
	// the destination element owned by this thread

	__global const unsigned int *p_slice = p_occurence + n_first_occurence;
	// the first processed occurence

	const unsigned int n_first_rotation = JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + n_element;
	// rotation of the leftmost window position

	for(unsigned int n_occ = 0; n_occ < n_occurence_num; ++ n_occ) {
		const unsigned int n_focus = p_slice[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		#pragma unroll
		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, -- n_rotation) { // !! rotation decreases
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TTermScalar *p_source = p_seed_vectors + JIT_VECTOR_LENGTH * p_chunk[n_neighbor];
				*p_dest += p_source[n_rotation & (JIT_VECTOR_LENGTH - 1)];
				// add the rotated source element to the term vector element
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

/**
 *	@brief retraining build term vectors kernel processing exactly JIT_MAX_SLICE_LENGTH occurences
 *		(dense source vectors, power-of-two vector length)
 *
 *	Each thread owns one element of the destination term vector. For every occurence and
 *	every chunk position within JIT_WINDOW_SIZE of it (the occurence itself is skipped),
 *	one element of the source vector selected by p_chunk is added to the owned element.
 *	The source element index starts at JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + global id
 *	and decreases by one with every window position; it wraps using bitwise and with
 *	JIT_VECTOR_LENGTH - 1, vector length therefore must be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors (destination)
 *	@param[in] n_dest_vector_first_elem is zero-based index of first destination element
 *	@param[in] p_chunk is pointer to current chunk (its elements select vectors in p_seed_vectors)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] n_first_occurence is zero-based index of the first processed occurence
 *	@param[in] p_seed_vectors is pointer to source vectors (JIT_VECTOR_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single vector element.
 */
__kernel void BuildTermVectors_Retrain_POT_TrueRot_RegOpts_JIT_SliceAligned(__global TTermScalar *p_vector,
	const unsigned int n_dest_vector_first_elem, __global const unsigned int *p_chunk,
	__global const unsigned int *p_occurence, const unsigned int n_first_occurence,
	__global const TTermScalar *p_seed_vectors)
{
	const unsigned int n_element = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_element >= JIT_VECTOR_LENGTH)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term vector element

	__global TTermScalar *p_dest = p_vector + (n_dest_vector_first_elem + n_element);
	// the destination element owned by this thread

	__global const unsigned int *p_slice = p_occurence + n_first_occurence;
	// the first processed occurence

	const unsigned int n_first_rotation = JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + n_element;
	// rotation of the leftmost window position

	for(unsigned int n_occ = 0; n_occ < JIT_MAX_SLICE_LENGTH; ++ n_occ) {
		const unsigned int n_focus = p_slice[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		#pragma unroll
		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, -- n_rotation) { // !! rotation decreases
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TTermScalar *p_source = p_seed_vectors + JIT_VECTOR_LENGTH * p_chunk[n_neighbor];
				*p_dest += p_source[n_rotation & (JIT_VECTOR_LENGTH - 1)];
				// add the rotated source element to the term vector element
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

/**
 *	@brief retraining build term vectors kernel processing exactly JIT_HALF_SLICE_LENGTH occurences
 *		(dense source vectors, power-of-two vector length)
 *
 *	Each thread owns one element of the destination term vector. For every occurence and
 *	every chunk position within JIT_WINDOW_SIZE of it (the occurence itself is skipped),
 *	one element of the source vector selected by p_chunk is added to the owned element.
 *	The source element index starts at JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + global id
 *	and decreases by one with every window position; it wraps using bitwise and with
 *	JIT_VECTOR_LENGTH - 1, vector length therefore must be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors (destination)
 *	@param[in] n_dest_vector_first_elem is zero-based index of first destination element
 *	@param[in] p_chunk is pointer to current chunk (its elements select vectors in p_seed_vectors)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] n_first_occurence is zero-based index of the first processed occurence
 *	@param[in] p_seed_vectors is pointer to source vectors (JIT_VECTOR_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single vector element.
 */
__kernel void BuildTermVectors_Retrain_POT_TrueRot_RegOpts_JIT_HalfSliceAligned(__global TTermScalar *p_vector,
	const unsigned int n_dest_vector_first_elem, __global const unsigned int *p_chunk,
	__global const unsigned int *p_occurence, const unsigned int n_first_occurence,
	__global const TTermScalar *p_seed_vectors)
{
	const unsigned int n_element = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_element >= JIT_VECTOR_LENGTH)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term vector element

	__global TTermScalar *p_dest = p_vector + (n_dest_vector_first_elem + n_element);
	// the destination element owned by this thread

	__global const unsigned int *p_slice = p_occurence + n_first_occurence;
	// the first processed occurence

	const unsigned int n_first_rotation = JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + n_element;
	// rotation of the leftmost window position

	for(unsigned int n_occ = 0; n_occ < JIT_HALF_SLICE_LENGTH; ++ n_occ) {
		const unsigned int n_focus = p_slice[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		#pragma unroll
		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, -- n_rotation) { // !! rotation decreases
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TTermScalar *p_source = p_seed_vectors + JIT_VECTOR_LENGTH * p_chunk[n_neighbor];
				*p_dest += p_source[n_rotation & (JIT_VECTOR_LENGTH - 1)];
				// add the rotated source element to the term vector element
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

/**
 *	@brief retraining build term vectors kernel processing exactly JIT_QUARTER_SLICE_LENGTH occurences
 *		(dense source vectors, power-of-two vector length)
 *
 *	Each thread owns one element of the destination term vector. For every occurence and
 *	every chunk position within JIT_WINDOW_SIZE of it (the occurence itself is skipped),
 *	one element of the source vector selected by p_chunk is added to the owned element.
 *	The source element index starts at JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + global id
 *	and decreases by one with every window position; it wraps using bitwise and with
 *	JIT_VECTOR_LENGTH - 1, vector length therefore must be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors (destination)
 *	@param[in] n_dest_vector_first_elem is zero-based index of first destination element
 *	@param[in] p_chunk is pointer to current chunk (its elements select vectors in p_seed_vectors)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] n_first_occurence is zero-based index of the first processed occurence
 *	@param[in] p_seed_vectors is pointer to source vectors (JIT_VECTOR_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single vector element.
 */
__kernel void BuildTermVectors_Retrain_POT_TrueRot_RegOpts_JIT_QuarterSliceAligned(__global TTermScalar *p_vector,
	const unsigned int n_dest_vector_first_elem, __global const unsigned int *p_chunk,
	__global const unsigned int *p_occurence, const unsigned int n_first_occurence,
	__global const TTermScalar *p_seed_vectors)
{
	const unsigned int n_element = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_element >= JIT_VECTOR_LENGTH)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term vector element

	__global TTermScalar *p_dest = p_vector + (n_dest_vector_first_elem + n_element);
	// the destination element owned by this thread

	__global const unsigned int *p_slice = p_occurence + n_first_occurence;
	// the first processed occurence

	const unsigned int n_first_rotation = JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + n_element;
	// rotation of the leftmost window position

	for(unsigned int n_occ = 0; n_occ < JIT_QUARTER_SLICE_LENGTH; ++ n_occ) {
		const unsigned int n_focus = p_slice[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		#pragma unroll
		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, -- n_rotation) { // !! rotation decreases
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TTermScalar *p_source = p_seed_vectors + JIT_VECTOR_LENGTH * p_chunk[n_neighbor];
				*p_dest += p_source[n_rotation & (JIT_VECTOR_LENGTH - 1)];
				// add the rotated source element to the term vector element
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

#else //__NPOT__

/**
 *	@brief retraining build term vectors kernel (dense source vectors, arbitrary vector length)
 *
 *	Each thread owns one element of the destination term vector. For every occurence and
 *	every chunk position within JIT_WINDOW_SIZE of it (the occurence itself is skipped),
 *	one element of the source vector selected by p_chunk is added to the owned element.
 *	The source element index starts at JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + global id
 *	and decreases by one with every window position; it wraps using modulo
 *	JIT_VECTOR_LENGTH, vector length therefore does not need to be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors (destination)
 *	@param[in] n_dest_vector_first_elem is zero-based index of first destination element
 *	@param[in] p_chunk is pointer to current chunk (its elements select vectors in p_seed_vectors)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] n_first_occurence is zero-based index of the first processed occurence
 *	@param[in] n_occurence_num is number of processed occurences
 *	@param[in] p_seed_vectors is pointer to source vectors (JIT_VECTOR_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single vector element.
 *	@note The rotation is decreased 2 * JIT_WINDOW_SIZE times before the last modulo; the
 *		modulo gives the intended index only while the unsigned rotation does not wrap
 *		below zero, i.e. while JIT_WINDOW_SIZE does not exceed JIT_VECTOR_LENGTH + global id.
 */
__kernel void BuildTermVectors_Retrain_NPOT_TrueRot_RegOpts_JIT(__global TTermScalar *p_vector,
	const unsigned int n_dest_vector_first_elem, __global const unsigned int *p_chunk,
	__global const unsigned int *p_occurence, const unsigned int n_first_occurence,
	unsigned int n_occurence_num, __global const TTermScalar *p_seed_vectors)
{
	const unsigned int n_element = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_element >= JIT_VECTOR_LENGTH)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term vector element

	__global TTermScalar *p_dest = p_vector + (n_dest_vector_first_elem + n_element);
	// the destination element owned by this thread

	__global const unsigned int *p_slice = p_occurence + n_first_occurence;
	// the first processed occurence

	const unsigned int n_first_rotation = JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + n_element;
	// rotation of the leftmost window position

	for(unsigned int n_occ = 0; n_occ < n_occurence_num; ++ n_occ) {
		const unsigned int n_focus = p_slice[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		#pragma unroll
		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, -- n_rotation) { // !! rotation decreases
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TTermScalar *p_source = p_seed_vectors + JIT_VECTOR_LENGTH * p_chunk[n_neighbor];
				*p_dest += p_source[n_rotation % JIT_VECTOR_LENGTH];
				// add the rotated source element to the term vector element
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

/**
 *	@brief retraining build term vectors kernel processing exactly JIT_MAX_SLICE_LENGTH occurences
 *		(dense source vectors, arbitrary vector length)
 *
 *	Each thread owns one element of the destination term vector. For every occurence and
 *	every chunk position within JIT_WINDOW_SIZE of it (the occurence itself is skipped),
 *	one element of the source vector selected by p_chunk is added to the owned element.
 *	The source element index starts at JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + global id
 *	and decreases by one with every window position; it wraps using modulo
 *	JIT_VECTOR_LENGTH, vector length therefore does not need to be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors (destination)
 *	@param[in] n_dest_vector_first_elem is zero-based index of first destination element
 *	@param[in] p_chunk is pointer to current chunk (its elements select vectors in p_seed_vectors)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] n_first_occurence is zero-based index of the first processed occurence
 *	@param[in] p_seed_vectors is pointer to source vectors (JIT_VECTOR_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single vector element.
 *	@note The rotation is decreased 2 * JIT_WINDOW_SIZE times before the last modulo; the
 *		modulo gives the intended index only while the unsigned rotation does not wrap
 *		below zero, i.e. while JIT_WINDOW_SIZE does not exceed JIT_VECTOR_LENGTH + global id.
 */
__kernel void BuildTermVectors_Retrain_NPOT_TrueRot_RegOpts_JIT_SliceAligned(__global TTermScalar *p_vector,
	const unsigned int n_dest_vector_first_elem, __global const unsigned int *p_chunk,
	__global const unsigned int *p_occurence, const unsigned int n_first_occurence,
	__global const TTermScalar *p_seed_vectors)
{
	const unsigned int n_element = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_element >= JIT_VECTOR_LENGTH)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term vector element

	__global TTermScalar *p_dest = p_vector + (n_dest_vector_first_elem + n_element);
	// the destination element owned by this thread

	__global const unsigned int *p_slice = p_occurence + n_first_occurence;
	// the first processed occurence

	const unsigned int n_first_rotation = JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + n_element;
	// rotation of the leftmost window position

	for(unsigned int n_occ = 0; n_occ < JIT_MAX_SLICE_LENGTH; ++ n_occ) {
		const unsigned int n_focus = p_slice[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		#pragma unroll
		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, -- n_rotation) { // !! rotation decreases
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TTermScalar *p_source = p_seed_vectors + JIT_VECTOR_LENGTH * p_chunk[n_neighbor];
				*p_dest += p_source[n_rotation % JIT_VECTOR_LENGTH];
				// add the rotated source element to the term vector element
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

/**
 *	@brief retraining build term vectors kernel processing exactly JIT_HALF_SLICE_LENGTH occurences
 *		(dense source vectors, arbitrary vector length)
 *
 *	Each thread owns one element of the destination term vector. For every occurence and
 *	every chunk position within JIT_WINDOW_SIZE of it (the occurence itself is skipped),
 *	one element of the source vector selected by p_chunk is added to the owned element.
 *	The source element index starts at JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + global id
 *	and decreases by one with every window position; it wraps using modulo
 *	JIT_VECTOR_LENGTH, vector length therefore does not need to be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors (destination)
 *	@param[in] n_dest_vector_first_elem is zero-based index of first destination element
 *	@param[in] p_chunk is pointer to current chunk (its elements select vectors in p_seed_vectors)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] n_first_occurence is zero-based index of the first processed occurence
 *	@param[in] p_seed_vectors is pointer to source vectors (JIT_VECTOR_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single vector element.
 *	@note The rotation is decreased 2 * JIT_WINDOW_SIZE times before the last modulo; the
 *		modulo gives the intended index only while the unsigned rotation does not wrap
 *		below zero, i.e. while JIT_WINDOW_SIZE does not exceed JIT_VECTOR_LENGTH + global id.
 */
__kernel void BuildTermVectors_Retrain_NPOT_TrueRot_RegOpts_JIT_HalfSliceAligned(__global TTermScalar *p_vector,
	const unsigned int n_dest_vector_first_elem, __global const unsigned int *p_chunk,
	__global const unsigned int *p_occurence, const unsigned int n_first_occurence,
	__global const TTermScalar *p_seed_vectors)
{
	const unsigned int n_element = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_element >= JIT_VECTOR_LENGTH)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term vector element

	__global TTermScalar *p_dest = p_vector + (n_dest_vector_first_elem + n_element);
	// the destination element owned by this thread

	__global const unsigned int *p_slice = p_occurence + n_first_occurence;
	// the first processed occurence

	const unsigned int n_first_rotation = JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + n_element;
	// rotation of the leftmost window position

	for(unsigned int n_occ = 0; n_occ < JIT_HALF_SLICE_LENGTH; ++ n_occ) {
		const unsigned int n_focus = p_slice[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		#pragma unroll
		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, -- n_rotation) { // !! rotation decreases
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TTermScalar *p_source = p_seed_vectors + JIT_VECTOR_LENGTH * p_chunk[n_neighbor];
				*p_dest += p_source[n_rotation % JIT_VECTOR_LENGTH];
				// add the rotated source element to the term vector element
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

/**
 *	@brief retraining build term vectors kernel processing exactly JIT_QUARTER_SLICE_LENGTH occurences
 *		(dense source vectors, arbitrary vector length)
 *
 *	Each thread owns one element of the destination term vector. For every occurence and
 *	every chunk position within JIT_WINDOW_SIZE of it (the occurence itself is skipped),
 *	one element of the source vector selected by p_chunk is added to the owned element.
 *	The source element index starts at JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + global id
 *	and decreases by one with every window position; it wraps using modulo
 *	JIT_VECTOR_LENGTH, vector length therefore does not need to be power of two.
 *
 *	@param[in,out] p_vector is pointer to term vectors (destination)
 *	@param[in] n_dest_vector_first_elem is zero-based index of first destination element
 *	@param[in] p_chunk is pointer to current chunk (its elements select vectors in p_seed_vectors)
 *	@param[in] p_occurence is pointer to term occurence list
 *	@param[in] n_first_occurence is zero-based index of the first processed occurence
 *	@param[in] p_seed_vectors is pointer to source vectors (JIT_VECTOR_LENGTH elements each)
 *
 *	@remarks This kernel is launched with each thread working on a single vector element.
 *	@note The rotation is decreased 2 * JIT_WINDOW_SIZE times before the last modulo; the
 *		modulo gives the intended index only while the unsigned rotation does not wrap
 *		below zero, i.e. while JIT_WINDOW_SIZE does not exceed JIT_VECTOR_LENGTH + global id.
 */
__kernel void BuildTermVectors_Retrain_NPOT_TrueRot_RegOpts_JIT_QuarterSliceAligned(__global TTermScalar *p_vector,
	const unsigned int n_dest_vector_first_elem, __global const unsigned int *p_chunk,
	__global const unsigned int *p_occurence, const unsigned int n_first_occurence,
	__global const TTermScalar *p_seed_vectors)
{
	const unsigned int n_element = get_global_id(0);
#ifdef __FORCE_WORK_BOUNDS_CHECK__
	if(n_element >= JIT_VECTOR_LENGTH)
		return;
#endif //__FORCE_WORK_BOUNDS_CHECK__
	// each thread handles a single term vector element

	__global TTermScalar *p_dest = p_vector + (n_dest_vector_first_elem + n_element);
	// the destination element owned by this thread

	__global const unsigned int *p_slice = p_occurence + n_first_occurence;
	// the first processed occurence

	const unsigned int n_first_rotation = JIT_WINDOW_SIZE + JIT_VECTOR_LENGTH + n_element;
	// rotation of the leftmost window position

	for(unsigned int n_occ = 0; n_occ < JIT_QUARTER_SLICE_LENGTH; ++ n_occ) {
		const unsigned int n_focus = p_slice[n_occ];
		const unsigned int n_window_last = n_focus + JIT_WINDOW_SIZE;
		unsigned int n_rotation = n_first_rotation;

		#pragma unroll
		for(unsigned int n_neighbor = n_focus - JIT_WINDOW_SIZE; n_neighbor <= n_window_last; ++ n_neighbor, -- n_rotation) { // !! rotation decreases
			if(n_neighbor != n_focus) { // the focused term itself does not contribute
				__global const TTermScalar *p_source = p_seed_vectors + JIT_VECTOR_LENGTH * p_chunk[n_neighbor];
				*p_dest += p_source[n_rotation % JIT_VECTOR_LENGTH];
				// add the rotated source element to the term vector element
			}
		}
		// loop over window (both sides of the focused term)
	}
	// loop over term occurences
}

#endif //__NPOT__

#endif //__RETRAIN__
