/**
 *	@file RandomIndexing_on_GPU/Main.cpp
 *	@author -tHE SWINe-
 *	@brief Random Indexing implemented on GPU
 *	@date 2010-08-06
 */

#include "../../UberLame_src/NewFix.h"
#include "../../UberLame_src/CallStack.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
#include <vector>
#include <string>
#include <map>
#include <algorithm>
#include <numeric>
#include <CL/opencl.h>
#include "../../UberLame_src/Integer.h"
#include "../../UberLame_src/StlUtils.h"
#include "../../UberLame_src/Timer.h"
#include "../../UberLame_src/MinMax.h"
#include "../../UberLame_src/gpgpu/ClUtils.h"
#include "../../UberLame_src/UniConv.h"
#include "../../UberLame_src/Mersene.h"
#include "../../UberLame_src/Dir.h"
#include "DocStorage.h"
#include "DocSplitter.h"

// wraps every "for" statement in the else-branch of a dummy "if", so that variables
// declared in the for-init statement can not leak to the enclosing scope
// (presumably a workaround for the for-scope rules of old MSVC compilers - TODO confirm)
#if defined(_MSC_VER) && !defined(__MWERKS__) && !defined(for)
#define for if(0) {} else for
#endif

// verbosity flag; when set, progress messages are printed to stdout
static bool b_verbose = false;

// build-time switch; NOTE(review): not referenced in this part of the file, see its users for the meaning
#define RETRAIN

/**
 *	@brief converts a wide-character string to an 8-bit encoding
 *
 *	The source characters are handed to CUniConv::n_Decode_UTF16() as a raw buffer
 *	of r_s_unicode.length() * sizeof(wchar_t) bytes.
 *
 *	@param[out] r_s_8bit_dest is destination (8-bit) string
 *	@param[in] r_s_unicode is source (unicode) string
 *	@param[in] r_uniconv is unicode mapping table
 *
 *	@return Returns null-terminated "c" string (result of r_s_8bit_dest.c_str()),
 *		containing the translated input on success, or 0 on failure (the mapping
 *		table reports bad status, or the decoder returns a negative value).
 */
const char *p_s_DeUnicode(std::string &r_s_8bit_dest,
	const std::basic_string<wchar_t> &r_s_unicode, const CUnicodeMapping &r_uniconv)
{
	if(r_uniconv.b_Status()) {
		const size_t n_source_bytes = r_s_unicode.length() * sizeof(wchar_t);
		// size of the source buffer, in bytes

		const bool b_decoded = !(CUniConv::n_Decode_UTF16(r_s_unicode.data(),
			n_source_bytes, r_s_8bit_dest, r_uniconv, false, true) < 0);
		if(b_decoded)
			return r_s_8bit_dest.c_str();
		// the decoder signals failure by a negative return value
	}
	return 0;
}

/**
 *	@brief defines operations on seed vector elements
 *
 *	A seed element is a 16-bit word: bit 15 is the sign, bit 14 is the value
 *	(nonzero / zero) and the low 14 bits are the index to the term vector.
 */
class CSeedOps {
public:
	/**
	 *	@brief seed vector elements type (scalar)
	 *
	 *	Seed vector is a sparse vector, containing +1's, -1's and 0's (dummy vector).
	 *	TSeed contains pair of values: value and index. Value is one of -1, 0 or +1,
	 *	index is (zero-based) offset of this value in the vector.
	 */
	typedef unsigned short TSeed;

	/**
	 *	@brief seed vector constants
	 */
	enum {
		seed_SignBit = 1 << 15, /**< @brief location of sign bit */
		seed_ValueBit = 1 << 14, /**< @brief location of value bit */
		seed_ValueShift = 14, /**< @brief shift of value bit */
		seed_IndexMask = seed_ValueBit - 1 /**< @brief mask of index to term vector */
	};

	/**
	 *	@brief gets seed value from seed element
	 *
	 *	@param[in] s is seed element (generated by n_RandomSeed() or n_DummySeed())
	 *
	 *	@return Returns seed value (-1, 0 or +1).
	 *
	 *	@note This reinterprets the element as a signed 16-bit number and relies
	 *		on the right shift of a negative value being arithmetic (this is
	 *		implementation-defined before C++20).
	 */
	static inline int n_Seed_Value(TSeed s)
	{
		return (short(s) >> seed_ValueShift);
	}

	/**
	 *	@brief gets index to term vector from seed element
	 *
	 *	@param[in] s is seed element (generated by n_RandomSeed() or n_DummySeed())
	 *
	 *	@return Returns index to term vector.
	 */
	static inline int n_Seed_Index(TSeed s)
	{
		return s & seed_IndexMask;
	}

	/**
	 *	@brief generates a new random seed
	 *
	 *	@param[in] n_vector_length is term vector length (the index is stored
	 *		in the low 14 bits, the length must not exceed seed_IndexMask + 1)
	 *	@param[in] r_twister is random number generator
	 *
	 *	@return Returns new random seed element, with value +1 or -1 and with index
	 *		in the [0, n_vector_length) interval.
	 */
	static inline TSeed n_RandomSeed(size_t n_vector_length, CMerseneTwister &r_twister)
	{
		uint32_t n_rand = r_twister.genrand_int32();
		int n_index = int((double(n_rand) / 4294967296.0) * n_vector_length);
		// divide by 2^32 rather than by UINT32_MAX; the ratio then stays strictly below 1
		// and the index can never equal n_vector_length (one past the end of the vector)

		return TSeed(n_index | seed_ValueBit | ((n_rand & 1)? seed_SignBit : 0));
		// use lowest bit as sign; that doesn't matter as long as vector length stays below half of UINT32_MAX (safe assumption)
	}

	/**
	 *	@brief gets value of dummy seed element for dummy vector
	 *	@return Returns value of dummy seed element (zero value at index zero).
	 */
	static inline TSeed n_DummySeed()
	{
		return 0;
	}

	/**
	 *	@brief less-than ordering function for seed elements
	 *
	 *	@param[in] a is first seed element
	 *	@param[in] b is second seed element
	 *
	 *	@return Returns true if n_Seed_Index() of a is smaller than that of b, otherwise returns false.
	 */
	static inline bool b_Seed_Index_Smaller(TSeed a, TSeed b)
	{
		return n_Seed_Index(a) < n_Seed_Index(b);
	}
};

#if 0 // functions below are now in Integer.h

/**
 *	@brief determines whether input is power of two
 *
 *	@param[in] x is input number
 *
 *	@return Returns true if x has at most a single bit set (that is, if x is
 *		power of two; note zero is reported as power of two as well),
 *		otherwise returns false.
 */
inline bool b_Is_POT(int x)
{
	const int n_lowest_bit_cleared = x & (x - 1);
	// subtracting one flips the lowest set bit and all the bits below it
	return n_lowest_bit_cleared == 0;
}

/**
 *	@brief rounds a value up to a multiple of alignment
 *
 *	@param[in] n_value is input value
 *	@param[in] n_align is alignment (must not be zero)
 *
 *	@return Returns the smallest integer multiple of n_align,
 *		which is not less than n_value.
 */
inline size_t n_Align_Up(size_t n_value, size_t n_align)
{
	const size_t n_padded = n_value + (n_align - 1);
	// go past the next multiple, unless n_value already is one
	return n_padded - (n_padded % n_align);
}

#endif //0

/**
 *	@brief term vector element type
 *
 *	Term vectors are stored as dense arrays of these, both on host and in the OpenCL buffers.
 *	NOTE(review): the device buffers are cleared by the ZeroMemory32 kernel, so this
 *	presumably needs to stay 32 bits wide - confirm before changing.
 */
typedef int TTermScalar;

/**
 *	@brief experimental v1 function for building term vectors on GPU
 *
 *	Initializes OpenCL, compiles the program from "CLKernel.c" (vector length, seed length,
 *	window size and occurence slice are passed to the compiler as preprocessor definitions),
 *	uploads the seeds and then feeds the documents to the GPU chunk by chunk, using
 *	CDocumentSplitter. The term vectors are accumulated in device memory and are read back
 *	to a newly allocated array once all the chunks are processed.
 *
 *	All the OpenCL objects (and on failure also the host array) are released before returning.
 *
 *	@param[out] p_vectors_gpu_ptr is pointer to pointer to term vectors, generated on GPU;
 *		it is written only upon successful return; caller is responsible for freeing
 *		this pointer (it is allocated using operator new[])
 *	@param[in] n_vector_length is term vector length (the enabled code path
 *		requires it to be power of two)
 *	@param[in] n_seed_length is seed vector length
 *	@param[in] n_window_size is half-window size
 *	@param[in] b_profiling is GPU profiling flag
 *	@param[in] n_occurence_slice is maximal slice length
 *	@param[in] terms is list of all the terms, occurring in documents
 *	@param[in] n_dummy_term is dummy term id
 *	@param[in] documents is list of documents
 *	@param[in] p_seeds is list of seed vectors for all the terms (including the dummy term),
 *		(terms.size() + 1) * n_seed_length elements are read
 *
 *	@return Returns 0 on success, -1 on failure.
 */
int n_BuildTermVectors_GPU_v1(TTermScalar **p_vectors_gpu_ptr,
	size_t n_vector_length, size_t n_seed_length, size_t n_window_size, bool b_profiling, size_t n_occurence_slice,
	const std::vector<std::basic_string<wchar_t> > &terms, size_t n_dummy_term,
	const std::vector<TDocument> &documents, const CSeedOps::TSeed *p_seeds)
{
	__FuncGuard("::n_BuildTermVectors_GPU_v1");

	CTimer timer;
	timer.ResetTimer();

	/**
	 *	@brief everything this function owns; released in destructor, on every return path
	 */
	struct TOwnedObjects {
		cl_context h_context;
		cl_command_queue h_cmd_queue;
		cl_program h_program;
		cl_kernel h_ZeroMemory;
		cl_kernel h_BuildTermVectors_Seed_NPOT_TrueRot;
		cl_kernel h_BuildTermVectors_Seed_POT_TrueRot;
		cl_kernel h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT;
		cl_kernel h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice;
		cl_kernel h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining;
		cl_mem dp_chunk, dp_offset, dp_occurence_list, dp_vectors_gpu, dp_seeds;
		TTermScalar *p_vectors_gpu;

		TOwnedObjects()
			:h_context(0), h_cmd_queue(0), h_program(0), h_ZeroMemory(0),
			h_BuildTermVectors_Seed_NPOT_TrueRot(0),
			h_BuildTermVectors_Seed_POT_TrueRot(0),
			h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT(0),
			h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice(0),
			h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining(0),
			dp_chunk(0), dp_offset(0), dp_occurence_list(0), dp_vectors_gpu(0), dp_seeds(0),
			p_vectors_gpu(0)
		{}

		~TOwnedObjects()
		{
			if(h_ZeroMemory)
				clReleaseKernel(h_ZeroMemory);
			if(h_BuildTermVectors_Seed_NPOT_TrueRot)
				clReleaseKernel(h_BuildTermVectors_Seed_NPOT_TrueRot);
			if(h_BuildTermVectors_Seed_POT_TrueRot)
				clReleaseKernel(h_BuildTermVectors_Seed_POT_TrueRot);
			if(h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT)
				clReleaseKernel(h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT);
			if(h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice)
				clReleaseKernel(h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice);
			if(h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining)
				clReleaseKernel(h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining);
			// free kernels

			if(dp_chunk)
				clReleaseMemObject(dp_chunk);
			if(dp_offset)
				clReleaseMemObject(dp_offset);
			if(dp_occurence_list)
				clReleaseMemObject(dp_occurence_list);
			if(dp_vectors_gpu)
				clReleaseMemObject(dp_vectors_gpu);
			if(dp_seeds)
				clReleaseMemObject(dp_seeds);
			// free GPU memory

			if(h_program)
				clReleaseProgram(h_program);
			if(h_cmd_queue)
				clReleaseCommandQueue(h_cmd_queue);
			if(h_context)
				clReleaseContext(h_context);
			// shutdown OpenCL

			delete[] p_vectors_gpu;
			// free the host array (null if the ownership was passed to the caller)
		}
	} owned;
	// declared before the splitter, so that it is destroyed after it

	cl_context &h_context = owned.h_context;
	cl_command_queue &h_cmd_queue = owned.h_cmd_queue;
	cl_program &h_program = owned.h_program;
	cl_kernel &h_ZeroMemory = owned.h_ZeroMemory;
	cl_kernel &h_BuildTermVectors_Seed_NPOT_TrueRot = owned.h_BuildTermVectors_Seed_NPOT_TrueRot;
	cl_kernel &h_BuildTermVectors_Seed_POT_TrueRot = owned.h_BuildTermVectors_Seed_POT_TrueRot;
	cl_kernel &h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT =
		owned.h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT;
	cl_kernel &h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice =
		owned.h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice;
	cl_kernel &h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining =
		owned.h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining;
	//cl_kernel h_BuildTermVectors_Seed_NPOT_TrueRot_WorkOffset;
	cl_mem &dp_chunk = owned.dp_chunk, &dp_offset = owned.dp_offset,
		&dp_occurence_list = owned.dp_occurence_list,
		&dp_vectors_gpu = owned.dp_vectors_gpu, &dp_seeds = owned.dp_seeds;
	TTermScalar *&p_vectors_gpu = owned.p_vectors_gpu;
	// the rest of the function uses the short names

	if(CCLUtils::n_OpenCL_Init(&h_context) != CL_SUCCESS) {
		h_context = 0; // do not release whatever a failed init might have left there
		fprintf(stderr, "error: failed to initialize OpenCL\n");
		return -1;
	}
	// init OpenCL

	cl_device_id h_device;
	if(CCLUtils::n_Get_MaxGFlops_DeviceId(&h_device, h_context)) {
		fprintf(stderr, "error: failed to select OpenCL device\n");
		return -1;
	}
	// get best OpenCL device

	{
		cl_command_queue_properties n_queue_props = (b_profiling)? CL_QUEUE_PROFILING_ENABLE : 0;
		// the queue needs to be in-order: commands below are enqueued without any events
		// or barriers, and rely on being executed in the order of submission

		cl_int n_result;
		h_cmd_queue = clCreateCommandQueue(h_context, h_device, n_queue_props, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to create OpenCL command queue\n");
			return -1;
		}
	}
	// create command queue

	std::string s_params;
	if(!stl_ut::Format(s_params, "-D __ENABLE_DUFFS_DEVICE__ -D __FORCE_WORK_BOUNDS_CHECK__"
	   " -D JIT_VECTOR_LENGTH=%d -D JIT_SEED_LENGTH=%d -D JIT_WINDOW_SIZE=%d -D OCCURENCE_SLICE=%dU",
	   int(n_vector_length), int(n_seed_length), int(n_window_size), int(n_occurence_slice))) {
		fprintf(stderr, "error: not enough memory\n");
		return -1;
	}
	// change just-in-time parameters (size_t must not be passed to %d, hence the casts)

	if(CCLProgramCompiler::n_CompileProgramFile(h_context, &h_program,
	   "CLKernel.c", 1, &h_device, s_params.c_str(), "CLKernel.clbin") != CL_SUCCESS) {
		h_program = 0; // do not release whatever a failed compilation might have left there
		fprintf(stderr, "error: failed to load OpenCL program\n");
		return -1;
	}
	// compile OpenCL program

	{
		cl_int n_result;
		h_ZeroMemory = clCreateKernel(h_program, "ZeroMemory32", &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return -1;
		}
		h_BuildTermVectors_Seed_NPOT_TrueRot = clCreateKernel(h_program, "BuildTermVectors_Seed_NPOT_TrueRot", &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return -1;
		}
		h_BuildTermVectors_Seed_POT_TrueRot = clCreateKernel(h_program, "BuildTermVectors_Seed_POT_TrueRot"/*"_RegOpts"*/, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return -1;
		}
		h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT = clCreateKernel(h_program, "BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT", &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return -1;
		}
		h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice = clCreateKernel(h_program, "BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice", &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return -1;
		}
		h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining = clCreateKernel(h_program, "BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining", &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return -1;
		}
		/*h_BuildTermVectors_Seed_NPOT_TrueRot_WorkOffset = clCreateKernel(h_program, "BuildTermVectors_Seed_NPOT_TrueRot_WorkOffset", &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return -1;
		}*/
	}
	// get OpenCL kernels

	const size_t n_chunk_size = 1 << 20; // 64k chunks turn out to work best with splitter, but GPU likes larger chunks

	_ASSERTE(sizeof(CDocumentSplitter::size_t) == sizeof(unsigned int));
	size_t n_offset_buffer_size = min(n_chunk_size, terms.size()), n_occurence_list_size = n_chunk_size + 2 * n_offset_buffer_size;
	{
		if(!(p_vectors_gpu = new(std::nothrow) TTermScalar[terms.size() * n_vector_length])) {
			fprintf(stderr, "error: not enough memory\n");
			return -1;
		}
		// alloc host memory for vectors

		cl_int n_result;
		dp_vectors_gpu = clCreateBuffer(h_context, CL_MEM_READ_WRITE, terms.size() * n_vector_length * sizeof(TTermScalar), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to allocate OpenCL buffers\n");
			return -1;
		}
		dp_seeds = clCreateBuffer(h_context, CL_MEM_READ_ONLY, (terms.size() + 1) * n_seed_length * sizeof(CSeedOps::TSeed), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to allocate OpenCL buffers\n");
			return -1;
		}
		dp_chunk = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_chunk_size * sizeof(unsigned int), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to allocate OpenCL buffers\n");
			return -1;
		}
		dp_offset = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_offset_buffer_size * sizeof(unsigned int), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to allocate OpenCL buffers\n");
			return -1;
		}
		dp_occurence_list = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_occurence_list_size * sizeof(unsigned int), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to allocate OpenCL buffers\n");
			return -1;
		}
	}
	// alloc buffers

	{
		clSetKernelArgs2(h_ZeroMemory, dp_vectors_gpu, int(terms.size() * n_vector_length));

		size_t n_global_work_size = terms.size() * n_vector_length;
		size_t n_local_work_size = 512;
		n_global_work_size += n_local_work_size - 1;
		n_global_work_size -= n_global_work_size % n_local_work_size;
		int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_ZeroMemory, 1,
			NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to clear term vectors on GPU (%d)\n", n_result);
			return -1;
		}
		// the vectors are accumulated to; they must not be left uninitialized
	}
	// "memset" vectors on GPU (instead of clearing the host array and copying it over)

	if(clEnqueueWriteBuffer(h_cmd_queue, dp_seeds, CL_TRUE, 0, (terms.size() + 1) *
	   n_seed_length * sizeof(CSeedOps::TSeed), p_seeds, 0, NULL, NULL) != CL_SUCCESS) {
		fprintf(stderr, "error: failed to copy source data to GPU\n");
		return -1;
	}
	// copy seeds to GPU

	double f_tv_generate_start = timer.f_Time();

	CDocumentSplitter splitter(documents, n_dummy_term, n_chunk_size, n_window_size);
	if(!splitter.Prepare_FirstChunk()) {
		fprintf(stderr, "error: splitter.Prepare_FirstChunk() failed\n");
		return -1;
	}
	// prepare splitter ...

	do {
		if(!splitter.Build_TermOccurenceTable()) {
			fprintf(stderr, "error: splitter.Build_TermOccurenceTable() failed\n");
			return -1;
		}
		const std::vector<CDocumentSplitter::size_t> &chunk = splitter.Get_Chunk();
		const std::vector<CDocumentSplitter::size_t> &offset_list = splitter.Get_TermOccurence_OffsetList();
		const std::vector<CDocumentSplitter::size_t> &occurence_list = splitter.Get_TermOccurenceList();
		// build term occurence table

		_ASSERTE(chunk.size() <= n_chunk_size);
		_ASSERTE(offset_list.size() <= n_offset_buffer_size);
		_ASSERTE(occurence_list.size() <= n_occurence_list_size);
		// make sure generated lists fit onto GPU lists (todo - crazy adaptive re-allocation?)

		{
			int n_result;
			if((n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_chunk, CL_TRUE, 0, chunk.size() * sizeof(unsigned int), &chunk[0], 0, NULL, NULL)) != CL_SUCCESS ||
			   (n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_offset, CL_TRUE, 0, offset_list.size() * sizeof(unsigned int), &offset_list[0], 0, NULL, NULL)) != CL_SUCCESS ||
			   (n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_occurence_list, CL_TRUE, 0, occurence_list.size() * sizeof(unsigned int), &occurence_list[0], 0, NULL, NULL)) != CL_SUCCESS) {
				fprintf(stderr, "error: failed to copy command buffers to GPU (%d, %d, %d / %d)\n",
					int(chunk.size()), int(offset_list.size()), int(occurence_list.size()), n_result);
				return -1;
			}
		}
		// copy lists to GPU; the writes are blocking, as the source vectors belong to the splitter
		// and are presumably rebuilt once the next chunk is prepared (host memory passed to
		// a non-blocking write must not be modified until the write finishes). the queue is
		// in-order, the writes are therefore executed only after the kernels processing
		// the previous chunk have finished (no explicit clFinish() is needed). the splitter
		// still runs on CPU while GPU is busy processing the previous chunk.

#if 0
		const size_t n_slice_max_size = 32 * 27;
		size_t n_offset_num = offset_list.size();

		for(size_t i = 0; i < n_offset_num; i += n_slice_max_size) {
			size_t n_slice_length = min(n_slice_max_size, n_offset_num - i - 1);
			// determine work slice length

			clSetKernelArgs10(h_BuildTermVectors_Seed_NPOT_TrueRot_WorkOffset,
				n_vector_length, dp_vectors_gpu, dp_chunk, dp_offset, int(i), int(n_slice_length),
				dp_occurence_list, n_seed_length, n_window_size, dp_seeds);
			// set OpenCL kernel params

			size_t n_global_work_size = n_slice_length;
			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_BuildTermVectors_Seed_NPOT_TrueRot_WorkOffset, 1,
				NULL, &n_slice_length, NULL, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return -1;
			}
			// launch 1D OpenCL kernel over n_slice_length terms
		}
#elif 0
		const size_t n_local_work_size = 432;//32;//112;
		if(!b_Is_POT(n_vector_length)) {
			// NPOT

			clSetKernelArgs9(h_BuildTermVectors_Seed_NPOT_TrueRot,
				n_vector_length, dp_vectors_gpu, dp_chunk, dp_offset, int(offset_list.size()),
				dp_occurence_list, n_seed_length, n_window_size, dp_seeds);
			// set OpenCL kernel params

			size_t n_global_work_size = offset_list.size();
			n_global_work_size += n_local_work_size - 1;
			n_global_work_size -= n_global_work_size % n_local_work_size;
			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_BuildTermVectors_Seed_NPOT_TrueRot, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels\n");
				return -1;
			}
			// 41.45 sec
		} else {
			// POT

			clSetKernelArgs9(h_BuildTermVectors_Seed_POT_TrueRot,
				n_vector_length, dp_vectors_gpu, dp_chunk, dp_offset, int(offset_list.size()),
				dp_occurence_list, n_seed_length, n_window_size, dp_seeds);
			// set OpenCL kernel params

			size_t n_global_work_size = offset_list.size();
			n_global_work_size += n_local_work_size - 1;
			n_global_work_size -= n_global_work_size % n_local_work_size;
			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_BuildTermVectors_Seed_POT_TrueRot, 1,
				NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels\n");
				return -1;
			}
			// 28.80 sec
		}
		// launch 1D OpenCL kernel over offset_list.size() terms with explicit block size
#elif 0
		//const size_t n_local_work_size = 32; // 128 / 256 / 512
		if(!b_Is_POT(n_vector_length)) {
			fprintf(stderr, "error: there's no BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT\n");
			return -1;
		} else {
			// POT

			clSetKernelArgs6(h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT,
				/*n_vector_length,*/ dp_vectors_gpu, dp_chunk, dp_offset, int(offset_list.size()),
				dp_occurence_list, /*n_seed_length, n_window_size,*/ dp_seeds);
			// set OpenCL kernel params

			size_t n_global_work_size = offset_list.size();
			/*n_global_work_size += n_local_work_size - 1;
			n_global_work_size -= n_global_work_size % n_local_work_size;*/
			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT, 1,
				NULL, &n_global_work_size, NULL, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels\n");
				return -1;
			}
			// 27.26 (auto) / 27.85 (32) / 28.02 sec (128)
		}
		// launch 1D OpenCL kernel over offset_list.size() terms with explicit block size
#elif 1
		const size_t n_local_work_size0 = 256; // 128 / 256 / 512
		const size_t n_local_work_size0_thresh = 500;
		const size_t n_local_work_size1 = 128; // 128 / 256 / 512
		const size_t n_local_work_size2 = 1; // 128 / 256 / 512
		if(!b_Is_POT(n_vector_length)) {
			fprintf(stderr, "error: there's no BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT\n");
			return -1;
		} else {
			// POT

			size_t n_slice_length = n_occurence_slice;
			_ASSERTE(occurence_list.size() > 2);
			size_t n_max_frequency = occurence_list[1];

			for(size_t n_job_offset = 0; n_job_offset < n_max_frequency; n_job_offset += n_slice_length) {
				size_t n_term_num = std::upper_bound(offset_list.begin(), offset_list.end(),
					n_job_offset, CDocumentSplitter::CSliceLengthBelow(n_job_offset, occurence_list)) - offset_list.begin();
				_ASSERTE(n_term_num > 0 && n_term_num <= offset_list.size());
				_ASSERTE(n_job_offset > 0 || n_term_num == offset_list.size()); // all of them should be above zero
				_ASSERTE(n_term_num == offset_list.size() ||
					(occurence_list[offset_list[n_term_num - 1] + 1] >= n_job_offset &&
					occurence_list[offset_list[n_term_num] + 1] < n_job_offset));
				// find how much terms have more occurences than n_job_offset

				if(n_term_num > 64 || n_max_frequency - n_job_offset > 2048) { // there must be enough terms, or there must be at least enough occurences to proceed in slices
					printf("slice of occurences %4d to %4d: %4d terms\n", int(n_job_offset),
						int(min(n_job_offset + n_slice_length, n_max_frequency)), int(n_term_num));
					// process n_slice_length term occurences

					clSetKernelArgs7(h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice,
						/*n_vector_length,*/ dp_vectors_gpu, dp_chunk, dp_offset, int(n_term_num),
						dp_occurence_list, /*n_seed_length, n_window_size,*/ dp_seeds, int(n_job_offset));
					// set OpenCL kernel params

					size_t n_local_work_size = (n_term_num >= n_local_work_size0_thresh)? n_local_work_size0 : n_local_work_size1;

					size_t n_global_work_size = n_term_num;
					n_global_work_size += n_local_work_size - 1;
					n_global_work_size -= n_global_work_size % n_local_work_size;
					int n_result = clEnqueueNDRangeKernel(h_cmd_queue,
						h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccSlice, 1,
						NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels\n");
						return -1;
					}
				} else {
					// this takes last 20 seconds, while a few processors are running. it would be better to
					// split this to multiple parallel tasks and then sum the results, which requires more preprocessing

					printf("slice of occurences %4d to %4d: %4d terms\n", int(n_job_offset),
						int(n_max_frequency), int(n_term_num));
					// process the rest of term occurences

					clSetKernelArgs7(h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining,
						/*n_vector_length,*/ dp_vectors_gpu, dp_chunk, dp_offset, int(n_term_num),
						dp_occurence_list, /*n_seed_length, n_window_size,*/ dp_seeds, int(n_job_offset));
					// set OpenCL kernel params

					size_t n_global_work_size = n_term_num;
					n_global_work_size += n_local_work_size2 - 1;
					n_global_work_size -= n_global_work_size % n_local_work_size2;
					int n_result = clEnqueueNDRangeKernel(h_cmd_queue,
						h_BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_OccRemaining, 1,
						NULL, &n_global_work_size, &n_local_work_size2, 0, NULL, NULL);
					if(n_result != CL_SUCCESS) {
						fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels\n");
						return -1;
					}

					break;
				}
			}
			// 24.75
		}
		// launch 1D OpenCL kernel over offset_list.size() terms with explicit block size
#elif 0
		if(!b_Is_POT(n_vector_length)) {
			// NPOT

			clSetKernelArgs9(h_BuildTermVectors_Seed_NPOT_TrueRot,
				n_vector_length, dp_vectors_gpu, dp_chunk, dp_offset, int(offset_list.size()),
				dp_occurence_list, n_seed_length, n_window_size, dp_seeds);
			// set OpenCL kernel params

			size_t n_global_work_size = offset_list.size();
			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_BuildTermVectors_Seed_NPOT_TrueRot, 1,
				NULL, &n_global_work_size, NULL, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels\n");
				return -1;
			}
			// 41.45 sec
		} else {
			// POT

			clSetKernelArgs9(h_BuildTermVectors_Seed_POT_TrueRot,
				n_vector_length, dp_vectors_gpu, dp_chunk, dp_offset, int(offset_list.size()),
				dp_occurence_list, n_seed_length, n_window_size, dp_seeds);
			// set OpenCL kernel params

			size_t n_global_work_size = offset_list.size();
			int n_result = clEnqueueNDRangeKernel(h_cmd_queue, h_BuildTermVectors_Seed_POT_TrueRot, 1,
				NULL, &n_global_work_size, NULL, 0, NULL, NULL);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to clEnqueueNDRangeKernel() all OpenCL kernels\n");
				return -1;
			}
			// 28.20 sec / 27.80 sec with duff's device / 27.64 with merged left/right window loops
		}
		// launch 1D OpenCL kernel over offset_list.size() terms (tends to be faster with automatic local work size)
#endif
	} while(splitter.Prepare_NextChunk());
	// split documents, let GPU do all the work

	if(clEnqueueReadBuffer(h_cmd_queue, dp_vectors_gpu, CL_TRUE, 0, terms.size() *
	   n_vector_length * sizeof(TTermScalar), p_vectors_gpu, 0, NULL, NULL) != CL_SUCCESS) {
		fprintf(stderr, "error: failed to copy results from GPU\n");
		return -1;
	}
	// copy results back to CPU (the read is blocking and the queue
	// is in-order, there is no need to call clFinish() here)

	double f_tv_generate_time = timer.f_Time() - f_tv_generate_start;
	printf("generating term vectors on GPU took " "|%.5f|"/*PRItimeprecise*/ " secs\n", /*PRItimeparams*/(f_tv_generate_time));

	*p_vectors_gpu_ptr = p_vectors_gpu;
	p_vectors_gpu = 0; // the caller owns the array now, do not delete it
	// write result

	return 0;
	// OpenCL objects are released in the destructor of "owned"
}

/**
 *	@brief equivalent of memset(dp_buffer, 0, n_size_elements * sizeof(uint32_t)) for GPU
 *
 *	Sets arguments of the kernel and enqueues it, the call does not wait for the kernel to finish.
 *
 *	@param[in] dp_buffer is destination buffer on the device
 *	@param[in] n_size_elements is size of memory to be erased, in elements (32-bit integers)
 *	@param[in] h_ZeroMemory32 is ZeroMemory32() kernel
 *	@param[in] h_cmd_queue is command queue
 *
 *	@return Returns OpenCL error codes (CL_SUCCESS on success,
 *		CL_INVALID_VALUE if n_size_elements exceeds INT_MAX).
 */
int n_ZeroMemory32_GPU(cl_mem dp_buffer, size_t n_size_elements, cl_kernel h_ZeroMemory32, cl_command_queue h_cmd_queue)
{
	if(n_size_elements > INT_MAX)
		return CL_INVALID_VALUE;
	// the number of elements is passed to the kernel as int

	clSetKernelArgs2(h_ZeroMemory32, dp_buffer, int(n_size_elements));
	// set kernel params

	size_t n_work_group_size;
	if(n_size_elements > 16384)
		n_work_group_size = 512;
	else if(n_size_elements > 8192)
		n_work_group_size = 256;
	else
		n_work_group_size = 128; // no need to go lower here
	// choose work-group size, based on the amount of work

	size_t n_work_item_num = n_Align_Up(n_size_elements, n_work_group_size);
	// the global size is padded to a whole number of work-groups

	return clEnqueueNDRangeKernel(h_cmd_queue, h_ZeroMemory32, 1,
		NULL, &n_work_item_num, &n_work_group_size, 0, NULL, NULL);
}

/**
 *	@def n_ZeroMemory32_GPU
 *	@brief simple macro, allowing to omit the last two parameters of n_ZeroMemory32_GPU()
 *
 *	The kernel and the command queue are taken from variables named h_ZeroMemory32
 *	and h_cmd_queue, which must be visible at the place where the macro is used.
 *	Note that the macro has the same name as the function; below this line,
 *	the name followed by parentheses always expands the (two-argument) macro.
 *
 *	@param[in] dp_buffer is destination buffer on the device
 *	@param[in] n_size_elements is size of memory to be erased, in elements (32-bit integers)
 *
 *	@return Returns OpenCL error codes (CL_SUCCESS on success).
 */
#define n_ZeroMemory32_GPU(dp_buffer,n_size_elements) n_ZeroMemory32_GPU(dp_buffer, n_size_elements, h_ZeroMemory32, h_cmd_queue)

/**
 *	@brief experimental v2 function for building term vectors on GPU
 *
 *	@param[out] p_vectors_gpu_ptr is pointer to pointer to term vectors, generated on GPU, it is written upon successful return; caller is responsible for freeing this pointer
 *	@param[in] n_vector_length is term vector length
 *	@param[in] n_seed_length is seed vector length
 *	@param[in] n_window_size is half-window size
 *	@param[in] b_profiling is GPU profiling flag
 *	@param[in] terms is list of all the terms, occurring in documents
 *	@param[in] n_dummy_term is dummy term id
 *	@param[in] documents is list of documents
 *	@param[in] p_seeds is list of seed vectors for all the terms (including the dummy term)
 *
 *	@return Returns true on success, false on failure.
 */
bool BuildTermVectors_GPU_v2(TTermScalar **p_vectors_gpu_ptr,
	size_t n_vector_length, size_t n_seed_length, size_t n_window_size, bool b_profiling,
	const std::vector<std::basic_string<wchar_t> > &terms, size_t n_dummy_term,
	const std::vector<TDocument> &documents, const CSeedOps::TSeed *p_seeds)
{
	__FuncGuard("::n_BuildTermVectors_GPU_v2");

	CTimer timer;
	timer.ResetTimer();
	// timer for meassuring timing

	const size_t n_chunk_size = 1 << 20; // 64k chunks turn out to work best with splitter, but GPU likes larger chunks
	const size_t n_max_pass_size = 12500; // given by GPU limits. scheduling too many threads will result in failure.
	const size_t n_dummy_vector_bank_num = n_Align_Up(n_max_pass_size, size_t(1024)); // there's no sense limiting that
#if 0
	const size_t n_max_slice_length = 16;
	const size_t n_min_primary_pass_size = 1000;
	const size_t n_min_last_primary_pass_size = 200;
	// 4.31 sec
#else
	const size_t n_max_slice_length = 32;
	const size_t n_min_primary_pass_size = 2000;
	const size_t n_min_last_primary_pass_size = 200;
	// 4.31 sec
#endif
	// document splitter config

	TTermScalar *p_vectors_gpu = 0;

	cl_context h_context;
	cl_device_id h_device;
	cl_command_queue h_cmd_queue;
	{
		if(b_verbose)
			printf("initializing OpenCL ...\n");

		if(CCLUtils::n_OpenCL_Init(&h_context) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to initialize OpenCL\n");
			return false;
		}
		// init OpenCL

		if(CCLUtils::n_Get_MaxGFlops_DeviceId(&h_device, h_context)) {
			fprintf(stderr, "error: failed to select OpenCL device\n");
			return false;
		}
		// get best OpenCL device

		{
			cl_int n_result;
			h_cmd_queue = clCreateCommandQueue(h_context, h_device,
				CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | ((b_profiling)? CL_QUEUE_PROFILING_ENABLE : 0), &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to create OpenCL command queue\n");
				return false;
			}
		}
		// create command queue
	}
	// initialize OpenCL

	const size_t n_sumstep_local_work_size = (n_vector_length > 16384)? 512 :
											 (n_vector_length > 8192)? 256 : 128; // no need to go lower here
	const size_t n_sumstep_global_work_size = n_Align_Up(n_vector_length, n_sumstep_local_work_size);
	// determine program block params now, we might save some work

	cl_program h_program;
	cl_kernel h_ZeroMemory32, h_SummationStep_JIT,
		h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems,
		h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned;
	{
		if(b_verbose)
			printf("loading program ...\n");

		std::string s_params;
		if(!stl_ut::Format(s_params, "-D __SUMMATION_STEP_ENABLE_DUFFS_DEVICE__"
		   " -D __FORCE_WORK_BOUNDS_CHECK__ -D __SUMMATION_STEP_%s_WORK_BOUNDS_CHECK__"
		   " -D __%sPOT__ -D JIT_VECTOR_LENGTH=%dU -D JIT_SEED_LENGTH=%dU"
		   " -D JIT_WINDOW_SIZE=%dU -D JIT_MAX_SLICE_LENGTH=%dU  -cl-nv-verbose",
		   (n_vector_length % n_sumstep_local_work_size)? "FORCE" : "LEAVE",
		   b_Is_POT(n_vector_length)? "" : "N", n_vector_length, n_seed_length,
		   n_window_size, n_max_slice_length)) {
			fprintf(stderr, "error: not enough memory\n");
			return false;
		}
		// format just-in-time parameters

		if(CCLProgramCompiler::n_CompileProgramFile(h_context, &h_program,
		   "CLKernel_v2.c", 1, &h_device, s_params.c_str(), "CLKernel_v2_seed.clbin") != CL_SUCCESS) {
			fprintf(stderr, "error: failed to load OpenCL program\n");
			return false;
		}
		// compile OpenCL program

		{
			cl_int n_result;
			h_ZeroMemory32 = clCreateKernel(h_program, "ZeroMemory32", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			h_SummationStep_JIT = clCreateKernel(h_program, "SummationStep_JIT", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems =
				clCreateKernel(h_program, (b_Is_POT(n_vector_length))?
				"BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems" :
				"BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT_v2WorkItems", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned =
				clCreateKernel(h_program, (b_Is_POT(n_vector_length))?
				"BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned" :
				"BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
		}
		// get OpenCL kernels
	}
	// compile and link OpenCL program

	CDocumentSplitter splitter(documents, n_dummy_term, n_chunk_size, n_window_size);
	if(!splitter.Prepare_FirstChunk()) {
		fprintf(stderr, "error: splitter.Prepare_FirstChunk() failed\n");
		return false;
	}
	// prepare splitter ...

	const size_t n_work_item_list_max_size = splitter.n_Max_WorkItemList_v2_Size();
	const size_t n_occurence_list_max_size = splitter.n_Max_OccurenceList_v2_Size();
	// get maximal sizes of work buffers

	_ASSERTE(sizeof(CDocumentSplitter::size_t) == sizeof(unsigned int));
	_ASSERTE(sizeof(CDocumentSplitter::TWorkItem) == 3 * sizeof(unsigned int)); // make sure it's unaligned
	// data size checks

	cl_mem dp_chunk, dp_work_item_list, dp_occurence_list,
		dp_vectors, dp_dummy_vector_banks, dp_seeds;
	{
		if(!(p_vectors_gpu = new(std::nothrow) TTermScalar[terms.size() * n_vector_length])) {
			fprintf(stderr, "error: not enough memory\n");
			return false;
		}
		// alloc memory for vectors, and for seeds

		cl_int n_result;
		dp_vectors = clCreateBuffer(h_context, CL_MEM_READ_WRITE, terms.size() * n_vector_length * sizeof(TTermScalar), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return false;
		}
		dp_dummy_vector_banks = clCreateBuffer(h_context, CL_MEM_READ_WRITE, n_dummy_vector_bank_num * n_vector_length * sizeof(TTermScalar), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return false;
		}
		dp_seeds = clCreateBuffer(h_context, CL_MEM_READ_ONLY, (terms.size() + 1) * n_seed_length * sizeof(CSeedOps::TSeed), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return false;
		}
		dp_chunk = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_chunk_size * sizeof(unsigned int), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return false;
		}
		dp_occurence_list = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_occurence_list_max_size * sizeof(unsigned int), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return false;
		}
		dp_work_item_list = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_work_item_list_max_size * 3 * sizeof(unsigned int), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to link all OpenCL kernels\n");
			return false;
		}
	}
	// alloc buffers on GPU

	clFlush(h_cmd_queue);
	clFinish(h_cmd_queue);
	// we start timing here, make sure GPU isn't busy

	if(b_verbose)
		printf("running ...\n");

	double f_tv_generate_start = timer.f_Time();

	if(n_ZeroMemory32_GPU(dp_vectors, terms.size() * n_vector_length) != CL_SUCCESS) {
		fprintf(stderr, "error: ZeroMemory32() failed\n");
		return false;
	}
	// "memset" vectors on GPU

	if(clEnqueueWriteBuffer(h_cmd_queue, dp_seeds, CL_FALSE, 0, (terms.size() + 1) *
	   n_seed_length * sizeof(CSeedOps::TSeed), p_seeds, 0, NULL, NULL) != CL_SUCCESS) {
		fprintf(stderr, "error: failed to copy source data to GPU\n");
		return false;
	}
	// copy seeds to GPU

	do {
		if(!splitter.Build_TermOccurenceTable_v2(n_max_slice_length, n_max_pass_size,
		   n_min_primary_pass_size, n_min_last_primary_pass_size, n_dummy_vector_bank_num)) {
			fprintf(stderr, "error: splitter.Build_TermOccurenceTable() failed\n");
			return false;
		}
		const std::vector<CDocumentSplitter::size_t> &chunk = splitter.Get_Chunk();
		const std::vector<CDocumentSplitter::TWorkItem> &work_item_list = splitter.Get_WorkItemList_v2();
		const std::vector<CDocumentSplitter::size_t> &occurence_list = splitter.Get_TermOccurenceList_v2();
		const std::vector<CDocumentSplitter::TPass> &pass_list = splitter.Get_PassList_v2();
		// build term occurence table

		_ASSERTE(chunk.size() <= n_chunk_size);
		_ASSERTE(work_item_list.size() <= n_work_item_list_max_size);
		_ASSERTE(occurence_list.size() <= n_occurence_list_max_size);
		// make sure generated lists fit onto GPU lists

		{
			int n_result;
			if((n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_chunk, CL_FALSE, 0, chunk.size() * sizeof(unsigned int), &chunk[0], 0, NULL, NULL)) != CL_SUCCESS ||
			   (n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_work_item_list, CL_FALSE, 0, work_item_list.size() * 3 * sizeof(unsigned int), &work_item_list[0], 0, NULL, NULL)) != CL_SUCCESS ||
			   (n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_occurence_list, CL_FALSE, 0, occurence_list.size() * sizeof(unsigned int), &occurence_list[0], 0, NULL, NULL)) != CL_SUCCESS) {
				fprintf(stderr, "error: failed to copy command buffers to GPU (%d, %d, %d / %d)\n", chunk.size(), work_item_list.size(), occurence_list.size(), n_result);
				return false;
			}
		}
		// copy lists to GPU

		for(size_t i = 0, n = pass_list.size(); i < n; ++ i) {
			const CDocumentSplitter::TPass &r_t_pass = pass_list[i];
			// get pass

			printf("%s %s pass %2d : ", (r_t_pass.b_primary)? "pri" : "sec",
				(r_t_pass.b_slice_aligned)? "aligned" : "unalign", i);
			// verbose

			_ASSERTE(r_t_pass.summation_list.empty() == r_t_pass.b_primary);
			if(!r_t_pass.b_primary) {
				const CDocumentSplitter::TPass::TSummationStep &r_t_sum = r_t_pass.summation_list.back();
				size_t n_vector_bank_usage = r_t_sum.n_offset + r_t_sum.n_length;
				_ASSERTE(n_vector_bank_usage < n_dummy_vector_bank_num);
				// determine how much dummy vector banks is going to be used

				printf("clearing %d dummy vector banks\n%20s: ", n_vector_bank_usage, "");
				// verbose

				if(n_ZeroMemory32_GPU(dp_dummy_vector_banks, n_vector_bank_usage * n_vector_length) != CL_SUCCESS) {
					fprintf(stderr, "error: ZeroMemory32() failed\n");
					return false;
				}
				// clear dummy vector banks
			}
			// clear dummy vector bansk for secondary passess

			printf("running btv kernel for %d work-items (%d to %d)\n", r_t_pass.n_length,
				r_t_pass.n_offset, r_t_pass.n_offset + r_t_pass.n_length);
			// verbose

			cl_kernel h_kernel = ((r_t_pass.b_slice_aligned)?
				h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned :
				h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems);
			// choose which kernel to use

			clSetKernelArgs7(h_kernel, ((r_t_pass.b_primary)? dp_vectors : dp_dummy_vector_banks), dp_chunk,
				dp_work_item_list, int(r_t_pass.n_offset), int(r_t_pass.n_length), dp_occurence_list, dp_seeds);
			// set kernel params

			size_t n_local_work_size = (r_t_pass.n_length > 16384)? 512 :
									   (r_t_pass.n_length > 8192)? 256 :
									   (r_t_pass.n_length > 4096)? 128 : 32; // aim to fill ~30 GPU's multiprocessors
			size_t n_global_work_size = n_Align_Up(size_t(r_t_pass.n_length), n_local_work_size);
			if(clEnqueueNDRangeKernel(h_cmd_queue, h_kernel, 1,
			   NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL) != CL_SUCCESS) {
				fprintf(stderr, "error: BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems() failed\n");
				return false;
			}
			// call the kernel

			if(!r_t_pass.b_primary)
				printf("%20s: running %d summation kernels\n", "", r_t_pass.summation_list.size());
			// verbose

			for(size_t j = 0, m = r_t_pass.summation_list.size(); j < m; ++ j) {
				const CDocumentSplitter::TPass::TSummationStep &r_t_sum = r_t_pass.summation_list[j];
				// get summation pass

				printf("\trunning summation kernel for term %5d (dummy vector banks %3d to %3d)\n", r_t_sum.n_term_id,
					r_t_sum.n_offset, r_t_sum.n_offset + r_t_sum.n_length);
				// verbose

				size_t n_dest_vector_first_elem = n_vector_length * r_t_sum.n_term_id;
				size_t n_first_bank_first_elem = n_vector_length * r_t_sum.n_offset;
				clSetKernelArgs5(h_SummationStep_JIT, dp_vectors, int(n_dest_vector_first_elem),
					dp_dummy_vector_banks, int(n_first_bank_first_elem), int(r_t_sum.n_length));
				// set kernel params

				if(clEnqueueNDRangeKernel(h_cmd_queue, h_SummationStep_JIT, 1,
				   NULL, &n_sumstep_global_work_size, &n_sumstep_local_work_size, 0, NULL, NULL) != CL_SUCCESS) {
					fprintf(stderr, "error: SummationStep_JIT() failed\n");
					return false;
				}
				// call the kernel
			}
			// proceed with summation steps
		}
	} while(splitter.Prepare_NextChunk());
	// split documents, let GPU do all the work

	if(clEnqueueReadBuffer(h_cmd_queue, dp_vectors, CL_TRUE, 0, terms.size() *
	   n_vector_length * sizeof(TTermScalar), p_vectors_gpu, 0, NULL, NULL) != CL_SUCCESS) {
		fprintf(stderr, "error: failed to copy results from GPU\n");
		return false;
	}
	// copy results back to CPU (blocking read)

	double f_tv_generate_time = timer.f_Time() - f_tv_generate_start;
	printf("generating term vectors on GPU took " "|%.5f|"/*PRItimeprecise*/ " secs\n", /*PRItimeparams*/(f_tv_generate_time));

	{
		clReleaseKernel(h_ZeroMemory32);
		clReleaseKernel(h_SummationStep_JIT);
		clReleaseKernel(h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems);
		clReleaseKernel(h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned);
		// free kernels

		clReleaseMemObject(dp_vectors);
		clReleaseMemObject(dp_dummy_vector_banks);
		clReleaseMemObject(dp_seeds);
		clReleaseMemObject(dp_chunk);
		clReleaseMemObject(dp_occurence_list);
		clReleaseMemObject(dp_work_item_list);
		// free GPU memory

		clReleaseProgram(h_program);
		clReleaseCommandQueue(h_cmd_queue);
		clReleaseContext(h_context);
		// release other OpenCL objects
	}
	// shutdown OpenCL

	*p_vectors_gpu_ptr = p_vectors_gpu;
	// output result

	return true;
}

/**
 *	@brief function for retraining term vectors on GPU
 *
 *	Initializes OpenCL, compiles "CLKernel_v2.c" in retrain mode, then for every chunk
 *	of documents (produced by CDocumentSplitter) runs one kernel per slice of each term's
 *	occurence list and finally reads the accumulated term vectors back to a freshly
 *	allocated host array.
 *
 *	@param[out] p_vectors_gpu_ptr is pointer to pointer to term vectors, generated on GPU,
 *		it is written upon successful return only; caller is responsible for freeing
 *		this pointer (it is allocated using new[])
 *	@param[in] n_vector_length is term vector length
 *	@param[in] n_window_size is half-window size
 *	@param[in] b_profiling is GPU profiling flag
 *	@param[in] terms is list of all the terms, occurring in documents
 *	@param[in] n_dummy_term is dummy term id
 *	@param[in] documents is list of documents
 *	@param[in] p_seed_vectors is list of seed vectors for all the terms (including the dummy term),
 *		each of them n_vector_length elements long
 *
 *	@return Returns true on success, false on failure.
 *
 *	@note All OpenCL objects created here are released before returning, on both
 *		the success and the failure path. On failure the host array is freed as well.
 */
bool RetrainTermVectors_GPU(TTermScalar **p_vectors_gpu_ptr,
	size_t n_vector_length, size_t n_window_size, bool b_profiling,
	const std::vector<std::basic_string<wchar_t> > &terms, size_t n_dummy_term,
	const std::vector<TDocument> &documents, const TTermScalar *p_seed_vectors)
{
	__FuncGuard("::RetrainTermVectors_GPU");

	/**
	 *	@brief scope guard, releasing everything this function acquired
	 *
	 *	Objects are registered only after their creation succeeded, so the destructor
	 *	never touches a handle that was not validly created. Release order is kernels,
	 *	memory objects, program, command queue, context, host array.
	 */
	struct TResourceGuard {
		cl_context h_context;
		cl_command_queue h_cmd_queue;
		cl_program h_program;
		cl_kernel p_kernel[5];
		size_t n_kernel_num;
		cl_mem p_buffer[4];
		size_t n_buffer_num;
		TTermScalar *p_vectors; // host array; set to 0 once ownership passes to the caller

		TResourceGuard()
			:h_context(0), h_cmd_queue(0), h_program(0),
			n_kernel_num(0), n_buffer_num(0), p_vectors(0)
		{}

		void Own(cl_kernel h_kernel)
		{
			p_kernel[n_kernel_num ++] = h_kernel;
		}

		void Own(cl_mem h_buffer)
		{
			p_buffer[n_buffer_num ++] = h_buffer;
		}

		~TResourceGuard()
		{
			for(size_t i = 0; i < n_kernel_num; ++ i)
				clReleaseKernel(p_kernel[i]);
			// free kernels

			for(size_t i = 0; i < n_buffer_num; ++ i)
				clReleaseMemObject(p_buffer[i]);
			// free GPU memory

			if(h_program)
				clReleaseProgram(h_program);
			if(h_cmd_queue)
				clReleaseCommandQueue(h_cmd_queue);
			if(h_context)
				clReleaseContext(h_context);
			// release other OpenCL objects

			if(p_vectors)
				delete[] p_vectors;
			// free host array, unless it was handed over to the caller
		}
	} cl_resources;

	CTimer timer;
	timer.ResetTimer();
	// timer for measuring timing

	const size_t n_chunk_size = 1 << 20; // 64k chunks turn out to work best with splitter, but GPU likes larger chunks
	const size_t n_max_slice_length = 1024;
	// document splitter config

	TTermScalar *p_vectors_gpu = 0;

	cl_context h_context;
	cl_device_id h_device;
	cl_command_queue h_cmd_queue;
	{
		if(b_verbose)
			printf("initializing OpenCL ...\n");

		if(CCLUtils::n_OpenCL_Init(&h_context) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to initialize OpenCL\n");
			return false;
		}
		cl_resources.h_context = h_context;
		// init OpenCL

		if(CCLUtils::n_Get_MaxGFlops_DeviceId(&h_device, h_context)) {
			fprintf(stderr, "error: failed to select OpenCL device\n");
			return false;
		}
		// get best OpenCL device

		{
			// NOTE(review): the queue is created with out-of-order execution enabled, yet all the
			// commands below are enqueued without events or barriers; the OpenCL specification
			// does not guarantee ordering of dependent commands in such a queue - confirm that
			// the targeted implementation executes them in order
			cl_int n_result;
			h_cmd_queue = clCreateCommandQueue(h_context, h_device,
				CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | ((b_profiling)? CL_QUEUE_PROFILING_ENABLE : 0), &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to create OpenCL command queue\n");
				return false;
			}
			cl_resources.h_cmd_queue = h_cmd_queue;
		}
		// create command queue
	}
	// initialize OpenCL

	const size_t n_local_work_size = (n_vector_length > 16384)? 512 :
									 (n_vector_length > 8192)? 256 : 128; // no need to go lower here
	const size_t n_global_work_size = n_Align_Up(n_vector_length, n_local_work_size);
	// determine program block params now, we might save some work

	cl_program h_program;
	cl_kernel h_ZeroMemory32,
		h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT,
		h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_SliceAligned,
		h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_HalfSliceAligned,
		h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_QrtrSliceAligned;
	{
		if(b_verbose)
			printf("loading program ...\n");

		std::string s_params;
		if(!stl_ut::Format(s_params, "-D __%s_WORK_BOUNDS_CHECK__ -D __RETRAIN__ -D __%sPOT__"
		   " -D JIT_VECTOR_LENGTH=%dU -D JIT_SEED_LENGTH=%dU -D JIT_WINDOW_SIZE=%dU -D JIT_MAX_SLICE_LENGTH=%dU"
		   " -D JIT_HALF_SLICE_LENGTH=%dU -D JIT_QUARTER_SLICE_LENGTH=%dU -cl-nv-verbose",
		   (n_vector_length % n_local_work_size)? "FORCE" : "LEAVE", b_Is_POT(n_vector_length)? "" : "N",
		   int(n_vector_length), 0, int(n_window_size), int(n_max_slice_length),
		   int(n_max_slice_length / 2), int(n_max_slice_length / 4))) { // %d expects int, size_t may be wider
			fprintf(stderr, "error: not enough memory1\n");
			return false;
		}
		// format just-in-time parameters (seed length is passed as 0 here)

		if(CCLProgramCompiler::n_CompileProgramFile(h_context, &h_program,
		   "CLKernel_v2.c", 1, &h_device, s_params.c_str(), "CLKernel_v2_retrain.clbin") != CL_SUCCESS) {
			fprintf(stderr, "error: failed to load OpenCL program\n");
			return false;
		}
		cl_resources.h_program = h_program;
		// compile OpenCL program

		{
			cl_int n_result;
			h_ZeroMemory32 = clCreateKernel(h_program, "ZeroMemory32", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			cl_resources.Own(h_ZeroMemory32);
			h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT =
				clCreateKernel(h_program, (b_Is_POT(n_vector_length))?
				"BuildTermVectors_Retrain_POT_TrueRot_RegOpts_JIT" :
				"BuildTermVectors_Retrain_NPOT_TrueRot_RegOpts_JIT", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			cl_resources.Own(h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT);
			h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_SliceAligned =
				clCreateKernel(h_program, (b_Is_POT(n_vector_length))?
				"BuildTermVectors_Retrain_POT_TrueRot_RegOpts_JIT_SliceAligned" :
				"BuildTermVectors_Retrain_NPOT_TrueRot_RegOpts_JIT_SliceAligned", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			cl_resources.Own(h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_SliceAligned);
			h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_HalfSliceAligned =
				clCreateKernel(h_program, (b_Is_POT(n_vector_length))?
				"BuildTermVectors_Retrain_POT_TrueRot_RegOpts_JIT_HalfSliceAligned" :
				"BuildTermVectors_Retrain_NPOT_TrueRot_RegOpts_JIT_HalfSliceAligned", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			cl_resources.Own(h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_HalfSliceAligned);
			h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_QrtrSliceAligned =
				clCreateKernel(h_program, (b_Is_POT(n_vector_length))?
				"BuildTermVectors_Retrain_POT_TrueRot_RegOpts_JIT_QuarterSliceAligned" :
				"BuildTermVectors_Retrain_NPOT_TrueRot_RegOpts_JIT_QuarterSliceAligned", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			cl_resources.Own(h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_QrtrSliceAligned);
		}
		// get OpenCL kernels (the POT / NPOT variant is chosen by the vector length)
	}
	// compile and link OpenCL program

	CDocumentSplitter splitter(documents, n_dummy_term, n_chunk_size, n_window_size);
	if(!splitter.Prepare_FirstChunk()) {
		fprintf(stderr, "error: splitter.Prepare_FirstChunk() failed\n");
		return false;
	}
	// prepare splitter ...

	const size_t n_occurence_list_max_size = splitter.n_Max_OccurenceList_Size();
	// get maximal sizes of work buffers

	_ASSERTE(sizeof(CDocumentSplitter::size_t) == sizeof(unsigned int));
	// data size checks (the lists are copied to GPU as raw arrays of unsigned int)

	cl_mem dp_chunk, dp_occurence_list, dp_vectors, dp_seed_vectors;
	{
		if(!(p_vectors_gpu = new(std::nothrow) TTermScalar[terms.size() * n_vector_length])) {
			fprintf(stderr, "error: not enough memory (" PRIsizeB "B)\n", PRIsizeBparams(terms.size() * n_vector_length * sizeof(TTermScalar)));
			return false;
		}
		cl_resources.p_vectors = p_vectors_gpu;
		// alloc host memory for the resulting vectors

		cl_int n_result;
		dp_vectors = clCreateBuffer(h_context, CL_MEM_READ_WRITE, terms.size() * n_vector_length * sizeof(TTermScalar), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to allocate OpenCL buffers\n");
			return false;
		}
		cl_resources.Own(dp_vectors);
		dp_seed_vectors = clCreateBuffer(h_context, CL_MEM_READ_ONLY, (terms.size() + 1) * n_vector_length * sizeof(TTermScalar), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to allocate OpenCL buffers\n");
			return false;
		}
		cl_resources.Own(dp_seed_vectors);
		dp_chunk = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_chunk_size * sizeof(unsigned int), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to allocate OpenCL buffers\n");
			return false;
		}
		cl_resources.Own(dp_chunk);
		dp_occurence_list = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_occurence_list_max_size * sizeof(unsigned int), NULL, &n_result);
		if(n_result != CL_SUCCESS) {
			fprintf(stderr, "error: failed to allocate OpenCL buffers\n");
			return false;
		}
		cl_resources.Own(dp_occurence_list);
	}
	// alloc buffers on GPU

	clFlush(h_cmd_queue);
	clFinish(h_cmd_queue);
	// we start timing here, make sure GPU isn't busy

	if(b_verbose)
		printf("running ...\n");

	double f_tv_generate_start = timer.f_Time();

	if(n_ZeroMemory32_GPU(dp_vectors, terms.size() * n_vector_length) != CL_SUCCESS) {
		fprintf(stderr, "error: ZeroMemory32() failed\n");
		return false;
	}
	// "memset" vectors on GPU

	if(clEnqueueWriteBuffer(h_cmd_queue, dp_seed_vectors, CL_FALSE, 0, (terms.size() + 1) *
	   n_vector_length * sizeof(TTermScalar), p_seed_vectors, 0, NULL, NULL) != CL_SUCCESS) {
		fprintf(stderr, "error: failed to copy source data to GPU\n");
		return false;
	}
	// copy seeds to GPU (non-blocking; p_seed_vectors is owned by the caller)

	do {
		if(!splitter.Build_TermOccurenceTable()) {
			fprintf(stderr, "error: splitter.Build_TermOccurenceTable() failed\n");
			return false;
		}
		const std::vector<CDocumentSplitter::size_t> &chunk = splitter.Get_Chunk();
		const std::vector<CDocumentSplitter::size_t> &offset_list = splitter.Get_TermOccurence_OffsetList();
		const std::vector<CDocumentSplitter::size_t> &occurence_list = splitter.Get_TermOccurenceList();
		// build term occurence table

		_ASSERTE(chunk.size() <= n_chunk_size);
		_ASSERTE(occurence_list.size() <= n_occurence_list_max_size);
		// make sure generated lists fit onto GPU lists

		{
			// NOTE(review): these writes are non-blocking, the host memory must stay valid and
			// unmodified until they complete; presumably the splitter rewrites these vectors
			// in Prepare_NextChunk() - verify that the transfers are finished by then
			int n_result;
			if((n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_chunk, CL_FALSE, 0, chunk.size() * sizeof(unsigned int), &chunk[0], 0, NULL, NULL)) != CL_SUCCESS ||
			   (n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_occurence_list, CL_FALSE, 0, occurence_list.size() * sizeof(unsigned int), &occurence_list[0], 0, NULL, NULL)) != CL_SUCCESS) {
				fprintf(stderr, "error: failed to copy command buffers to GPU (%d, %d, %d / %d)\n", int(chunk.size()),
					int(offset_list.size()), int(occurence_list.size()), n_result);
				return false;
			}
		}
		// copy lists to GPU (the offset list is only used on CPU, below)

		const CDocumentSplitter::size_t *p_offset = &offset_list[0];
		const size_t n_offset_num = offset_list.size();
		const CDocumentSplitter::size_t *p_occurence_list = &occurence_list[0];

		for(size_t i = 0; i < n_offset_num; ++ i, ++ p_offset) {
			const CDocumentSplitter::size_t *p_occurence = p_occurence_list + *p_offset;
			// get head of occurence list

			size_t n_term_id = *p_occurence ++;
			size_t n_occurence_num = *p_occurence ++;
			// get term id and number of occurences (a two-element header
			// preceding the occurences themselves)

			size_t n_remaining = n_occurence_num;
			size_t n_offset = *p_offset + 2; // don't forget about offset to global occurence list!
			while(n_remaining) {
				cl_kernel h_kernel;
				size_t n_length;
				if(n_remaining < n_max_slice_length / 4) {
					n_length = n_remaining;
					h_kernel = h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT;
				} else if(n_remaining < n_max_slice_length / 2) {
					n_length = n_max_slice_length / 4;
					h_kernel = h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_QrtrSliceAligned;
				} else if(n_remaining < n_max_slice_length) {
					n_length = n_max_slice_length / 2;
					h_kernel = h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_HalfSliceAligned;
				} else /*if(n_remaining >= n_max_slice_length)*/ {
					_ASSERTE(n_remaining >= n_max_slice_length);
					n_length = n_max_slice_length;
					h_kernel = h_BuildTermVectors_Retrain_TrueRot_RegOpts_JIT_SliceAligned;
				}
				bool b_slice_aligned = n_remaining >= n_max_slice_length / 4;
				// choose which kernel to use, choose slice length (the largest of full, half
				// and quarter slice that fits; the generic kernel handles the remainder)

				if(b_slice_aligned) {
					clSetKernelArgs6(h_kernel, dp_vectors, int(n_term_id * n_vector_length), dp_chunk,
						dp_occurence_list, int(n_offset), /*int(n_length),*/ dp_seed_vectors);
				} else {
					clSetKernelArgs7(h_kernel, dp_vectors, int(n_term_id * n_vector_length), dp_chunk,
						dp_occurence_list, int(n_offset), int(n_length), dp_seed_vectors);
				}
				// set kernel params (slice-aligned kernels take no length argument)

				if(clEnqueueNDRangeKernel(h_cmd_queue, h_kernel, 1,
				   NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL) != CL_SUCCESS) {
					fprintf(stderr, "error: BuildTermVectors_Retrain() failed\n");
					return false;
				}
				// call the kernel

				n_offset += n_length;
				n_remaining -= n_length;
			}
		}
	} while(splitter.Prepare_NextChunk());
	// split documents, let GPU do all the work

	if(clEnqueueReadBuffer(h_cmd_queue, dp_vectors, CL_TRUE, 0, terms.size() *
	   n_vector_length * sizeof(TTermScalar), p_vectors_gpu, 0, NULL, NULL) != CL_SUCCESS) {
		fprintf(stderr, "error: failed to copy results from GPU\n");
		return false;
	}
	// copy results back to CPU (blocking read)

	double f_tv_generate_time = timer.f_Time() - f_tv_generate_start;
	printf("retraining term vectors on GPU took " "|%.5f|"/*PRItimeprecise*/ " secs\n", /*PRItimeparams*/(f_tv_generate_time));

	*p_vectors_gpu_ptr = p_vectors_gpu;
	cl_resources.p_vectors = 0;
	// output result (the caller owns the array now, the guard must not free it)

	return true;
	// OpenCL is shut down by the destructor of cl_resources
}

int main(int n_arg_num, const char **p_arg_list)
{
	__FuncGuard("::main");

	/*CSplitterTester::DoTests();
	CSplitterTester::DoSpeedTests();

	return 0;*/

	CUnicodeMapping uniconv("windows-852.txt");
	// load default windows xp charset for stdout

	CMerseneTwister twister;
	twister.init_genrand(123456);
	// initialize MT

	CTimer timer;
	timer.ResetTimer();
	// timer for meassuring times of operations

	const char *p_s_infile = "n:\\downloads\\gw-cna_eng-term-vectors-lemm-using-hash";
	size_t n_vector_length = 1024;
	size_t n_seed_length = 100;
	size_t n_window_size = 10;
	/*size_t n_vector_length = 200;
	size_t n_seed_length = 10;
	size_t n_window_size = 10;*/
	bool b_profiling = false;
	//size_t n_occurence_slice = 256;

	if(!b_profiling) {
		for(int i = 1; i < n_arg_num; ++ i) {
			if(!strcmp(p_arg_list[i], "-h") || !strcmp(p_arg_list[i], "--help")) {
				printf("use RandomIndexing [-v|--vector-length <vector-length>] [-s|--seed <seed-length>]\n"
					"\t[-w|--window-size <window-size>] [-h|--help]\n");
				return 0;
			} else if(!strcmp(p_arg_list[i], "--profile")) {
				b_profiling = true;
			} else if(i + 1 >= n_arg_num) {
				fprintf(stderr, "error: argument \'%s\' needs value\n", p_arg_list[i]);
				return -1;
			} else if(!strcmp(p_arg_list[i], "-v") || !strcmp(p_arg_list[i], "--vector-length")) {
				n_vector_length = atol(p_arg_list[++ i]);
				if(n_vector_length <= 0 || n_vector_length < n_seed_length) {
					n_vector_length = max(n_seed_length, size_t(200));
					fprintf(stderr, "error: invalid vector length: defaulting to %d\n", n_vector_length);
				}
			} else if(!strcmp(p_arg_list[i], "-s") || !strcmp(p_arg_list[i], "--seed")) {
				n_seed_length = atol(p_arg_list[++ i]);
				if(n_seed_length <= 0 || n_seed_length > n_vector_length) {
					n_seed_length = min(n_vector_length, size_t(10));
					fprintf(stderr, "error: invalid seed length: defaulting to %d\n", n_seed_length);
				}
			} else if(!strcmp(p_arg_list[i], "-w") || !strcmp(p_arg_list[i], "--window-size")) {
				n_window_size = atol(p_arg_list[++ i]);
				if(n_window_size <= 0) {
					n_window_size = 10;
					fprintf(stderr, "error: invalid window length: defaulting to %d\n", n_window_size);
				}
			} else {
				fprintf(stderr, "error: unknown argument : \'%s\'\n", p_arg_list[i]);
				return -1;
			}
		}
		// parse commandline
	}
	// ignore commandline if profiling ... nvidia visual profiler is clumsy with passing args

	b_verbose = true;
	// ...

	if(b_verbose)
		printf("loading \'%s\' ...\n", p_s_infile);
	// verbose

	std::vector<std::basic_string<wchar_t> > terms;
	std::vector<TDocument> documents;
	if(!TDocument::Read(terms, documents, p_s_infile)) {
		fprintf(stderr, "error: failed to read \'%s\'\n", p_s_infile);
		return -1;
	}
	// read documents

	if(b_verbose)
		printf("done. have %d terms in %d documents\n", terms.size(), documents.size());
	// verbose

#if 0
	{
		std::string s_tmp;
		for(size_t i = 0, n = terms.size(); i < n; ++ i)
			printf((i)? ", %s" : "%s", p_s_DeUnicode(s_tmp, terms[i], uniconv));
		printf("\n");
	}
	// try to print terms

	{
		std::string s_tmp;
		for(size_t i = 0, n = documents.size(); i < n; ++ i) {
			const TDocument &r_doc = documents[i];
			for(size_t j = 0, m = r_doc.term_position_list.size(); j < m; ++ j)
				printf((j)? " %s" : "%s", p_s_DeUnicode(s_tmp, terms[r_doc.term_position_list[j]], uniconv));
			printf("\n");
		}
	}
	// print document contents
#endif //0

	{
		size_t n_total_length = 0;
		for(size_t i = 0, n = documents.size(); i < n; ++ i) {
			const TDocument &r_doc = documents[i];
			n_total_length += r_doc.term_position_list.size();
		}
		printf("there is " PRIsizeB " terms in total\n", PRIsizeBparams(n_total_length));
	}
	// debug - show how much data we have

	_ASSERTE(n_vector_length <= CSeedOps::seed_IndexMask);

	TTermScalar *p_vectors_retrain;
	TTermScalar *p_vectors;
	//TTermScalar *p_vectors_cpu;
	CSeedOps::TSeed *p_seeds;
	_ASSERTE(terms.size() < SIZE_MAX);
	size_t n_dummy_term = terms.size();
	{
		__FuncGuard("GenerateSeeds");

		if(!(p_vectors = new(std::nothrow) TTermScalar[(terms.size() + 1) * n_vector_length]) ||
		   !(p_vectors_retrain = new(std::nothrow) TTermScalar[(terms.size() + 1) * n_vector_length]) ||
		   //!(p_vectors_cpu = new(std::nothrow) TTermScalar[(terms.size() + 1) * n_vector_length]) ||
		   !(p_seeds = new(std::nothrow) CSeedOps::TSeed[(terms.size() + 1) * n_seed_length])) {
			fprintf(stderr, "error: not enough memory\n");
			return -1;
		}
		// alloc memory for vectors, and for seeds

		double f_generate_start = timer.f_Time();

		for(size_t i = 0, n = terms.size(); i < n; ++ i) {
			CSeedOps::TSeed *p_seed = p_seeds + i * n_seed_length;
			for(size_t j = 0; j < n_seed_length; ++ j)
				p_seed[j] = CSeedOps::n_RandomSeed(n_vector_length, twister);
			// generate some random seeds

			std::sort(p_seed, p_seed + n_seed_length, CSeedOps::b_Seed_Index_Smaller);
			// make sure they're sorted to ensure optimal access to memory (we're going to use them many times)
		}
		// generate random seeds

		{
			CSeedOps::TSeed *p_dummy_seed = p_seeds + n_dummy_term * n_seed_length;
			for(size_t i = 0; i < n_seed_length; ++ i)
				p_dummy_seed[i] = CSeedOps::n_DummySeed();
		}
		// generate dummy seed

		memset(p_vectors, 0, (terms.size() + 1) * n_vector_length * sizeof(TTermScalar));
		//memset(p_vectors_cpu, 0, (terms.size() + 1) * n_vector_length * sizeof(TTermScalar));
		memset(p_vectors_retrain, 0, (terms.size() + 1) * n_vector_length * sizeof(TTermScalar));
		// clear vectors

		double f_generate_time = timer.f_Time() - f_generate_start;
		if(b_verbose)
			printf("generating seeds took " "|%.5f|"/*PRItimeprecise*/ " secs\n", /*PRItimeparams*/(f_generate_time));
	}
	// alloc vectors and generate seeds

	TTermScalar *p_seed_vectors;
	{
		if(!(p_seed_vectors = new(std::nothrow) TTermScalar[(terms.size() + 1) * n_vector_length])) {
			fprintf(stderr, "error: not enough memory\n");
			return -1;
		}
		int n_sum = 0;
		memset(p_seed_vectors, 0, (terms.size() + 1) * n_vector_length * sizeof(TTermScalar));
		for(size_t i = 0, n = terms.size(); i < n; ++ i) {
			TTermScalar *p_seed_vector = p_seed_vectors + i * n_vector_length;
			CSeedOps::TSeed *p_sparse_seed = p_seeds + i * n_seed_length;

			for(size_t j = 0; j < n_seed_length; ++ j) {
				p_seed_vector[CSeedOps::n_Seed_Index(p_sparse_seed[j])] += CSeedOps::n_Seed_Value(p_sparse_seed[j]);
				n_sum += CSeedOps::n_Seed_Value(p_sparse_seed[j]); // just being curious
			}
		}
	}
	// prepare seed vectors for retraining (they actually have the same contents
	// as normal seed vectors, so the results would be comparable)

	/*{
		printf("simulating v1 pipeline:\n");

		const size_t n_chunk_size = 1 << 20;
		CDocumentSplitter splitter(documents, n_dummy_term, n_chunk_size, n_window_size);
		if(!splitter.Prepare_FirstChunk()) {
			fprintf(stderr, "error: splitter.Prepare_FirstChunk() failed\n");
			return -1;
		}
		// prepare splitter ...

		do {
			if(!splitter.Build_TermOccurenceTable()) {
				fprintf(stderr, "error: splitter.Build_TermOccurenceTable() failed\n");
				return -1;
			}
			const std::vector<size_t> &chunk = splitter.Get_Chunk();
			const std::vector<size_t> &offset_list = splitter.Get_TermOccurence_OffsetList();
			const std::vector<size_t> &occurence_list = splitter.Get_TermOccurenceList();
			// build term occurence table

			size_t n_slice_length = n_occurence_slice;
			_ASSERTE(occurence_list.size() > 2);
			size_t n_max_frequency = occurence_list[1];

			for(size_t n_job_offset = 0; n_job_offset < n_max_frequency; n_job_offset += n_slice_length) {
				size_t n_term_num = std::upper_bound(offset_list.begin(), offset_list.end(),
					n_job_offset, CDocumentSplitter::CSliceLengthBelow(n_job_offset, occurence_list)) - offset_list.begin();
				_ASSERTE(n_term_num > 0 && n_term_num <= offset_list.size());
				_ASSERTE(n_job_offset > 0 || n_term_num == offset_list.size()); // all of them should be above zero
				_ASSERTE(n_term_num == offset_list.size() ||
					occurence_list[offset_list[n_term_num - 1] + 1] >= n_job_offset &&
					occurence_list[offset_list[n_term_num] + 1] < n_job_offset);
				// find how many terms have more occurences than n_job_offset

				printf("slice of occurences %4d to %4d: %4d terms\n", n_job_offset,
					min(n_job_offset + n_slice_length, n_max_frequency), n_term_num);
			}
		} while(splitter.Prepare_NextChunk());
	}
	// test splitting tasks by occurences to maintain higher GPU load

	{
		printf("simulating v2 pipeline:\n");

		const size_t n_chunk_size = 1 << 20;
		CDocumentSplitter splitter(documents, n_dummy_term, n_chunk_size, n_window_size);
		if(!splitter.Prepare_FirstChunk()) {
			fprintf(stderr, "error: splitter.Prepare_FirstChunk() failed\n");
			return -1;
		}
		// prepare splitter ...

		do {
			const size_t n_max_slice_length = 512;
			const size_t n_max_pass_size = 10000;
			const size_t n_min_primary_pass_size = 1000;
			const size_t n_dummy_vector_bank_num = 4096;
			/*const size_t n_max_slice_length = 256;
			const size_t n_max_pass_size = 10000;
			const size_t n_min_primary_pass_size = 64;
			const size_t n_dummy_vector_bank_num = 8192;* /
			// term occurence table v2 config

			if(!splitter.Build_TermOccurenceTable_v2(n_max_slice_length,
			   n_max_pass_size, n_min_primary_pass_size, n_dummy_vector_bank_num)) {
				fprintf(stderr, "error: splitter.Build_TermOccurenceTable() failed\n");
				return -1;
			}
			const std::vector<size_t> &chunk = splitter.Get_Chunk();
			const std::vector<CDocumentSplitter::TWorkItem> &offset_list_v2 = splitter.Get_WorkItemList_v2();
			const std::vector<size_t> &occurence_list_v2 = splitter.Get_TermOccurenceList_v2();
			const std::vector<CDocumentSplitter::TPass> &pass_list_v2 = splitter.Get_PassList_v2();
			// build term occurence table v2

			for(size_t i = 0, n = pass_list_v2.size(); i < n; ++ i) {
				const CDocumentSplitter::TPass &r_pass = pass_list_v2[i];

				printf("pass %d : %s%s\n", i, (r_pass.b_primary)? "primary" : "secondary",
					(r_pass.b_slice_aligned)? ", slice-aligned" : "");

				size_t n_min_length = n_max_slice_length, n_max_length = 0;
				uint64_t n_pass_length_sum = 0;
				for(size_t j = r_pass.n_offset, m = r_pass.n_offset + r_pass.n_length; j < m; ++ j) {
					/*printf("\t\twork-item %d : term %d, %d occurences\n", j,
						offset_list_v2[j].n_term_id, offset_list_v2[j].n_length);* / // there's p to 10000, it's impractical to print them all
					n_pass_length_sum += offset_list_v2[j].n_length;
					if(n_min_length > offset_list_v2[j].n_length)
						n_min_length = offset_list_v2[j].n_length;
					if(n_max_length < offset_list_v2[j].n_length)
						n_max_length = offset_list_v2[j].n_length;
				}
				float f_avg_length = float(n_pass_length_sum) / r_pass.n_length;
				printf("\ttotal %d work-items : %g occurences in average (%d to %d)\n", r_pass.n_length, f_avg_length, n_min_length, n_max_length);
				for(size_t j = 0, m = r_pass.summation_list.size(); j < m; ++ j) {
					const CDocumentSplitter::TPass::TSummationStep &r_sum_step = r_pass.summation_list[j];
					printf("\tsummation step %4d : dummy vector banks %5d trough %5d (%3d) sum to term %d\n",
						j, r_sum_step.n_offset, r_sum_step.n_offset + r_sum_step.n_length,
						r_sum_step.n_length, r_sum_step.n_term_id);
					for(size_t k = r_pass.n_offset + r_sum_step.n_offset,
					   o = r_pass.n_offset + r_sum_step.n_offset + r_sum_step.n_length; k < o; ++ k) {
						/*printf("\t\twork-item %d : term %d, %d occurences\n", k,
							offset_list_v2[k].n_term_id, offset_list_v2[k].n_length);* /
					}
				}
			}
		} while(splitter.Prepare_NextChunk());
	}*/
	// test splitting tasks by occurences to maintain higher GPU load

	TTermScalar *p_vectors_gpu = 0;
	TTermScalar *p_vectors_retrain_gpu = 0;
	// term vectors generated on GPU

	/*if(n_BuildTermVectors_GPU_v1(&p_vectors_gpu, n_vector_length, n_seed_length,
	   n_window_size, b_profiling, n_occurence_slice, terms, n_dummy_term, documents, p_seeds))
		return -1;*/
	// build term vectors on GPU (v1 of the algorithm)

	if(!BuildTermVectors_GPU_v2(&p_vectors_gpu, n_vector_length, n_seed_length,
	   n_window_size, b_profiling, terms, n_dummy_term, documents, p_seeds))
		return -1;
	// build term vectors on GPU (v2 of the algorithm)

#ifdef RETRAIN
	Sleep(1000);
	// whoa

	if(!RetrainTermVectors_GPU(&p_vectors_retrain_gpu, n_vector_length,
	   n_window_size, b_profiling, terms, n_dummy_term, documents, p_seed_vectors))
		return -1;
	// generate retrained vectors on GPU
#endif //RETRAIN

	if(b_profiling)
		return 0; // gpu profiling

	{
		__FuncGuard("GenerateTermVectors");

		double f_tv_generate_start = timer.f_Time();

		for(size_t i = 0, n = documents.size(); i < n; ++ i) {
			const TDocument &r_doc = documents[i];
			for(size_t j = 0, m = r_doc.term_position_list.size(); j < m; ++ j) {
				TTermScalar *p_vector = p_vectors + n_vector_length * r_doc.term_position_list[j];
				int n_wnd_off = -int(min(j, size_t(n_window_size)));
				size_t n_min = j + n_wnd_off;
				size_t n_max = min(m, j + n_window_size + 1);
				for(size_t jj = n_min; jj < n_max; ++ jj, ++ n_wnd_off) {
					_ASSERTE((!n_wnd_off) == (jj == j)); // window offset is null *iff* jj equals j
					if(!n_wnd_off)
						continue;
					// skip self

#if 1
					int n_rotation = n_wnd_off;
					while(n_rotation < 0) // could avoid while by calculating smallest multiple of n_seed_length greater or equal to n_vector_length
						n_rotation += int(n_vector_length);
					// make sure rotation is positive, modulo arithmetic will do the rest
#else
					int n_rotation = (n_wnd_off > 0)? 1 : n_vector_length - 1; // different way to do it ...
					/*if(n_rotation < 0)
						n_rotation += n_vector_length;*/ // moved up
#endif
					// calculate rotation

					const CSeedOps::TSeed *p_seed = p_seeds + n_seed_length * r_doc.term_position_list[jj];
					for(size_t k = 0; k < n_seed_length; ++ k) {
						CSeedOps::TSeed s = p_seed[k];
						p_vector[(CSeedOps::n_Seed_Index(s) + n_rotation) % n_vector_length] += CSeedOps::n_Seed_Value(s);
					}
					// add seed to the vector
				}
			}
		}

		double f_tv_generate_time = timer.f_Time() - f_tv_generate_start;
		printf("generating term vectors on CPU took " "|%.5f|"/*PRItimeprecise*/ " secs\n", /*PRItimeparams*/(f_tv_generate_time));
	}
	// generate term vectors

	if(p_vectors_gpu) {
		if(!memcmp(p_vectors_gpu, p_vectors, n_vector_length * sizeof(TTermScalar) * terms.size())) {
			if(b_verbose)
				printf("term vectors generated on GPU are ok :)\n");
		} else {
			fprintf(stderr, "error: term vectors generated on GPU differ from naive algorithm!\n");
			for(size_t i = 0, n = terms.size(); i < n; ++ i) {
				if(memcmp(p_vectors_gpu + i * n_vector_length, p_vectors + i * n_vector_length, n_vector_length * sizeof(TTermScalar)))
					fprintf(stderr, "error: term vector %d is wrong\n", i);
			}
		}
	}
	// check GPU output (check it here to allow for quick testing of GPU
	// pipeline without waiting for CPU to confirm the results are correct)

	if(p_vectors_retrain_gpu) {
		if(!memcmp(p_vectors_retrain_gpu, p_vectors, n_vector_length * sizeof(TTermScalar) * terms.size())) {
			if(b_verbose)
				printf("retrain term vectors generated on GPU are ok :)\n");
		} else {
			fprintf(stderr, "error: retrain term vectors generated on GPU differ from naive algorithm!\n");
			for(size_t i = 0, n = terms.size(); i < n; ++ i) {
				if(memcmp(p_vectors_retrain_gpu + i * n_vector_length, p_vectors + i * n_vector_length, n_vector_length * sizeof(TTermScalar)))
					fprintf(stderr, "error: term vector %d is wrong\n", i);
			}
		}
	}
	// check GPU output (check it here to allow for quick testing of GPU
	// pipeline without having to wait for CPU to confirm the results are correct)

	/*{
		__FuncGuard("GenerateTermVectors_CPU");

		double f_tv_generate_start = timer.f_Time();

		const size_t n_chunk_size = 1 << 16; // 64k chunks turn out to work best with splitter
		CDocumentSplitter splitter(documents, n_dummy_term, n_chunk_size, n_window_size);
		if(!splitter.Prepare_FirstChunk()) {
			fprintf(stderr, "error: splitter.Prepare_FirstChunk() failed\n");
			return -1;
		}
		// prepare splitter ..

		do {
			if(!splitter.Build_TermOccurenceTable()) {
				fprintf(stderr, "error: splitter.Build_TermOccurenceTable() failed\n");
				return -1;
			}

			const std::vector<size_t> &chunk = splitter.Get_Chunk();
			const std::vector<size_t> &offset_list = splitter.Get_TermOccurence_OffsetList();
			const std::vector<size_t> &occurence_list = splitter.Get_TermOccurenceList();

			const size_t *p_chunk = &chunk[0];
			const size_t *p_offset = &offset_list[0];
			const size_t n_offset_num = offset_list.size();
			const size_t *p_occurence_list = &occurence_list[0];

			for(size_t i = 0; i < n_offset_num; ++ i, ++ p_offset) {
				const size_t *p_occurence = p_occurence_list + *p_offset;
				// get head of occurence list

				size_t n_term_id = *p_occurence ++;
				size_t n_occurence_num = *p_occurence ++;
				// get term id and number of occurences

				TTermScalar *p_vector = p_vectors_cpu + n_vector_length * n_term_id;
				// get term vector (output)

				for(const size_t *p_end = p_occurence + n_occurence_num; p_occurence != p_end; ++ p_occurence) {
					size_t n_position = *p_occurence;
					_ASSERTE(p_chunk[n_position] == n_term_id); // reported term should be on that position
					_ASSERTE(n_position >= n_window_size && n_position < chunk.size() - n_window_size); // we won't check array boundaries, right?
					// get position

					int n_wnd_off = -int(n_window_size);
					size_t n_min = n_position - n_window_size;
					size_t n_max = n_position + n_window_size + 1;
					for(size_t k = n_min; k < n_max; ++ k, ++ n_wnd_off) {
						if(k == n_position)
							continue;
						// skip self

#if 1
						int n_rotation = n_wnd_off;
						while(n_rotation < 0) // could avoid while by calculating smallest multiple of n_seed_length greater or equal to n_vector_length
							n_rotation += n_vector_length;
						// make sure rotation is positive, modulo arithmetic will do the rest
#else
						int n_rotation = (n_wnd_off > 0)? 1 : n_vector_length - 1; // different way to do it ...
						/*if(n_rotation < 0)
							n_rotation += n_vector_length;* / // moved up
#endif
						// calculate rotation

						const CSeedOps::TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];
						for(size_t l = 0; l < n_seed_length; ++ l) {
							CSeedOps::TSeed s = p_seed[l];
							p_vector[(CSeedOps::n_Seed_Index(s) + n_rotation) % n_vector_length] += CSeedOps::n_Seed_Value(s);
						}
						// add seed to the vector
					}
				}
				// loop over term occurences
			}
		} while(splitter.Prepare_NextChunk());

		double f_tv_generate_time = timer.f_Time() - f_tv_generate_start;
		printf("generating term vectors on CPU took " "|%.5f|"/*PRItimeprecise* / " secs\n", /*PRItimeparams* /(f_tv_generate_time));

		TTermScalar *p_vector = p_vectors_cpu + n_vector_length * n_dummy_term;
		for(size_t i = 0; i < n_vector_length; ++ i) {
			if(p_vector[i]) {
				fprintf(stderr, "error: term vector for dummy term is nonzero!\n");
				break;
			}
		}
		// make sure dummy term vector is null vector

		if(!memcmp(p_vectors_cpu, p_vectors, n_vector_length * sizeof(TTermScalar) * terms.size()))
			printf("term vectors generated on CPU are ok :)\n");
		else
			fprintf(stderr, "error: term vectors generated on CPU differ from naive algorithm!\n");
		// check output
	}*/
	// runs exactly the same algorithm GPU does (debugging)

	/*{
		__FuncGuard("NormalizeTermVectors");

		double f_tv_normalize_start = timer.f_Time();

		for(size_t i = 0, n = terms.size(); i < n; ++ i) {
			float *p_vector = p_vectors + i * n_vector_length;

			float f_length = 0;
			for(size_t j = 0; j < n_vector_length; ++ j)
				f_length += p_vector[j] * p_vector[j]; // this may not be very precise for long vectors
			f_length = float(1 / sqrt(f_length));

			for(size_t j = 0; j < n_vector_length; ++ j)
				p_vector[j] *= f_length;
		}

		double f_tv_normalize_time = timer.f_Time() - f_tv_normalize_start;
		printf("normalizing term vectors took " "|%.5f|"/*PRItimeprecise* / " secs\n", /*PRItimeparams* /(f_tv_normalize_time));
	}*/
	// normalize term vectors

#ifdef RETRAIN
	{
		__FuncGuard("RetrainTermVectors_CPU");

		double f_tv_retrain_start = timer.f_Time();

		const size_t n_chunk_size = 1 << 16; // 64k chunks turn out to work best with splitter
		CDocumentSplitter splitter(documents, n_dummy_term, n_chunk_size, n_window_size);
		if(!splitter.Prepare_FirstChunk()) {
			fprintf(stderr, "error: splitter.Prepare_FirstChunk() failed\n");
			return -1;
		}
		// prepare splitter ..
	
		do {
			if(!splitter.Build_TermOccurenceTable()) {
				fprintf(stderr, "error: splitter.Build_TermOccurenceTable() failed\n");
				return -1;
			}

			const std::vector<CDocumentSplitter::size_t> &chunk = splitter.Get_Chunk();
			const std::vector<CDocumentSplitter::size_t> &offset_list = splitter.Get_TermOccurence_OffsetList();
			const std::vector<CDocumentSplitter::size_t> &occurence_list = splitter.Get_TermOccurenceList();

			const CDocumentSplitter::size_t *p_chunk = &chunk[0];
			const CDocumentSplitter::size_t *p_offset = &offset_list[0];
			const size_t n_offset_num = offset_list.size();
			const CDocumentSplitter::size_t *p_occurence_list = &occurence_list[0];

			for(size_t i = 0; i < n_offset_num; ++ i, ++ p_offset) {
				const CDocumentSplitter::size_t *p_occurence = p_occurence_list + *p_offset;
				// get head of occurence list

				size_t n_term_id = *p_occurence ++;
				size_t n_occurence_num = *p_occurence ++;
				// get term id and number of occurences

				TTermScalar *p_vector = p_vectors_retrain + n_vector_length * n_term_id;
				// get term vector (output)

				size_t n_starting_rotation = n_Align_Up(n_window_size, n_vector_length) - n_window_size;
				size_t n_starting_rotation2 = n_vector_length + n_window_size;
				for(const CDocumentSplitter::size_t *p_end = p_occurence + n_occurence_num; p_occurence != p_end; ++ p_occurence) {
					size_t n_position = *p_occurence;
					_ASSERTE(p_chunk[n_position] == n_term_id); // reported term should be on that position
					_ASSERTE(n_position >= n_window_size && n_position < chunk.size() - n_window_size); // we won't check array boundaries, right?
					// get position

					int n_wnd_off = -int(n_window_size);
					size_t n_min = n_position - n_window_size;
					size_t n_max = n_position + n_window_size + 1;
					size_t n_rotation = n_starting_rotation;
					size_t n_rotation2 = n_starting_rotation2;
					for(size_t k = n_min; k < n_max; ++ k, ++ n_wnd_off, ++ n_rotation, -- n_rotation2) {
						if(k == n_position)
							continue;
						// skip self

						const TTermScalar *p_seed = p_seed_vectors + n_vector_length * p_chunk[k];
						for(size_t l = 0; l < n_vector_length; ++ l) {
							//p_vector[(l + n_rotation) % n_vector_length] += p_seed[l];
							p_vector[l] += p_seed[(l + n_rotation2) % n_vector_length];
						}
						// add previously trained vector to the vector
					}
				}
				// loop over term occurences
			}
		} while(splitter.Prepare_NextChunk());

		double f_tv_retrain_time = timer.f_Time() - f_tv_retrain_start;
		printf("retraining term vectors on CPU took " "|%.5f|"/*PRItimeprecise*/ " secs\n", /*PRItimeparams*/(f_tv_retrain_time));

		TTermScalar *p_vector = p_vectors_retrain + n_vector_length * n_dummy_term;
		for(size_t i = 0; i < n_vector_length; ++ i) {
			if(p_vector[i]) {
				fprintf(stderr, "error: term vector for dummy term is nonzero!\n");
				break;
			}
		}
		// make sure dummy term vector is null vector

		if(!memcmp(p_vectors_retrain, p_vectors, n_vector_length * sizeof(TTermScalar) * terms.size())) {
			if(b_verbose)
				printf("retrain term vectors generated on CPU are ok :)\n");
		} else
			fprintf(stderr, "error: retrain term vectors generated on CPU differ from naive algorithm!\n");
		// check output
	}
	// generate retrained vectors
#endif //RETRAIN

	delete[] p_vectors_retrain;
	delete[] p_vectors;
	//delete[] p_vectors_cpu;
	delete[] p_vectors_gpu; // caller responsibility
	delete[] p_vectors_retrain_gpu; // caller responsibility
	delete[] p_seed_vectors;
	delete[] p_seeds;
	// cleanup

	return 0;
}

/*#define __ENABLE_DUFFS_DEVICE__

#define __kernel
#define __global
#define TSeed CSeedOps::TSeed
#define n_Seed_Value(s) CSeedOps::n_Seed_Value(s)
#define n_Seed_Index(s) CSeedOps::n_Seed_Index(s)

unsigned int get_global_id(unsigned int n_dimension)
{
	return 0;
}

__kernel void BuildTermVectors_Seed_POT_TrueRot(const unsigned int n_vector_length, __global int *p_vectors_gpu,
	__global const unsigned int *p_chunk, __global const unsigned int *p_offset, const unsigned int n_offset_num,
	__global const unsigned int *p_occurence_list, const unsigned int n_seed_length,
	const unsigned int n_window_size, __global const TSeed *p_seeds)
{
	unsigned int i = get_global_id(0);
	if(i >= n_offset_num)
		return;
	// each thread handles a single term in the list

	__global const unsigned int *p_occurence = p_occurence_list + p_offset[i];
	// get head of occurence list

	unsigned int n_term = *p_occurence ++;
	unsigned int n_occurence_num = *p_occurence ++;
	// get term id and number of occurences

	__global int *p_vector = p_vectors_gpu + n_vector_length * n_term;
	// get term vector (output)

	const unsigned int n_vector_length_mask = n_vector_length - 1;
	// n_vector_length is power of two

	const unsigned int n_nearest_greater_vector_length = (n_window_size + n_vector_length_mask) & ~n_vector_length_mask;
	// smallest multiple of n_vector_length greater or equal to n_window_size

	for(unsigned int j = 0; j < n_occurence_num; ++ j) {
		unsigned int n_position = p_occurence[j];

		{
			unsigned int n_rotation = n_nearest_greater_vector_length - n_window_size;
			unsigned int n_min = n_position - n_window_size;
			for(unsigned int k = n_min; k < n_position; ++ k, ++ n_rotation) {
				__global const TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];

#ifndef __ENABLE_DUFFS_DEVICE__
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				}
#else //__ENABLE_DUFFS_DEVICE__
				unsigned int n_repeat_num = (n_seed_length + 7) / 8;
				TSeed s;
				switch(n_seed_length % 8) {
					do {
				case 0:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 7:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 6:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 5:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 4:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 3:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 2:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 1:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
					} while(-- n_repeat_num);
				}
				// add seed to the vector (loop unrolled by 8 using duff's device)
#endif //__ENABLE_DUFFS_DEVICE__
				// add seed to the vector
			}
			// terms left of focused term
		}
		{
			unsigned int n_rotation = 1;
			unsigned int n_max = n_position + n_window_size + 1;
			for(unsigned int k = n_position + 1; k < n_max; ++ k, ++ n_rotation) {
				__global const TSeed *p_seed = p_seeds + n_seed_length * p_chunk[k];

#ifndef __ENABLE_DUFFS_DEVICE__
				for(unsigned int l = 0; l < n_seed_length; ++ l) {
					TSeed s = p_seed[l];
					p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				}
#else //__ENABLE_DUFFS_DEVICE__
				unsigned int n_repeat_num = (n_seed_length + 7) / 8;
				TSeed s;
				switch(n_seed_length % 8) {
					do {
				case 0:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 7:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 6:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 5:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 4:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 3:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 2:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
				case 1:
						s = *p_seed ++; p_vector[(n_Seed_Index(s) + n_rotation) & n_vector_length_mask] += n_Seed_Value(s);
					} while(-- n_repeat_num);
				}
				// add seed to the vector (loop unrolled by 8 using duff's device)
#endif //__ENABLE_DUFFS_DEVICE__
				// add seed to the vector
			}
			// terms right of focused term
		}
		// loop over window
	}
	// loop over term occurences
}*/
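// duff's device "C" test (the commented-out block above stubs out the OpenCL
// qualifiers and get_global_id() so that the kernel can be compiled as plain "C")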
