/**
 *	@file ProcessTermVectors/Main.cpp
 *	@author -tHE SWINe-
 *	@brief Utility for extracting plain term vectors from CLucene positional index.
 *	@date 2010-07-17
 */

#include "../../UberLame_src/CallStack.h"
#include <stdio.h>
#include <numeric>
#include <hash_map>
#include <map>
#include <list>
#include <vector>
#include <string>
#include <CL/opencl.h>
#include "../../UberLame_src/Integer.h"
#include "../../UberLame_src/MinMax.h"
#include "../../UberLame_src/gpgpu/ClUtils.h"
#include "../../UberLame_src/Dir.h"
#include "../../UberLame_src/StlUtils.h"
#include "../../UberLame_src/Timer.h"
#include "../../UberLame_src/BitArray.h"
#include "../../UberLame_src/Thread.h"
#include "../../UberLame_src/Mersene.h"

#include "DocReader.h"
#include "DocSplitter.h"
#include "LRU_K.h"
#include "PageAllocator.h"

typedef CLuceneIndexReaderInterface::wstring wstring; /**< copy of the wstring type to the global namespace */
typedef CDocStreamSplitter::term_id_t term_id_t; /**< copy this to the global namespace */

#include "GPU_Allocator.h"
#include "GPU_RI_Memory.h"
#include "ChunkProducer.h"

//#define PATH_PREFIX "e:\\school-dvi4\\gigaword\\"
#define PATH_PREFIX "n:\\downloads\\"

#define DATA_SUBSET "-cna_eng-"
//#define DATA_SUBSET "-nyt_eng-"
//#define DATA_SUBSET "-apw_eng-"
#ifndef DATA_SUBSET
#define DATA_SUBSET "-"
#endif //DATA_SUBSET
// lazy me ... paths for the job / home machine



/**
 *	@brief host-side bookkeeping record of a single term vector
 *
 *	A default-constructed record has no host storage, no GPU slot
 *	(debug builds only) and is flagged as null.
 */
struct TTermVectorAlloc {
	int32_t *p_host_vector; /**< @brief pointer to the host copy of the term vector (0 if there is none) */
#ifdef _DEBUG
	uint32_t n_gpu_slot; /**< @brief GPU slot index, kept in debug builds only; uint32_t(-1) means no slot */
#endif //_DEBUG
	bool b_null; /**< @brief null flag; set on construction */

	/**
	 *	@brief default constructor; sets up an empty (null) record
	 */
	inline TTermVectorAlloc()
	{
		p_host_vector = 0;
#ifdef _DEBUG
		n_gpu_slot = uint32_t(-1);
#endif //_DEBUG
		b_null = true;
	}
};

class CRI_on_GPU {
public:
	typedef int TTermScalar; /**< @brief term vector element type */

	/**
	 *	@brief defines operations on seed vector elements
	 *
	 *	A seed element is a 16-bit word: bit 15 is the sign flag, bit 14 is
	 *	the value flag and bits 0 - 13 hold the index to the term vector.
	 */
	class CSeedOps {
	public:
		/**
		 *	@brief seed vector elements type (scalar)
		 *
		 *	Seed vector is a sparse vector, containing +1's, -1's and 0's (dummy vector).
		 *	TSeed contains pair of values: value and index. Value is one of -1, 0 or +1,
		 *	index is (zero-based) offset of this value in the vector.
		 */
		typedef unsigned short TSeed;

		/**
		 *	@brief seed vector constants
		 */
		enum {
			seed_SignBit = 1 << 15, /**< @brief location of sign bit */
			seed_ValueBit = 1 << 14, /**< @brief location of value bit */
			seed_ValueShift = 14, /**< @brief shift of value bit */
			seed_IndexMask = seed_ValueBit - 1 /**< @brief mask of index to term vector */
		};

		/**
		 *	@brief gets seed value from seed element
		 *
		 *	@param[in] s is seed element (generated by n_RandomSeed() or n_DummySeed())
		 *
		 *	@return Returns seed value (-1, 0 or +1).
		 *
		 *	@note This reinterprets the element as a signed 16-bit integer and shifts it
		 *		right, so it relies on the compiler performing an arithmetic (sign-extending)
		 *		right shift of negative values.
		 */
		static inline int n_Seed_Value(TSeed s)
		{
			return (short(s) >> seed_ValueShift);
		}

		/**
		 *	@brief gets index to term vector from seed element
		 *
		 *	@param[in] s is seed element (generated by n_RandomSeed() or n_DummySeed())
		 *
		 *	@return Returns index to term vector.
		 */
		static inline int n_Seed_Index(TSeed s)
		{
			return s & seed_IndexMask;
		}

		/**
		 *	@brief generates a new random seed
		 *
		 *	@param[in] n_vector_length is term vector length (the index is OR-ed together
		 *		with the flag bits, so it needs to fit below seed_ValueBit)
		 *	@param[in] r_twister is random number generator
		 *
		 *	@return Returns new random seed element with the value bit set, a random sign
		 *		and a random index in the range [0, n_vector_length).
		 */
		static inline TSeed n_RandomSeed(size_t n_vector_length, CMerseneTwister &r_twister)
		{
			uint32_t n_rand = r_twister.genrand_int32();
			int n_index = int((double(n_rand) / (double(UINT32_MAX) + 1)) * n_vector_length);
			// scale by 2^32 rather than by UINT32_MAX so that the quotient stays strictly
			// below 1; dividing by UINT32_MAX would yield index n_vector_length (one past
			// the end of the vector) for n_rand == UINT32_MAX

			return TSeed(n_index | seed_ValueBit | ((n_rand & 1)? seed_SignBit : 0));
			// use lowest bit as sign; that doesn't matter as long as vector length stays below half of UINT32_MAX (safe assumption)
		}

		/**
		 *	@brief gets value of dummy seed element for dummy vector
		 *	@return Returns value of dummy seed element (all bits zero, i.e. value 0 at index 0).
		 */
		static inline TSeed n_DummySeed()
		{
			return 0;
		}

		/**
		 *	@brief less-than ordering function for seed elements
		 *
		 *	@param[in] a is first seed element
		 *	@param[in] b is second seed element
		 *
		 *	@return Returns true if n_Seed_Index() of a is smaller than that of b, otherwise returns false.
		 */
		static inline bool b_Seed_Index_Smaller(TSeed a, TSeed b)
		{
			return n_Seed_Index(a) < n_Seed_Index(b);
		}
	};

protected:
	cl_context h_context; /**< @brief OpenCL context; the constructor resets it to 0 to flag failed initialization (see b_Status()) */
	cl_device_id h_device; /**< @brief OpenCL device the program is compiled for */
	cl_command_queue h_cmd_queue; /**< @brief command queue all the transfers and kernel launches are enqueued to */

	const size_t n_vector_length; /**< @brief term vector length, in elements (passed to kernels as JIT_VECTOR_LENGTH) */
	const size_t n_seed_length; /**< @brief seed vector length, in elements (passed to kernels as JIT_SEED_LENGTH) */
	const size_t n_window_size; /**< @brief window size, passed to kernels as JIT_WINDOW_SIZE (note it is initialized from the _n_halfwindow_size constructor argument) */
	//const size_t n_term_num;
	const size_t n_vector_num; /**< @brief number of term vectors stored in dp_vectors */
	const size_t n_seed_num;					// number of seed vectors in dp_seeds; @todo - remove unnecessary members; some of that can simply be passed as args to initialization functions
	const size_t n_chunk_size; /**< @brief capacity of dp_chunk, dp_work_item_list and dp_occurence_list, in items */
	const size_t n_max_pass_size; /**< @brief stored constructor argument (not read by the code of this class) */
	const size_t n_dummy_vector_bank_num; /**< @brief number of vector-sized banks in dp_dummy_vector_banks */
	const size_t n_max_slice_length; /**< @brief maximal slice length (passed to kernels as JIT_MAX_SLICE_LENGTH) */
	const size_t n_min_primary_pass_size; /**< @brief stored constructor argument (not read by the code of this class) */
	const size_t n_min_last_primary_pass_size; /**< @brief stored constructor argument (not read by the code of this class) */

	size_t n_sumstep_local_work_size; /**< @brief local work size of the summation step kernel (derived from n_vector_length) */
	size_t n_sumstep_global_work_size; /**< @brief global work size of the summation step kernel (n_vector_length aligned up to the local work size) */

	cl_program h_program; /**< @brief compiled OpenCL program the kernels below come from */
	cl_kernel h_ZeroMemory16, h_ZeroMemory32, h_SummationStep_JIT,
		h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems,
		h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned; // kernel handles (0 until created by CompileLink_Kernels())

	cl_mem dp_chunk, dp_work_item_list, dp_occurence_list,
		dp_vectors, dp_dummy_vector_banks, dp_seeds; // device buffers (0 until created by Alloc_DeviceBuffers())

public:
	/**
	 *	@brief constructor; compiles the OpenCL program and allocates device buffers
	 *
	 *	@param[in] _h_context is OpenCL context (must not be 0)
	 *	@param[in] _h_device is OpenCL device to compile the program for
	 *	@param[in] _h_cmd_queue is command queue to enqueue all the work to
	 *	@param[in] _n_vector_length is term vector length, in elements
	 *	@param[in] _n_seed_length is seed vector length, in elements
	 *	@param[in] _n_halfwindow_size is window size (stored as n_window_size)
	 *	@param[in] _n_vector_num is number of term vectors to allocate on the device
	 *	@param[in] _n_seed_num is number of seed vectors to allocate on the device
	 *	@param[in] _n_chunk_size is maximal chunk size, in items
	 *	@param[in] _n_max_pass_size is maximal pass size
	 *	@param[in] _n_dummy_vector_bank_num is number of dummy vector banks to allocate
	 *	@param[in] _n_max_slice_length is maximal slice length
	 *	@param[in] _n_min_primary_pass_size is minimal primary pass size
	 *	@param[in] _n_min_last_primary_pass_size is minimal size of the last primary pass
	 *
	 *	@note This does not throw; call b_Status() to see whether the initialization succeeded.
	 */
	CRI_on_GPU(cl_context _h_context, cl_device_id _h_device, cl_command_queue _h_cmd_queue,
		size_t _n_vector_length, size_t _n_seed_length, size_t _n_halfwindow_size,
		size_t _n_vector_num, size_t _n_seed_num, size_t _n_chunk_size, size_t _n_max_pass_size,
		size_t _n_dummy_vector_bank_num, size_t _n_max_slice_length, size_t _n_min_primary_pass_size,
		size_t _n_min_last_primary_pass_size)
		:h_context(_h_context), h_device(_h_device), h_cmd_queue(_h_cmd_queue),

		n_vector_length(_n_vector_length), n_seed_length(_n_seed_length), n_window_size(_n_halfwindow_size),
		n_vector_num(_n_vector_num), n_seed_num(_n_seed_num), n_chunk_size(_n_chunk_size),
		n_max_pass_size(_n_max_pass_size), n_dummy_vector_bank_num(_n_dummy_vector_bank_num),
		n_max_slice_length(_n_max_slice_length), n_min_primary_pass_size(_n_min_primary_pass_size),
		n_min_last_primary_pass_size(_n_min_last_primary_pass_size),

		h_program(0), h_ZeroMemory16(0), h_ZeroMemory32(0), h_SummationStep_JIT(0),
		h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems(0),
		h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned(0),

		dp_chunk(0), dp_work_item_list(0), dp_occurence_list(0),
		dp_vectors(0), dp_dummy_vector_banks(0), dp_seeds(0)
	{
		_ASSERTE(h_context != 0); // 0 is supposed to be an invalid context value

		n_sumstep_local_work_size = (n_vector_length > 16384)? 512 :
									(n_vector_length > 8192)? 256 : 128; // no need to go lower here
		n_sumstep_global_work_size = n_Align_Up(n_vector_length, n_sumstep_local_work_size);
		// determine summation step launch params first; CompileLink_Kernels() reads
		// n_sumstep_local_work_size when formatting the compiler options, so this
		// must not be left uninitialized at that point

		if(!CompileLink_Kernels() || !Alloc_DeviceBuffers()) {
			h_context = 0; // to mark error
			return;
		}
		// compile and link OpenCL program, then alloc buffers on GPU
		// (whatever was created before a failure is released by the destructor)
	}

	/**
	 *	@brief destructor; releases kernels, device buffers and the program
	 *
	 *	Handles that are 0 (never created) are skipped. The context, the device
	 *	and the command queue are not released here.
	 */
	~CRI_on_GPU()
	{
		const cl_kernel p_kernel_list[] = {
			h_ZeroMemory16,
			h_ZeroMemory32,
			h_SummationStep_JIT,
			h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems,
			h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned
		};
		for(size_t n_kernel = 0; n_kernel < sizeof(p_kernel_list) / sizeof(p_kernel_list[0]); ++ n_kernel) {
			if(p_kernel_list[n_kernel])
				clReleaseKernel(p_kernel_list[n_kernel]);
		}
		// free kernels

		const cl_mem p_buffer_list[] = {
			dp_vectors,
			dp_dummy_vector_banks,
			dp_seeds,
			dp_chunk,
			dp_occurence_list,
			dp_work_item_list
		};
		for(size_t n_buffer = 0; n_buffer < sizeof(p_buffer_list) / sizeof(p_buffer_list[0]); ++ n_buffer) {
			if(p_buffer_list[n_buffer])
				clReleaseMemObject(p_buffer_list[n_buffer]);
		}
		// free GPU memory

		if(h_program)
			clReleaseProgram(h_program);
		// release other OpenCL objects
	}

	/**
	 *	@brief gets initialization status
	 *	@return Returns true if the constructor succeeded, false if it
	 *		flagged an error (by resetting the context handle to 0).
	 */
	bool b_Status() const
	{
		const bool b_initialized = !(h_context == 0);
		return b_initialized;
	}

	/**
	 *	@brief sets a range of term vectors on the device to zero
	 *
	 *	@param[in] n_first_vec is zero-based index of the first vector to clear
	 *	@param[in] _n_vector_num is number of vectors to clear
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Clear_Vectors(size_t n_first_vec, size_t _n_vector_num)
	{
		_ASSERTE(_n_vector_num <= SIZE_MAX);
		_ASSERTE(n_first_vec <= SIZE_MAX - _n_vector_num);
		// just make sure the sum below wouldn't overflow

		_ASSERTE(n_first_vec < n_vector_num);
		_ASSERTE(n_first_vec + _n_vector_num <= n_vector_num);
		// array bounds check

		_ASSERTE(sizeof(TTermScalar) == sizeof(int32_t));
		// term vectors should use 32-bit elements

		const size_t n_first_element = n_first_vec * n_vector_length;
		const size_t n_element_num = _n_vector_num * n_vector_length;
		// convert vectors to elements

		const int n_result = n_ZeroMemory32_GPU(dp_vectors, n_first_element, n_element_num);
		return n_result == CL_SUCCESS;
	}

	/*bool Clear_Seeds(size_t n_first_seed, size_t _n_seed_num = 1)
	{
		_ASSERTE(_n_seed_num <= SIZE_MAX && n_first_seed <= SIZE_MAX - _n_seed_num); // just make sure the next line wouldn't overflow
		_ASSERTE(n_first_seed < n_seed_num && n_first_seed + _n_seed_num <= n_seed_num); // array bounds check
		_ASSERTE(sizeof(CSeedOps::TSeed) == sizeof(uint16_t)); // seed vectors should use 16-bit elements

		if(n_seed_length & 1) {
			return n_ZeroMemory16_GPU(dp_seeds, n_first_seed * n_seed_length,
				_n_seed_num * n_seed_length) == CL_SUCCESS;
		} else {
			return n_ZeroMemory32_GPU(dp_seeds, n_first_seed * n_seed_length / 2,
				_n_seed_num * n_seed_length / 2) == CL_SUCCESS;
			// this happens to be aligned (and possibly faster)
		}
	}*/

	/**
	 *	@brief uploads seed vectors to the device (blocking)
	 *
	 *	@param[in] p_src is pointer to the seed data to upload
	 *		(_n_seed_num seed vectors, n_seed_length elements each, stored contiguously)
	 *	@param[in] n_first_seed is zero-based index of the first seed vector to overwrite
	 *	@param[in] _n_seed_num is number of seed vectors to upload
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Upload_Seeds(const CSeedOps::TSeed *p_src, size_t n_first_seed, size_t _n_seed_num = 1)
	{
		_ASSERTE(_n_seed_num <= SIZE_MAX && n_first_seed <= SIZE_MAX - _n_seed_num); // just make sure the next line wouldn't overflow
		_ASSERTE(n_first_seed < n_seed_num && n_first_seed + _n_seed_num <= n_seed_num); // array bounds check
		_ASSERTE(sizeof(CSeedOps::TSeed) == sizeof(uint16_t)); // seed vectors should use 16-bit elements

		const size_t n_seed_size_bytes = n_seed_length * sizeof(CSeedOps::TSeed);
		// size of a single seed vector; dp_seeds holds n_seed_num of these, so both
		// the offset and the size need to be scaled by it (indices are in vectors, not elements)

		return clEnqueueWriteBuffer(h_cmd_queue, dp_seeds, CL_TRUE, n_first_seed * n_seed_size_bytes,
			_n_seed_num * n_seed_size_bytes, p_src, 0, 0, 0) == CL_SUCCESS;
	}

	bool Upload_Vectors(const TTermScalar *p_src, size_t n_first_vec, size_t _n_vector_num)
	{
		_ASSERTE(_n_vector_num <= SIZE_MAX && n_first_vec <= SIZE_MAX - _n_vector_num); // just make sure the next line wouldn't overflow
		_ASSERTE(n_first_vec < n_vector_num && n_first_vec + _n_vector_num <= n_vector_num); // array bounds check
		_ASSERTE(sizeof(TTermScalar) == sizeof(int32_t)); // term vectors should use 32-bit elements

		return clEnqueueWriteBuffer(h_cmd_queue, dp_vectors, CL_TRUE, n_first_vec * sizeof(TTermScalar),
			_n_vector_num * sizeof(TTermScalar), p_src, 0, 0, 0) == CL_SUCCESS;
	}

	/**
	 *	@brief downloads term vectors from the device (blocking)
	 *
	 *	@param[out] p_dest is pointer to the destination buffer
	 *		(needs to have space for _n_vector_num * n_vector_length elements)
	 *	@param[in] n_first_vec is zero-based index of the first vector to read
	 *	@param[in] _n_vector_num is number of vectors to download
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Download_Vectors(TTermScalar *p_dest, size_t n_first_vec, size_t _n_vector_num)
	{
		_ASSERTE(_n_vector_num <= SIZE_MAX && n_first_vec <= SIZE_MAX - _n_vector_num); // just make sure the next line wouldn't overflow
		_ASSERTE(n_first_vec < n_vector_num && n_first_vec + _n_vector_num <= n_vector_num); // array bounds check
		_ASSERTE(sizeof(TTermScalar) == sizeof(int32_t)); // term vectors should use 32-bit elements

		const size_t n_vector_size_bytes = n_vector_length * sizeof(TTermScalar);
		// size of a single term vector; indices are in vectors (same as in Clear_Vectors()),
		// so both the offset and the size need to be scaled by it

		return clEnqueueReadBuffer(h_cmd_queue, dp_vectors, CL_TRUE, n_first_vec * n_vector_size_bytes,
			_n_vector_num * n_vector_size_bytes, p_dest, 0, 0, 0) == CL_SUCCESS;
	}

	/**
	 *	@brief processes a single chunk on the device
	 *
	 *	Copies the chunk, the work-item list and the occurence list to the device and
	 *	then, for each pass in the pass list, launches the term vector building kernel.
	 *	Primary passes write to the term vectors directly. Secondary passes write to
	 *	the dummy vector banks (which are cleared beforehand), and are followed by the
	 *	summation steps which accumulate the banks to the term vectors.
	 *
	 *	@param[in] r_chunk_data is the chunk to process
	 *
	 *	@return Returns true on success, false on failure (an error message is printed to stderr).
	 *
	 *	@note The lists are copied using non-blocking writes and this function does not
	 *		wait for the enqueued work, so r_chunk_data needs to stay alive until the
	 *		writes have finished.
	 *	@note NOTE(review): none of the commands uses events; the ordering of the writes and
	 *		of the kernels relies on the command queue executing in order. main() in this
	 *		file creates the queue with CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE - confirm
	 *		that is intended.
	 */
	bool Do_RandomIndexing(const CChunkProducer::TChunkData &r_chunk_data)
	{
		const std::vector<uint32_t> &chunk = r_chunk_data.chunk;
		const std::vector<CDocStreamSplitter::TWorkItem> &work_item_list = r_chunk_data.work_item_list;
		const std::vector<CDocStreamSplitter::chunk_off_t> &occurence_list = r_chunk_data.occurence_list;
		const std::vector<CDocStreamSplitter::TPass> &pass_list = r_chunk_data.pass_list;
		// get term occurence table

		const size_t n_work_item_list_max_size = n_chunk_size;//splitter.n_Max_WorkItemList_v2_Size();
		const size_t n_occurence_list_max_size = n_chunk_size;//splitter.n_Max_OccurenceList_v2_Size();
		// get maximal sizes of work buffers

		_ASSERTE(chunk.size() <= n_chunk_size);
		_ASSERTE(work_item_list.size() <= n_work_item_list_max_size);
		_ASSERTE(occurence_list.size() <= n_occurence_list_max_size);
		// make sure generated lists fit onto GPU lists

		_ASSERTE(sizeof(CDocStreamSplitter::chunk_off_t) == sizeof(uint32_t));
		_ASSERTE(sizeof(CDocStreamSplitter::TWorkItem) == 3 * sizeof(uint32_t));
		// make sure data type sizes match

		{
			int n_result;
			if((n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_chunk, CL_FALSE, 0, chunk.size() * sizeof(uint32_t), &chunk[0], 0, NULL, NULL)) != CL_SUCCESS ||
			   (n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_work_item_list, CL_FALSE, 0, work_item_list.size() * 3 * sizeof(uint32_t), &work_item_list[0], 0, NULL, NULL)) != CL_SUCCESS ||
			   (n_result = clEnqueueWriteBuffer(h_cmd_queue, dp_occurence_list, CL_FALSE, 0, occurence_list.size() * sizeof(uint32_t), &occurence_list[0], 0, NULL, NULL)) != CL_SUCCESS) {
				fprintf(stderr, "error: failed to copy command buffers to GPU (%lu, %lu, %lu / %d)\n",
					static_cast<unsigned long>(chunk.size()), static_cast<unsigned long>(work_item_list.size()),
					static_cast<unsigned long>(occurence_list.size()), n_result);
				// the sizes are size_t, those must not be passed to "%d" (breaks on 64-bit builds)
				return false;
			}
		}
		// copy lists to the GPU

		for(size_t i = 0, n = pass_list.size(); i < n; ++ i) {
			const CDocStreamSplitter::TPass &r_t_pass = pass_list[i];
			// get pass

			/*printf("%s %s pass %2d : ", (r_t_pass.b_primary)? "pri" : "sec",
				(r_t_pass.b_slice_aligned)? "aligned" : "unalign", i);*/
			// verbose

			_ASSERTE(r_t_pass.summation_list.empty() == r_t_pass.b_primary);
			if(!r_t_pass.b_primary) {
				const CDocStreamSplitter::TPass::TSummationStep &r_t_sum = r_t_pass.summation_list.back();
				size_t n_vector_bank_usage = r_t_sum.n_offset + r_t_sum.n_length;
				_ASSERTE(n_vector_bank_usage < n_dummy_vector_bank_num);
				// determine how many dummy vector banks are going to be used
				// (taken from the last summation step of the pass)

				/*printf("clearing %d dummy vector banks\n%20s: ", n_vector_bank_usage, "");*/
				// verbose

				if(n_ZeroMemory32_GPU(dp_dummy_vector_banks, 0, n_vector_bank_usage * n_vector_length) != CL_SUCCESS) {
					fprintf(stderr, "error: ZeroMemory32() failed\n");
					return false;
				}
				// clear dummy vector banks
			}
			// clear dummy vector banks for secondary passes

			/*printf("running btv kernel for %d work-items (%d to %d)\n", r_t_pass.n_length,
				r_t_pass.n_offset, r_t_pass.n_offset + r_t_pass.n_length);*/
			// verbose

			cl_kernel h_kernel = ((r_t_pass.b_slice_aligned)?
				h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned :
				h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems);
			// choose which kernel to use

			clSetKernelArgs7(h_kernel, ((r_t_pass.b_primary)? dp_vectors : dp_dummy_vector_banks), dp_chunk,
				dp_work_item_list, int(r_t_pass.n_offset), int(r_t_pass.n_length), dp_occurence_list, dp_seeds);
			// set kernel params (primary passes write to the term vectors, secondary to the dummy banks)

			size_t n_local_work_size = (r_t_pass.n_length > 16384)? 512 :
									   (r_t_pass.n_length > 8192)? 256 :
									   (r_t_pass.n_length > 4096)? 128 : 32; // aim to fill ~30 GPU's multiprocessors
			size_t n_global_work_size = n_Align_Up(size_t(r_t_pass.n_length), n_local_work_size);
			if(clEnqueueNDRangeKernel(h_cmd_queue, h_kernel, 1,
			   NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL) != CL_SUCCESS) {
				fprintf(stderr, "error: BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems() failed\n");
				return false;
			}
			// call the kernel

			/*if(!r_t_pass.b_primary)
				printf("%20s: running %d summation kernels\n", "", r_t_pass.summation_list.size());*/
			// verbose

			for(size_t j = 0, m = r_t_pass.summation_list.size(); j < m; ++ j) {
				const CDocStreamSplitter::TPass::TSummationStep &r_t_sum = r_t_pass.summation_list[j];
				// get summation pass

				/*printf("\trunning summation kernel for term %5d (dummy vector banks %3d to %3d)\n", r_t_sum.n_term_id,
					r_t_sum.n_offset, r_t_sum.n_offset + r_t_sum.n_length);*/
				// verbose

				size_t n_dest_vector_first_elem = n_vector_length * r_t_sum.n_term_id;
				size_t n_first_bank_first_elem = n_vector_length * r_t_sum.n_offset;
				clSetKernelArgs5(h_SummationStep_JIT, dp_vectors, int(n_dest_vector_first_elem),
					dp_dummy_vector_banks, int(n_first_bank_first_elem), int(r_t_sum.n_length));
				// set kernel params (offsets are in elements, not in vectors)

				if(clEnqueueNDRangeKernel(h_cmd_queue, h_SummationStep_JIT, 1,
				   NULL, &n_sumstep_global_work_size, &n_sumstep_local_work_size, 0, NULL, NULL) != CL_SUCCESS) {
					fprintf(stderr, "error: SummationStep_JIT() failed\n");
					return false;
				}
				// call the kernel
			}
			// proceed with summation steps
		}

		return true;
	}

protected:
	/**
	 *	@brief equivalent of memset(dp_buffer, 0, n_size_elements * sizeof(uint32_t)) for GPU
	 *
	 *	@param[in] dp_buffer is destination buffer on the device
	 *	@param[in] n_first_element is offset of memory to be erased, in elements (32-bit integers)
	 *	@param[in] n_size_elements is size of memory to be erased, in elements (32-bit integers)
	 *
	 *	@return Returns OpenCL error codes (CL_SUCCESS on success; CL_INVALID_VALUE
	 *		if the offset or the size do not fit in a signed int).
	 *
	 *	@note The kernel is only enqueued, this does not wait for it to finish.
	 */
	int n_ZeroMemory32_GPU(cl_mem dp_buffer, size_t n_first_element, size_t n_size_elements)
	{
		//_ASSERTE(n_size_elements <= (1 << 22)); // that would possibly cause errors when launching the kernel
		if(n_size_elements > INT_MAX)
			return CL_INVALID_VALUE;
		if(n_first_element > INT_MAX)
			return CL_INVALID_VALUE;
		// the kernel takes both as int

		if(!n_size_elements)
			return CL_SUCCESS;
		// nothing to erase; a zero global work size would be rejected
		// by clEnqueueNDRangeKernel() on OpenCL versions before 2.1

		clSetKernelArgs2(h_ZeroMemory32, dp_buffer, int(n_first_element), int(n_size_elements));
		// NOTE(review): three kernel arguments are passed to clSetKernelArgs2(), while the
		// other clSetKernelArgsN() calls in this file pass exactly N - confirm against ClUtils.h

		size_t n_local_work_size = (n_size_elements > 16384)? 512 :
								   (n_size_elements > 8192)? 256 : 128; // no need to go lower here
		size_t n_global_work_size = n_Align_Up(n_size_elements, n_local_work_size);
		return clEnqueueNDRangeKernel(h_cmd_queue, h_ZeroMemory32, 1,
			NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
	}

	/**
	 *	@brief equivalent of memset(dp_buffer, 0, n_size_elements * sizeof(uint16_t)) for GPU
	 *
	 *	@param[in] dp_buffer is destination buffer on the device
	 *	@param[in] n_first_element is offset of memory to be erased, in elements (16-bit integers)
	 *	@param[in] n_size_elements is size of memory to be erased, in elements (16-bit integers)
	 *
	 *	@return Returns OpenCL error codes (CL_SUCCESS on success; CL_INVALID_VALUE
	 *		if the offset or the size do not fit in a signed int).
	 *
	 *	@note The kernel is only enqueued, this does not wait for it to finish.
	 */
	int n_ZeroMemory16_GPU(cl_mem dp_buffer, size_t n_first_element, size_t n_size_elements)
	{
		//_ASSERTE(n_size_elements <= (1 << 22)); // that would possibly cause errors when launching the kernel
		if(n_size_elements > INT_MAX)
			return CL_INVALID_VALUE;
		if(n_first_element > INT_MAX)
			return CL_INVALID_VALUE;
		// the kernel takes both as int

		if(!n_size_elements)
			return CL_SUCCESS;
		// nothing to erase; a zero global work size would be rejected
		// by clEnqueueNDRangeKernel() on OpenCL versions before 2.1

		clSetKernelArgs2(h_ZeroMemory16, dp_buffer, int(n_first_element), int(n_size_elements));
		// NOTE(review): three kernel arguments are passed to clSetKernelArgs2(), while the
		// other clSetKernelArgsN() calls in this file pass exactly N - confirm against ClUtils.h

		size_t n_local_work_size = (n_size_elements > 16384)? 512 :
								   (n_size_elements > 8192)? 256 : 128; // no need to go lower here
		size_t n_global_work_size = n_Align_Up(n_size_elements, n_local_work_size);
		return clEnqueueNDRangeKernel(h_cmd_queue, h_ZeroMemory16, 1,
			NULL, &n_global_work_size, &n_local_work_size, 0, NULL, NULL);
	}

	bool CompileLink_Kernels()
	{
		std::string s_params;
		if(!stl_ut::Format(s_params, "-D __SUMMATION_STEP_ENABLE_DUFFS_DEVICE__"
		   " -D __FORCE_WORK_BOUNDS_CHECK__ -D __SUMMATION_STEP_%s_WORK_BOUNDS_CHECK__"
		   " -D __%sPOT__ -D JIT_VECTOR_LENGTH=%dU -D JIT_SEED_LENGTH=%dU"
		   " -D JIT_WINDOW_SIZE=%dU -D JIT_MAX_SLICE_LENGTH=%dU  -cl-nv-verbose",
		   (n_vector_length % n_sumstep_local_work_size)? "FORCE" : "LEAVE",
		   b_Is_POT(n_vector_length)? "" : "N", n_vector_length, n_seed_length,
		   n_window_size, n_max_slice_length)) {
			fprintf(stderr, "error: not enough memory\n");
			return false;
		}
		// format just-in-time parameters

		if(CCLProgramCompiler::n_CompileProgramFile(h_context, &h_program,
		   "CLKernel_v3.c", 1, &h_device, s_params.c_str(), "CLKernel_v3.clbin") != CL_SUCCESS) {
			fprintf(stderr, "error: failed to load OpenCL program\n");
			return false;
		}
		// compile OpenCL program

		{
			cl_int n_result;
			h_ZeroMemory16 = clCreateKernel(h_program, "ZeroMemory16", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			h_ZeroMemory32 = clCreateKernel(h_program, "ZeroMemory32", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			h_SummationStep_JIT = clCreateKernel(h_program, "SummationStep_JIT", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems =
				clCreateKernel(h_program, (b_Is_POT(n_vector_length))?
				"BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems" :
				"BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT_v2WorkItems", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			h_BuildTermVectors_Seed_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned =
				clCreateKernel(h_program, (b_Is_POT(n_vector_length))?
				"BuildTermVectors_Seed_POT_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned" :
				"BuildTermVectors_Seed_NPOT_TrueRot_RegOpts_JIT_v2WorkItems_SliceAligned", &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
		}
		// get OpenCL kernels

		return true;
	}

	bool Alloc_DeviceBuffers()
	{
		{
			cl_int n_result;
			dp_vectors = clCreateBuffer(h_context, CL_MEM_READ_WRITE, n_vector_num * n_vector_length * sizeof(TTermScalar), NULL, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			dp_dummy_vector_banks = clCreateBuffer(h_context, CL_MEM_READ_WRITE, n_dummy_vector_bank_num * n_vector_length * sizeof(TTermScalar), NULL, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			dp_seeds = clCreateBuffer(h_context, CL_MEM_READ_ONLY/*CL_MEM_READ_WRITE*/, n_seed_num * n_seed_length * sizeof(CSeedOps::TSeed), NULL, &n_result); // use CL_MEM_READ_WRITE if seeds are written to by the memset16 kernel
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			dp_chunk = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_chunk_size * sizeof(uint32_t), NULL, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}

			const size_t n_work_item_list_max_size = n_chunk_size;//splitter.n_Max_WorkItemList_v2_Size();
			const size_t n_occurence_list_max_size = n_chunk_size;//splitter.n_Max_OccurenceList_v2_Size();
			// get maximal sizes of work buffers

			dp_occurence_list = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_occurence_list_max_size * sizeof(uint32_t), NULL, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
			dp_work_item_list = clCreateBuffer(h_context, CL_MEM_READ_ONLY, n_work_item_list_max_size * 3 * sizeof(uint32_t), NULL, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to link all OpenCL kernels\n");
				return false;
			}
		}
		// alloc buffers on GPU

		return true;
	}

	/**
	 *	@brief copy-constructor; declared in the protected section with no
	 *		definition in this class body, to keep the object from being copied
	 */
	inline CRI_on_GPU(const CRI_on_GPU &UNUSED(r_ri_on_gpu)); // don't copy this object

	/**
	 *	@brief copy operator; protected, copies nothing
	 *	@return Returns reference to this (unchanged).
	 */
	inline CRI_on_GPU &operator =(const CRI_on_GPU &UNUSED(r_ri_on_gpu)) { return *this; } // don't copy this object
};

int main(int n_arg_num, const char **p_arg_list)
{
	const char *p_s_input_index = PATH_PREFIX "gw" DATA_SUBSET "positional-index";
	// input / output paths

#if 0
	size_t n_chunk_size = 4096; // use small chunks for debugging
#else
	size_t n_chunk_size = 1 << 20;
#endif
	uint32_t n_max_pass_size = 12500;
	uint32_t n_max_slice_length = 32;
	uint32_t n_min_primary_pass_size = 2000;
	uint32_t n_min_last_primary_pass_size = 200;
	size_t n_dummy_vector_bank_num = n_Align_Up(n_max_pass_size, uint32_t(1024));
	// splitter parameters

	bool b_lemmatize = true;
	const char *p_s_lemma_file = PATH_PREFIX "lemmatized-utf16.txt";
	size_t n_min_term_frequency = (b_lemmatize)? 0 : 10;
	size_t n_min_lemma_frequency = (b_lemmatize)? 10 : 0; // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
	bool b_remove_less_frequent_terms = true;
	bool b_remove_lemma_less_terms = false;
	bool b_use_dummy_for_lemma_less_terms = false;
	// lemmatization parameters

	bool b_verbose = true; // @todo - add verbosity checks everywhere (they're just on code copied from RI_on_GPU now)
	bool b_high_prio_thread = true;
	bool b_lazy_gpu_deallocation = true;
	size_t n_gpu_memory_megs = 790;
	size_t n_gpu_memory_reserve_megs = 100;
	size_t n_max_gpu_pool_size_megs = 4095;
	size_t n_seed_length = 100;
	size_t n_vector_length = 1024;
	size_t n_half_window_size = 10;
	// other parameters

	if(b_high_prio_thread) {
		CCurrentThreadHandle cth;
		cth.Set_HighPriority();
	}
	// set high priority to beat boinc for resources

	CTimer timer;
	double f_init_start = timer.f_Time();

	CLuceneIndexReaderInterface reader(p_s_input_index);
	if(!reader.b_Status()) {
		fprintf(stderr, "error: failed to open the index\n");
		return -1;
	}
	if(!reader.Get_TermList(b_verbose)) {
		fprintf(stderr, "error: failed to get the term list\n");
		return -1;
	}
	if(b_remove_lemma_less_terms && b_use_dummy_for_lemma_less_terms) {
		fprintf(stderr, "warning: can't remove lemma-less terms and use the dummy term "
			"instead of them at the same time. lemma-less terms will be removed\n");
		b_use_dummy_for_lemma_less_terms = false;
		// those are mutualy exclusive
	}
	if(!reader.Lemmatize((b_lemmatize)? p_s_lemma_file : 0, n_min_term_frequency,
	   n_min_lemma_frequency, b_remove_less_frequent_terms, b_remove_lemma_less_terms,
	   b_use_dummy_for_lemma_less_terms, b_verbose)) {
		fprintf(stderr, "error: failed to apply the lemmatization table\n");
		return -1;
	}
	// initialize the lucene index reader

	// @n_ote the full pass on gw takes 1:30 now. good job!
	// @t_odo wash up the clothes!
	// @t_odo implement lemmatization of the terms in document vectors, try to print some lemmatized text out
	// @n_ote the full pass on gw takes 2:25 now. maybe try to optimize it a little?
	// @n_ote the full pass on gw takes 1:04 on unloaded core2duo. no need to optimize ...
	// @n_ote the full pass on gw takes 1:38 on unloaded opteron (x64) / ?:?? (x64, __USE_MALLOC) / 1:44 (x86, __USE_MALLOC) or 1:38 (x86 w/o __USE_MALLOC). no need to optimize ...
	// @t_odo implement chunker - can recycle the old one w/o much effort
	// @t_odo remove all the v1 stuff from the chunker, just split v2 to bare occurences and passes; implement using std::map, meassure speed ...
	// @t_odo call mr. monitor
	// @t_odo call mr. monitor again
	// @t_odo go buy the fertilizer
	// @t_odo implement cache behavior, test the cache
	// @note the full pass on gw takes 1:55 on unloaded opteron (x64) with all the allocations and whatnots

	/*
	#define DATA_SUBSET "-cna_eng-"
	size_t n_chunk_size = 1 << 16;
	size_t n_max_gpu_pool_size_megs = 50;
	size_t n_vector_length = 1024;
	size_t n_half_window_size = 10;
	*/
	// @n_ote it takes 00:00:30.91 with these settings without the allocator
	// @n_ote it takes 00:22:00.00 with the allocator
	// @t_odo - optimize the allocator
	// @n_ote it takes 00:00:51.43 with the optimized allocator (just on an edge,
	//		but reading documents and allocation policy could be split to multiple cores)
	// @n_ote it takes 00:00:42.83 with the optimized allocator using stdext::hash_multimap
	// @note it takes 00:00:32.76 with even more optimized allocator and utilizing term offset for time dispersion

	double f_init_time = timer.f_Time() - f_init_start;
	printf("\ninitialization took " PRItime "\n\n", PRItimeparams(f_init_time));
	// verbose

	size_t n_gpu_pool_size_bytes, n_gpu_pool_size;
	{
		TGPUMemoryBilance gpu_mem(reader.r_TermList().size(), n_seed_length, n_vector_length,
			n_chunk_size, n_dummy_vector_bank_num, n_gpu_memory_megs,
			n_gpu_memory_reserve_megs, n_max_gpu_pool_size_megs);
		if(!gpu_mem.b_Status()) {
			fprintf(stderr, "error: not enough memory on GPU (more than "
				PRIsizeB "B required, " PRIsizeB "B available)\n",
				PRIsizeBparams(gpu_mem.n_gpu_buffers_size), PRIsizeBparams(gpu_mem.n_free_gpu_space));
			return -1;
		}
		if(gpu_mem.n_gpu_pool_size <= 1) {
			// this is extreme case; there needs to be one slot to hold the dummy vector
			// and then at least a few slots to hold term vectors in the chunk (few = 1 here)
			fprintf(stderr, "error: gpu space for term vectors is insufficient\n");
			return -1;
		}
		gpu_mem.Dump();
		n_gpu_pool_size_bytes = gpu_mem.n_gpu_pool_size_bytes;
		n_gpu_pool_size = gpu_mem.n_gpu_pool_size;
	}
	// memory calculations (seed)

	//	=== producer-side objects ===

	CLuceneDocStorage doc_storage(reader);
	// initialize document storage

	CChunkProducer chunk_producer(n_gpu_pool_size, reader.r_TermList(),
		doc_storage, /*CLuceneIndexReaderInterface::*/term_Dummy, n_chunk_size, n_half_window_size,
		n_max_slice_length, n_max_pass_size, n_min_primary_pass_size, n_min_last_primary_pass_size,
		n_dummy_vector_bank_num, b_lazy_gpu_deallocation);
	if(!chunk_producer.b_Status()) {
		fprintf(stderr, "error: chunk producer failed to initialize\n");
		return -1;
	}
	// initialize chunk producer

	//	=== ~producer-side objects ===

	//	=== consumer-side objects ===

	CPageAllocator<int32_t> host_allocator(n_vector_length, 4096);
	// initialize term vector storage

	//clZeroMemory(...) // @todo
	// initialize the term vectors on the GPU

	size_t n_ftsu_saved_clear_num = 0;
	CBitArray first_time_slot_usage(n_gpu_pool_size);
	if(first_time_slot_usage.n_Size() != n_gpu_pool_size) {
		fprintf(stderr, "error: not enough memory for the gpu slot first time usage table\n");
		return -1;
	}
	first_time_slot_usage = true; // none of the slots has been used yet
	// track first-time usage of the slots

	std::vector<TTermVectorAlloc> term_alloc_table; // tvat
	if(!stl_ut::Resize_To_N(term_alloc_table, reader.r_TermList().size())) {
		fprintf(stderr, "error: not enough memory for the term vector allocation table (2)\n");
		return -1;
	}
	// build the term vector allocation table (for the gpu-side of the algorithm) // @todo - update this based on allocations

	cl_context h_context;
	cl_device_id h_device;
	cl_command_queue h_cmd_queue;
	{
		if(b_verbose)
			printf("initializing OpenCL ...\n");

		if(CCLUtils::n_OpenCL_Init(&h_context) != CL_SUCCESS) {
			fprintf(stderr, "error: failed to initialize OpenCL\n");
			return -1;
		}
		// init OpenCL

		if(CCLUtils::n_Get_MaxGFlops_DeviceId(&h_device, h_context)) {
			fprintf(stderr, "error: failed to select OpenCL device\n");
			return -1;
		}
		// get best OpenCL device

		{
			cl_int n_result;
			h_cmd_queue = clCreateCommandQueue(h_context, h_device,
				CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &n_result);
			if(n_result != CL_SUCCESS) {
				fprintf(stderr, "error: failed to create OpenCL command queue\n");
				return -1;
			}
		}
		// create command queue
	}
	// initialize OpenCL

	{
		_ASSERTE(reader.r_TermList().size() < SIZE_MAX); // so adding 1 wouldn't overflow
		size_t n_term_num = reader.r_TermList().size();
		size_t n_seed_num = n_term_num + 1;
		size_t n_dummy_term_id = n_term_num;

		if(b_verbose)
			printf("compiling programs and allocating buffers ...\n");

		CRI_on_GPU ri_on_gpu(h_context, h_device, h_cmd_queue, n_vector_length, n_seed_length,
			n_half_window_size, /*n_occurence_slice,*/ n_gpu_pool_size, n_seed_num,
			n_chunk_size, n_max_pass_size, n_dummy_vector_bank_num, n_max_slice_length,
			n_min_primary_pass_size, n_min_last_primary_pass_size);
		if(!ri_on_gpu.b_Status()) {
			fprintf(stderr, "error: failed to initialize the RI on GPU object\n");
			return -1;
		}
		// initializes RI on GPU (compiles kernels, allocates buffers)

		if(b_verbose)
			printf("clearing term vector pool ...\n");

		if(!ri_on_gpu.Clear_Vectors(0, n_gpu_pool_size)) {
			fprintf(stderr, "error: OpenCL operation in file \'%s\', on line %d failed\n", __FILE__, __LINE__-1);
			return -1;
		}
		// clear all the term vectors

		if(b_verbose)
			printf("generating random seeds ...\n");

		{
			CRI_on_GPU::CSeedOps::TSeed *p_seeds;
			if(!(p_seeds = new(std::nothrow) CRI_on_GPU::CSeedOps::TSeed[n_seed_num * n_seed_length])) {
				fprintf(stderr, "error: not enough memory for the seed vectors\n");
				return -1;
			}
			// alloc seeds

			CMerseneTwister twister;
			twister.init_genrand(123456);
			// initialize MT

			for(size_t i = 0, n = n_term_num; i < n; ++ i) {
				CRI_on_GPU::CSeedOps::TSeed *p_seed = p_seeds + i * n_seed_length;
				for(size_t j = 0; j < n_seed_length; ++ j)
					p_seed[j] = CRI_on_GPU::CSeedOps::n_RandomSeed(n_vector_length, twister);
				// generate some random seeds

				std::sort(p_seed, p_seed + n_seed_length, CRI_on_GPU::CSeedOps::b_Seed_Index_Smaller);
				// make sure they're sorted to ensure optimal access to memory (we're going to use them many times)
			}
			// generate random seeds

			{
				CRI_on_GPU::CSeedOps::TSeed *p_dummy_seed = p_seeds + n_dummy_term_id * n_seed_length;
				for(size_t i = 0; i < n_seed_length; ++ i)
					p_dummy_seed[i] = CRI_on_GPU::CSeedOps::n_DummySeed();
			}
			// generate dummy seed

			if(!ri_on_gpu.Upload_Seeds(p_seeds, 0, n_seed_num)) {
				fprintf(stderr, "error: OpenCL operation in file \'%s\', on line %d failed\n", __FILE__, __LINE__-1);
				return -1;
			}
			// upload the seeds, including the dummy

			delete[] p_seeds;
			// don't need this anymore
		}
		// generate and upload seed vectors

		if(b_verbose)
			printf("running ...\n");

	//	=== ~consumer-side objects ===

		size_t n_second_upload_num = 0;
		double f_gpu_wait = 0, f_cpu_wait = 0;
		double f_prev_chunk_time = timer.f_Time();
		double f_chunking_start_time = f_prev_chunk_time;
		size_t n_pool_fill = 0;
		double f_next_verbose_time = 1;
		// verbose / stats

		size_t n_peak_host_memory_usage = 0;
		for(size_t n_chunk = 0;; ++ n_chunk) {
			CChunkProducer::TChunkData t_chunk_data;
			bool b_have_chunk;
			if(!chunk_producer(b_have_chunk, t_chunk_data)) {
				fprintf(stderr, "error: chunk producer failed\n");
				return -1;
			}
			if(!b_have_chunk)
				break; // that was the last one
			// fill the TChunkData structure

			for(size_t i = 0, n = t_chunk_data.vacation_list.size(); i < n; ++ i) {
				const CGPUTermVectorAllocator::TTermVacation &r_t_vac = t_chunk_data.vacation_list[i];
				TTermVectorAlloc &r_t_host_alloc = term_alloc_table[r_t_vac.n_term_id];

				if(!r_t_host_alloc.p_host_vector && !host_allocator.Get_Vector(r_t_host_alloc.p_host_vector)) {
					fprintf(stderr, "error: host allocator failed (not enough memory)\n");
					return -1;
				}
				if(n_peak_host_memory_usage < host_allocator.n_Allocated_Vector_Num())
					n_peak_host_memory_usage = host_allocator.n_Allocated_Vector_Num();
				// allocate memory for the term vector on host

				_ASSERTE(r_t_host_alloc.n_gpu_slot == r_t_vac.n_GPU_slot);
				// make sure the slot is right (it's kind of duplicate information)

				if(!ri_on_gpu.Download_Vectors(r_t_host_alloc.p_host_vector, r_t_vac.n_GPU_slot, 1)) {
					fprintf(stderr, "error: OpenCL operation in file \'%s\', on line %d failed\n", __FILE__, __LINE__-1);
					return -1;
				}
				// read data from the GPU

#ifdef _DEBUG
				r_t_host_alloc.n_gpu_slot = -1;
				// it's not on the GPU anymore, the slot is probably allocated to some other vector already
#endif //_DEBUG

				_ASSERTE(!r_t_host_alloc.b_null);
				// make sure it's no longer null

				if(r_t_vac.b_scrub) {
					//luceneWriter.saveTermVector(r_t_host_alloc.p_host_vector, r_t_vac.n_term_id);
					// flush the vector

					host_allocator.Free_Vector(r_t_host_alloc.p_host_vector);
					r_t_host_alloc.p_host_vector = 0; // !!
					// free the term vector memory
				}
				// this vector is not going to be used again, just save it and free the memory
			}
			// read vectors being vacated from the GPU

			for(size_t i = 0, n = t_chunk_data.allocation_list.size(); i < n; ++ i) {
				const CGPUTermVectorAllocator::TTermAllocation &r_t_alloc = t_chunk_data.allocation_list[i];
				TTermVectorAlloc &r_t_host_alloc = term_alloc_table[r_t_alloc.n_term_id];

				if(r_t_host_alloc.b_null) {
					_ASSERTE(!r_t_host_alloc.p_host_vector); // shouldn't be allocated
					r_t_host_alloc.b_null = false;
					// won't be null after this pass

					if(!first_time_slot_usage[r_t_alloc.n_GPU_slot]) {
						if(!ri_on_gpu.Clear_Vectors(r_t_alloc.n_GPU_slot, 1)) {
							fprintf(stderr, "error: OpenCL operation in file \'%s\', on line %d failed\n", __FILE__, __LINE__-1);
							return -1;
						}
					} else {
						++ n_ftsu_saved_clear_num;
						// keep number of saved clears (should equal number of slots on sufficiently large datasets)
					}

					// @t_odo - optimize this by using bit array of zero gpu slots (all of them are cleared on startup)
				} else {
					_ASSERTE(r_t_host_alloc.p_host_vector); // should be allocated

					if(!ri_on_gpu.Upload_Vectors(r_t_host_alloc.p_host_vector, r_t_alloc.n_GPU_slot, 1)) {
						fprintf(stderr, "error: OpenCL operation in file \'%s\', on line %d failed\n", __FILE__, __LINE__-1);
						return -1;
					}

					++ n_second_upload_num;
					// this counts as a "second upload"

					host_allocator.Free_Vector(r_t_host_alloc.p_host_vector);
					r_t_host_alloc.p_host_vector = 0; // !!
					// free the term vector memory, it's now on GPU
				}
#ifdef _DEBUG
				r_t_host_alloc.n_gpu_slot = r_t_alloc.n_GPU_slot;
#endif //_DEBUG
				// allocate the vector

				first_time_slot_usage[r_t_alloc.n_GPU_slot] = false;
				// slot is being used right now
			}
			// write the vectors being allocated to the GPU

			//Generate_TermVectors_Seed(...) //@todo
			// call the GPU functions

			size_t n_host_mem_uasge = host_allocator.n_Allocated_Vector_Num();

			for(size_t i = 0, n = t_chunk_data.post_vacation_list.size(); i < n; ++ i) {
				const CGPUTermVectorAllocator::TTermVacation &r_t_vac = t_chunk_data.post_vacation_list[i];
				TTermVectorAlloc &r_t_host_alloc = term_alloc_table[r_t_vac.n_term_id];

				if(!r_t_host_alloc.p_host_vector && !host_allocator.Get_Vector(r_t_host_alloc.p_host_vector)) {
					fprintf(stderr, "error: host allocator failed (not enough memory)\n");
					return -1;
				}
				if(n_peak_host_memory_usage < host_allocator.n_Allocated_Vector_Num())
					n_peak_host_memory_usage = host_allocator.n_Allocated_Vector_Num();
				// allocate memory for the term vector on host

				_ASSERTE(r_t_host_alloc.n_gpu_slot == r_t_vac.n_GPU_slot);
				// make sure the slot is right (it's kind of duplicate information)

				if(!ri_on_gpu.Download_Vectors(r_t_host_alloc.p_host_vector, r_t_vac.n_GPU_slot, 1)) {
					fprintf(stderr, "error: OpenCL operation in file \'%s\', on line %d failed\n", __FILE__, __LINE__-1);
					return -1;
				}
				// read data from the GPU

#ifdef _DEBUG
				r_t_host_alloc.n_gpu_slot = -1;
				// it's not on the GPU anymore, the slot is probably allocated to some other vector already
#endif //_DEBUG

				_ASSERTE(!r_t_host_alloc.b_null);
				// make sure it's no longer null

				if(r_t_vac.b_scrub) {
					//luceneWriter.saveTermVector(r_t_host_alloc.p_host_vector, r_t_vac.n_term_id);
					// flush the term vector

					host_allocator.Free_Vector(r_t_host_alloc.p_host_vector);
					r_t_host_alloc.p_host_vector = 0; // !!
					// free the term vector memory
				}
				// this vector is not going to be used again, just save it and free the memory
			}
			// read vectors being vacated from the GPU (the same loop as above (@todo - make it a function))

			// @t_odo - make the above code the chunk producer class, call it instead
			// @t_odo - simulate allocation / deallocation in gpu and on cpu
			// @t_odo - test peak data usage, devise how is this going to be debugged
			// @todo - connect it with the OpenCL code

			// term vectors can be deallocated once their frequency matches the frequency in the list
			// it's harder to do for the seed vectors as they can repeat across multiple chunk borders,
			// but simple delayed deallocation should be doable.
			// in the seed training version of the algorithm, seeds for the 2700845 terms take
			// up 2700845 * sizeof(short) * 10 = 51.5 MB for 10 long seeds; we will now assume
			// the seeds would always fit in the memory.

			// @t_odo - need to permutate work-item terms based on vector bank allocation

			{
				const size_t n_chunker_queue_size = 4;
				double f_time =	timer.f_Time();
				double f_chunk_prepare_time = f_time - f_prev_chunk_time;
				f_prev_chunk_time = f_time;
				double f_chunk_gpu_time = (5.0 * n_chunk_size / 1048576.0); // !!!!!!!!!!!!!!!!!!!!!!!!
				double f_gpu_time = f_chunk_gpu_time * (n_chunk + 1) + f_gpu_wait;
				double f_cpu_time = f_time - f_chunking_start_time + f_cpu_wait;
				double f_stall = f_cpu_time - f_gpu_time;

				if(f_stall > 0) {
					f_gpu_wait += f_stall;
					f_stall = 0;
				}
				f_stall += f_gpu_wait;
				// chunker is behind; the gpu must wait

				if(f_stall < -f_chunk_gpu_time * n_chunker_queue_size) {
					double f_over_lead = -f_stall - f_chunk_gpu_time * n_chunker_queue_size;
					// how much time does cpu have to wait until more chunks can be produced (the queue is full)

					f_cpu_wait += f_over_lead;
					f_stall += f_over_lead;
				}
				// chunker is ahead and the queue is full; cpu must wait

				n_pool_fill -= t_chunk_data.vacation_list.size();
				size_t n_final_pre = 0;
				for(size_t i = 0, n = t_chunk_data.vacation_list.size(); i < n; ++ i) {
					if(t_chunk_data.vacation_list[i].b_scrub)
						++ n_final_pre;
				}
				n_pool_fill += t_chunk_data.allocation_list.size();
				n_pool_fill -= t_chunk_data.post_vacation_list.size(); // this happens after processing the chunk
				printf("chunk %d: %d terms, %d down (%d final), %d up, %d post, %d in pool, "
					"%d on host, time %.2f%s, lag %.2f%s\n", n_chunk, chunk_producer.n_Term_Usage(),
					t_chunk_data.vacation_list.size(), n_final_pre, t_chunk_data.allocation_list.size(),
					t_chunk_data.post_vacation_list.size(), n_pool_fill, n_host_mem_uasge, f_chunk_prepare_time,
					(f_chunk_prepare_time >= f_chunk_gpu_time)? " (!!!)" : "", f_stall, (f_stall > 0)? " (!!! STALL !!!)" : "");
			}
			size_t i = doc_storage.n_Current_Doc();
			if(timer.f_Time() >= f_next_verbose_time) {
				f_next_verbose_time = timer.f_Time() + 1;
				const ::wstring &document_file = doc_storage.r_s_Current_DocFile();
				wprintf(L"processing document \'%s\'\n", (wcsrchr(document_file.c_str(), '\\'))?
					wcsrchr(document_file.c_str(), '\\') + 1 : document_file.c_str());
			}
			// verbose
		}
		// process the documents in chunks

		printf("\ndone (it took " PRItime ")\n\n", PRItimeparams(timer.f_Time()));

		size_t n_peak_term_usage = chunk_producer.n_Peak_Term_Usage();
		printf("peak gpu term pool usage: %d; it should be sufficient to use " PRIsizeB "B for the vector pool\n",
			n_peak_term_usage, PRIsizeBparams(n_Align_Up(n_peak_term_usage *
			sizeof(int32_t) * n_vector_length, size_t(1048576))));
		printf("peak host memory usage: %d (" PRIsizeB "B)\n", n_peak_host_memory_usage,
			PRIsizeBparams(n_peak_host_memory_usage * sizeof(int32_t) * n_vector_length));
		printf("ftsu saved %d vector transfers (" PRIsizeB "B)\n", n_ftsu_saved_clear_num,
			PRIsizeBparams(n_ftsu_saved_clear_num * sizeof(int32_t) * n_vector_length));
		printf("there was %d second uploads (" PRIsizeB "B)\n", n_second_upload_num,
			PRIsizeBparams(n_second_upload_num * sizeof(int32_t) * n_vector_length));
		printf("cpu waited " PRItime "\n", PRItimeparams(f_cpu_wait));
		printf("gpu waited " PRItime "\n", PRItimeparams(f_gpu_wait));

		printf("histogram of numbers of removed low-frequency terms per document:\n");
		reader.Dump_TermRemovalHistogram();
		// debug
	}

	clReleaseCommandQueue(h_cmd_queue);
	clReleaseContext(h_context);
	// shutdown OpenCL

	return 0;
}

/*
getting terms ...
done. there is 2700845 terms (10.30 GB assuming 1024 long float vectors)
sorting the terms by frequency ...
the first three term frequencies are 19984689 ('said'), 13665351 ('he'), 10664002 ('from')
building global terms index map ...
filtering terms ...
purging terms ...
remapping terms ...
done. lemma miss rate: 0, removed terms: 2466978, shifted terms: 233867, remaining terms: 233867
the first three term frequencies are 19984689 ('said'), 13665351 ('he'), 10664002 ('from')

initialization took 00:02:53.68

there's 233867 terms with frequency above 49 (913.54 MB assuming 1024 long float vectors)
there's 233867 terms with frequency above 9 (913.54 MB assuming 1024 long float vectors)
there's 233867 terms with frequency above 1 (913.54 MB assuming 1024 long float vectors)
processing document 'xin_eng_200412.gz#XIN_ENG_20041226.0192'

done (it took 01:44:09.59)

histogram of numbers of removed low-frequency terms per document:
    0 - 1    : 2711814: **************************************************
    1 - 3    : 1670421: *******************************
    3 - 7    : 896718: *****************
    7 - 15   : 313156: ******
   15 - 31   : 76507: **
   31 - 63   : 21258: *
   63 - 127  :  4321: *
  127 - 255  :  2517: *
  255 - 511  :   485: *
  511 - 1023 :    16: *
 1023 - 2047 :     0:
 2047 - 4095 :     0:
 4095 - 8191 :     0:
 8191 - 16383:     0:
16383 - 32767:     0:
32767 - 65535:     0:
  off-scale  :     0:
Press any key to continue . . .
*/
