/*
								+---------------------------------+
								|                                 |
								|   ***  Document splitter  ***   |
								|                                 |
								|  Copyright   -tHE SWINe- 2010  |
								|                                 |
								|          DocSplitter.h          |
								|                                 |
								+---------------------------------+
*/

/**
 *	@file DocSplitter.h
 *	@author -tHE SWINe-
 *	@brief Document splitter
 *	@date 2010-08-06
 */

#ifndef __DOCUMENT_SPLITTER_INCLUDED
#define __DOCUMENT_SPLITTER_INCLUDED

#include "DocStorage.h"
#include "../../UberLame_src/Integer.h"

/**
 *	@def DOC_SPLITTER_TRACK_CHUNK_DOCS
 *	@brief if defined, CDocumentSplitter tracks first and last document present in current chunk
 */
//#define DOC_SPLITTER_TRACK_CHUNK_DOCS

/**
 *	@def DOC_SPLITTER_BUILD_CHUNK_TERM_LIST
 *	@brief if defined, CDocumentSplitter builds list of terms, present in current chunk
 *	@note DOC_SPLITTER_TRACK_CHUNK_DOCS is required
 */
//#define DOC_SPLITTER_BUILD_CHUNK_TERM_LIST

/**
 *	@def DOC_SPLITTER_USE_TERM_OCCURENCE_PERMUTATION_TABLE
 *	@brief if defined, CDocumentSplitter employs permutation table to avoid sorting large term occurence table directly
 *	@note DOC_SPLITTER_BUILD_CHUNK_TERM_LIST overrides this
 */
#define DOC_SPLITTER_USE_TERM_OCCURENCE_PERMUTATION_TABLE

/**
 *	@def DOC_SPLITTER_ALIGNED_SLICES_IN_FIRST_PASS
 *	@brief if defined, terms with frequency equal to whole multiple of slice length are processed in the first pass, potentially increasing slice length variance in that pass and thus lowering computation efficiency
 *	@note This is mostly insignificant, there are almost no such terms.
 *	@note In case DOC_SPLITTER_SLICE_REMAINDERS_TO_THE_FIRST_PASS is not defined, this only applies to terms with frequency equal to slice length (i.e. not its multiples).
 */
//#define DOC_SPLITTER_ALIGNED_SLICES_IN_FIRST_PASS

/**
 *	@def DOC_SPLITTER_SLICE_REMAINDERS_TO_THE_FIRST_PASS
 *	@brief if defined, slices with size less than maximal slice length are placed in the first pass (otherwise they would be in all the passes), so all the next chunks have slices with exactly the same size
 *	@note Order of processing term occurences is not changed, ie. last slices aren't moved to the first pass, instead first slices are made shorter as required. DOC_SPLITTER_ALIGNED_SLICES_IN_FIRST_PASS affects some of those slices.
 */
#define DOC_SPLITTER_SLICE_REMAINDERS_TO_THE_FIRST_PASS

/**
 *	@brief splits documents into chunks
 *
 *	Chunks contain indices of terms in the order they appear in consecutive documents.
 *	Individual documents are separated by a run of occurences of the dummy term to prevent
 *	the last term of one document from having an effect on the first term of the next document.
 *	In order to keep the processing code simple, each chunk has enough (half-window size)
 *	leading and trailing padding (dummy terms) so the code reading terms under the
 *	window doesn't need to check for array boundaries. Consequently, in case all the documents
 *	do not fit in a single chunk, the end of one chunk is repeated at the beginning of the next chunk.
 *	This repeated part is called the restart interval and it is two half-window sizes long.
 *
 *	Naive processing code then looks like this:
 *
 *@code
 *	std::vector <TDocument> docs = ...;
 *	size_t dummy_term_id = ...; // eg. terms.size()
 *	size_t chunk_size = ...; // eg. cl_max_allocation_size / sizeof(float)
 *	size_t halfwindow_size = 10;
 *
 *	CDocumentSplitter splitter(docs, dummy_term_id, chunk_size, halfwindow_size);
 *
 *	splitter.Prepare_FirstChunk(); // todo - mind some error-checking
 *	do {
 *		const std::vector<CDocumentSplitter::size_t> &chunk = splitter.Get_Chunk();
 *		for(size_t i = halfwindow_size; i < chunk.size() - halfwindow_size; ++ i) {
 *			size_t focused_term_id = chunk[i];
 *			for(size_t wnd = i - halfwindow_size; wnd <= i + halfwindow_size; ++ wnd) {
 *				if(wnd == i)
 *					continue;
 *				size_t co_occuring_term_id = chunk[wnd];
 *
 *				// todo - update term vectors for focused_term_id, based on co_occuring_term_id
 *			}
 *		}
 *	} while(splitter.Prepare_NextChunk());@endcode
 *
 *	Note the effective window size is twice half-window size + 1.
 *
 *	@note There are functions to build lists of work-items, which makes more sophisticated
 *		parallel processing viable. Specifically Build_TermOccurenceTable() and Build_TermOccurenceTable_v2(),
 *		which are most effective for term vector element-parallel (long seed vectors, eg. retraining) and
 *		term-parallel (sparse seed vectors) processing, respectively.
 *
 */
class CDocumentSplitter {
public:
	/**
	 *	@brief index / size type used by all the splitter interfaces and tables
	 *
	 *	It is a 32-bit unsigned integer if __x64__ is defined, otherwise it is the global ::size_t.
	 *
	 *	@note This typedef shadows ::size_t inside the class scope; code outside of the class
	 *		needs to spell CDocumentSplitter::size_t to get the same type (the two differ if __x64__ is defined).
	 */
#ifdef __x64__
	typedef uint32_t size_t;
#else //__x64__
	typedef ::size_t size_t;
#endif //__x64__

	/**
	 *	@brief simple comparator for std::lower_bound()
	 *
	 *	This was used in early task slicing experiments (before Build_TermOccurenceTable_v2())
	 *	to determine number of terms in offset list, which would contribute to slice with given starting offset.
	 *	Terms offset list points to are sorted by their decreasing frequency hence binary search may be used.
	 */
	class CSliceLengthBelow {
	protected:
#ifdef _DEBUG
		size_t m_n_ref; /**< @brief reference value slice length is compared to (only present in debug builds) */
#endif //_DEBUG
		const std::vector<size_t> &m_r_occurence_list; /**< @brief const reference to occurence list, which contains information about symbol frequencies (slice lengths) */

	public:
		/**
		 *	@brief constructor
		 *
		 *	@param[in] n_ref is reference value slice length is compared to
		 *	@param[in] r_occurence_list is const reference to occurence list, which contains information about symbol frequencies (slice lengths)
		 *
		 *	@note Only a reference to r_occurence_list is stored (see m_r_occurence_list),
		 *		the list therefore must outlive this object.
		 */
		CSliceLengthBelow(size_t n_ref, const std::vector<size_t> &r_occurence_list);

		/**
		 *	@brief comparison operator
		 *
		 *	@param[in] n_ref is first of compared values, it is reference value slice length is compared to, and must be equal to the one, supplied to constructor (debug error-checking)
		 *	@param[in] n_offset is element of the offset list, pointing to list of occurences for a particular term
		 *
		 *	@return Returns true if frequency of term given by n_offset is smaller than reference value.
		 */
		bool operator ()(size_t n_ref, size_t n_offset) const;
	};

	/**
	 *	@brief generic range of generic data
	 *
	 *	A Build_TermOccurenceTable_v2() structure.
	 */
	struct TRange {
		size_t n_offset; /**< @brief zero-based index of first data-item */
		size_t n_length; /**< @brief number of data-items */

		/**
		 *	@brief default constructor; constructs empty range
		 */
		TRange();

		/**
		 *	@brief constructor
		 *
		 *	@param[in] _n_offset is zero-based index of first data-item
		 *	@param[in] _n_length is number of data-items
		 */
		TRange(size_t _n_offset, size_t _n_length);

		/**
		 *	@brief less-than operator for sorting ranges by their length
		 *
		 *	@param[in] r_t_other is other range, this range is compared to
		 *
		 *	@return Returns true if this range is shorter than r_t_other.
		 */
		bool operator <(const TRange &r_t_other) const;

		/**
		 *	@brief greater-than operator for sorting ranges by their length
		 *
		 *	@param[in] r_t_other is other range, this range is compared to
		 *
		 *	@return Returns true if this range is longer than r_t_other.
		 */
		bool operator >(const TRange &r_t_other) const;
	};

	/**
	 *	@brief generic range of term-assigned data
	 *
	 *	A Build_TermOccurenceTable_v2() structure.
	 */
	struct TTermRange : public TRange {
		size_t n_term_id; /**< @brief term this range is assigned to */

		/**
		 *	@brief default constructor; constructs empty range
		 */
		TTermRange();

		/**
		 *	@brief constructor
		 *
		 *	@param[in] _n_offset is zero-based index of first data-item
		 *	@param[in] _n_length is number of data-items
		 *	@param[in] _n_term_id is term this range is assigned to
		 */
		TTermRange(size_t _n_offset, size_t _n_length, size_t _n_term_id);
	};

	/**
	 *	@brief build term vectors algorithm work-item (range of occurences of specific term)
	 *
	 *	A Build_TermOccurenceTable_v2() structure.
	 */
	typedef TTermRange TWorkItem;

	/**
	 *	@brief description of one pass of build term vectors algorithm
	 *
	 *	The inherited TRange part selects the work-items of this pass
	 *	(offset and count in CDocumentSplitter::m_work_item_list_v2).
	 *
	 *	A Build_TermOccurenceTable_v2() structure.
	 */
	struct TPass : public TRange {
		/**
		 *	@brief summation step range (points to dummy vector banks)
		 *
		 *	A Build_TermOccurenceTable_v2() structure.
		 */
		typedef TTermRange TSummationStep;

		bool b_primary; /**< @brief primary task passes work on real vectors; secondary task passes subdivide few long vectors and work on dummy places which are later summed and added to real vectors */
#ifdef DOC_SPLITTER_SLICE_REMAINDERS_TO_THE_FIRST_PASS
		bool b_slice_aligned; /**< @brief if set, all work-items in this pass are n_max_slice_length long (this member only exists if DOC_SPLITTER_SLICE_REMAINDERS_TO_THE_FIRST_PASS is defined) */
#endif //DOC_SPLITTER_SLICE_REMAINDERS_TO_THE_FIRST_PASS
		std::vector<TSummationStep> summation_list; /**< @brief list of summations performed by secondary task passes (those are data-parallel and are executed one by one, do not need to reside in an array to be run in a single OpenCL batch) */

		/**
		 *	@brief default constructor; constructs empty pass
		 */
		TPass();

		/**
		 *	@brief constructor
		 *
		 *	@param[in] _n_offset is zero-based index of first work-item (points to CDocumentSplitter::m_work_item_list_v2)
		 *	@param[in] _n_length is number of work-items
		 *	@param[in] _b_primary is primary pass flag
		 */
		TPass(size_t _n_offset, size_t _n_length, bool _b_primary = true);
	};

protected:
	const std::vector <TDocument> &m_r_document_list; /**< @brief reference to the list of documents (not owned, the list must outlive the splitter) */
	size_t m_n_max_chunk_size; /**< @brief target chunk size */
	size_t m_n_window_size; /**< @brief half-size of window in the semantic term vector calculation algorithm */
	size_t m_n_dummy_term; /**< @brief index of dummy term vector */

	uint64_t m_n_length_of_all_docs; /**< @brief sum of lengths of all documents */
	uint64_t m_n_length_of_concat; /**< @brief length of all documents concatenated, including padding */
	size_t m_n_chunk_num; /**< @brief number of chunks */

	size_t m_n_current_chunk; /**< @brief zero-based index of current chunk */
	size_t m_n_current_doc; /**< @brief zero-based index of current document */
	size_t m_n_current_doc_off; /**< @brief offset into current document, and its padding */
	std::vector<size_t> m_chunk; /**< @brief buffer with current chunk */
#ifdef DOC_SPLITTER_TRACK_CHUNK_DOCS
	size_t m_n_next_chunk_first_doc; /**< @brief zero-based index of first document in the next chunk (or first document in restart interval of current chunk) */
	size_t m_n_first_chunk_doc; /**< @brief zero-based index of first document in current chunk */
	size_t m_n_last_chunk_doc; /**< @brief zero-based index of last document in current chunk + 1 */
#endif //DOC_SPLITTER_TRACK_CHUNK_DOCS

	typedef std::pair<size_t, std::vector<size_t> > TTermOccurence; /**< @brief term occurence (term and list of its positions in the chunk) */

#ifdef DOC_SPLITTER_BUILD_CHUNK_TERM_LIST
	std::vector<size_t> m_chunk_term_list; /**< @brief list of unique terms in the current chunk */
#endif //DOC_SPLITTER_BUILD_CHUNK_TERM_LIST
	std::vector<TTermOccurence> m_term_occurence_table; /**< @brief table of term occurences */
	std::vector<size_t> m_offset_list; /**< @brief list of offsets to term occurence list */
	std::vector<size_t> m_occurence_list; /**< @brief term occurence list; at each beginning offset (given by m_offset_list) contains term id, number of occurences, and list of occurences */

	/**
	 *	@brief small function object, used for building term occurence permutation table
	 *
	 *	The object is stateful (it counts the calls), so the generated indices
	 *	are only meaningful if a single instance sees the entries in table order.
	 */
	class CGetIndexFrequencyPair {
	protected:
		size_t m_n_index; /**< @brief term occurence index counter */

	public:
		/**
		 *	@brief default constructor; starts counting indices from zero
		 */
		inline CGetIndexFrequencyPair()
			:m_n_index(0)
		{}

		/**
		 *	@brief conversion of TTermOccurence to pair, containing term occurence index and term frequency
		 *
		 *	@param[in] term_occurence is term occurence table entry; those are supposed
		 *		to be supplied in order they appear in the table (eg. via std::for_each)
		 *		in order for indices to be generated correctly
		 *
		 *	@return Returns std::pair of term occurence index (first term has index 0,
		 *		each successive term has index one larger) and term frequency
		 *		(the number of positions stored in the entry, not the term id).
		 */
		inline std::pair<size_t, size_t> operator ()(const TTermOccurence &term_occurence)
		{
			return std::make_pair(m_n_index ++, term_occurence.second.size());
		}
	};

	std::vector<TPass> m_pass_list; /**< @brief list of build term vector algorithm passes; contains up to chunk size / n_max_pass_size task passes */
	std::vector<TWorkItem> m_work_item_list_v2; /**< @brief list of build term vector algorithm work-items; contains up to chunk size work-items */
	std::vector<size_t> m_occurence_list_v2; /**< @brief list of raw term occurences in the current chunk; contains up to chunk size items */

public:
	/**
	 *	@brief constructor
	 *
	 *	@param[in] r_document_list is list of documents (a reference to it is kept
	 *		in m_r_document_list, the list must outlive the splitter)
	 *	@param[in] n_dummy_term is index of dummy term vector (vector, containing nulls, used to separate documents)
	 *	@param[in] n_max_chunk_size is maximal chunk size (in term indices)
	 *	@param[in] n_halfwindow_size is half-size of window in the semantic term vector calculation algorithm
	 */
	CDocumentSplitter(const std::vector <TDocument> &r_document_list,
		size_t n_dummy_term, size_t n_max_chunk_size, size_t n_halfwindow_size);

	/**
	 *	@brief gets document padding overhead
	 *	@return Returns fraction of space taken up by padding to length of all documents.
	 */
	double f_Padding_Overhead() const;

	/**
	 *	@brief gets chunk restart overhead
	 *	@return Returns fraction of space taken up by repeating end of previous chunk
	 *		at the beginning of the next chunk to length of all documents.
	 */
	double f_ChunkRestart_Overhead() const;

	/**
	 *	@brief gets number of chunks
	 *	@return Returns number of chunks.
	 */
	inline size_t n_Chunk_Num() const
	{
		return m_n_chunk_num;
	}

#ifdef DOC_SPLITTER_TRACK_CHUNK_DOCS
	/**
	 *	@brief gets index of the first document in current chunk
	 *	@return Returns index of the first document in current chunk.
	 *	@note Range of documents in current chunk is given by half-open interval
	 *		[n_First_ChunkDocument(), n_Last_ChunkDocument()).
	 */
	inline size_t n_First_ChunkDocument() const
	{
		return m_n_first_chunk_doc;
	}

	/**
	 *	@brief gets index of the last document in current chunk + 1
	 *	@return Returns index of the last document in current chunk + 1.
	 *	@note Range of documents in current chunk is given by half-open interval
	 *		[n_First_ChunkDocument(), n_Last_ChunkDocument()).
	 */
	inline size_t n_Last_ChunkDocument() const
	{
		return m_n_last_chunk_doc;
	}
#endif //DOC_SPLITTER_TRACK_CHUNK_DOCS

	/**
	 *	@brief prepares the first chunk
	 *
	 *	@return Returns true on success, false on failure (not enough memory).
	 *
	 *	@note Current chunk may be accessed using Get_Chunk().
	 */
	bool Prepare_FirstChunk();

	/**
	 *	@brief prepares the next chunk
	 *
	 *	@return Returns true on success, or false if there are no more chunks.
	 *
	 *	@note This function shouldn't be called before calling Prepare_FirstChunk().
	 *	@note Current chunk may be accessed using Get_Chunk().
	 */
	bool Prepare_NextChunk();

	/**
	 *	@brief gets current chunk
	 *
	 *	@return Returns const reference to current chunk.
	 *
	 *	@note Contents of returned buffer are invalid unless
	 *		Prepare_FirstChunk() (or subsequently Prepare_NextChunk()) is called.
	 *	@note The element type is CDocumentSplitter::size_t, which is not
	 *		the same type as ::size_t if __x64__ is defined.
	 */
	const std::vector<size_t> &Get_Chunk() const;

	/**
	 *	@brief builds term occurence table, and associated tables for feeding low-level algorithm
	 *	@return Returns true on success, false on failure.
	 */
	bool Build_TermOccurenceTable();

#ifdef DOC_SPLITTER_BUILD_CHUNK_TERM_LIST
	/**
	 *	@brief gets list of unique terms in the current chunk
	 *	@return Returns chunk term list.
	 *	@note This list is not filled until Build_TermOccurenceTable() is called.
	 */
	inline const std::vector<size_t> &Get_Chunk_TermList() const
	{
		return m_chunk_term_list;
	}
#endif //DOC_SPLITTER_BUILD_CHUNK_TERM_LIST

	/**
	 *	@brief gets table of term occurences in the current chunk
	 *	@return Returns term occurence table.
	 *	@note This list is not filled until Build_TermOccurenceTable() is called.
	 */
	const std::vector<TTermOccurence> &Get_TermOccurenceTable() const
	{
		return m_term_occurence_table;
	}

	/**
	 *	@brief gets list of offsets to term occurence list
	 *	@return Returns the offset list.
	 *	@note This list is not filled until Build_TermOccurenceTable() is called.
	 */
	const std::vector<size_t> &Get_TermOccurence_OffsetList() const
	{
		return m_offset_list;
	}

	/**
	 *	@brief gets list of term occurences for low-level algorithm
	 *
	 *	Term occurence list contains term id, number of occurences, and list of occurences
	 *	at each beginning offset (given by offsets in Get_TermOccurence_OffsetList()).
	 *
	 *	@return Returns term occurence list.
	 *	@note This list is not filled until Build_TermOccurenceTable() is called.
	 */
	const std::vector<size_t> &Get_TermOccurenceList() const
	{
		return m_occurence_list;
	}

	/**
	 *	@brief Gets offset list size limit, in items (not in bytes).
	 *	@return Returns maximal offset list size (equal to the maximal chunk size).
	 */
	size_t n_Max_OffsetList_Size() const
	{
		return m_n_max_chunk_size;
	}

	/**
	 *	@brief Gets occurence list size limit, in items (not in bytes).
	 *
	 *	The limit is the maximal chunk size plus two extra items per offset list entry
	 *	(the occurence list stores term id and number of occurences in front of each
	 *	list of occurences).
	 *
	 *	@return Returns maximal occurence list size.
	 */
	size_t n_Max_OccurenceList_Size() const
	{
		return m_n_max_chunk_size + 2 * n_Max_OffsetList_Size();
	}

	/**
	 *	@brief builds v2 term occurence table, and associated tables for feeding low-level algorithm
	 *
	 *	This function introduces the following hierarchy of steps and structures:
	 *		- algorithm input is list of occurences of each term. Processing window is placed over each occurence of each individual term, adding up contributions of co-occuring terms to focused term. This input is impractical to be processed directly, because there may be too many threads required to process it, or there may be too much work to be done by a single thread (processing too many occurences would cause thread to time-out on GPU). That's why it needs to be subdivided to more fine-grained work-items.
	 *		- <b>slice</b> is list of parts of term occurences, where no part may be longer than maximal slice length. Slices may be processed in parallel because each part of the slice contributes to different terms and thus each such part may be processed by a single thread without memory write race conditions.
	 *		- <b>work-item</b> is list, containing (part of) term occurences for a single term. Slices are made-up of work-items.
	 *		- <b>pass</b> is a single pass of the algorithm, building term vectors. It processes (part of) a single slice. There are primary passes, which process many work-items of different terms and secondary passes, which process many work-items of a few repeating terms.
	 *		- <b>dummy vector</b>, also dummy vector bank is place where secondary passes store results. Different threads cannot work on a single term, so each secondary pass work-item is assigned dummy term vector to store output.
	 *		- <b>summation step</b> is final step of secondary pass, when dummy vectors are added to the original vectors.
	 *
	 *	@param[in] n_max_slice_length is number of term occurences, processed by one GPU thread in one pass (default 256)
	 *	@param[in] n_max_pass_size is maximal number of GPU threads, running in parallel in one pass (default 10000)
	 *	@param[in] n_min_primary_pass_size is minimal number of GPU threads, running in primary pass (default 1000)
	 *	@param[in] n_min_last_primary_pass_size is minimal number of GPU threads, running in the last primary pass (default 500)
	 *	@param[in] n_dummy_vector_bank_num is number of dummy term vector banks; this should be no more than n_max_pass_size, otherwise all the banks above that number will be left unused (default 8192)
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note The "default" values above are recommendations only, the declaration
	 *		has no default arguments and all five values must be supplied.
	 */
	bool Build_TermOccurenceTable_v2(size_t n_max_slice_length,
		size_t n_max_pass_size, size_t n_min_primary_pass_size,
		size_t n_min_last_primary_pass_size, size_t n_dummy_vector_bank_num);

	/**
	 *	@brief gets list of work-items
	 *	@return Returns the work-item list.
	 *	@note This list is not filled until Build_TermOccurenceTable_v2() is called.
	 */
	const std::vector<TWorkItem> &Get_WorkItemList_v2() const
	{
		return m_work_item_list_v2;
	}

	/**
	 *	@brief gets list of term occurences for low-level algorithm
	 *
	 *	Term occurence list contains raw list of occurences (unlike its predecessor).
	 *
	 *	NOTE(review): this comment originally said the beginning offsets are given by
	 *	Get_TermOccurence_OffsetList_v2(), but no such function is declared in this class;
	 *	presumably the offsets are TWorkItem::n_offset of the items returned
	 *	by Get_WorkItemList_v2() - confirm against the implementation.
	 *
	 *	@return Returns term occurence list.
	 *	@note This list is not filled until Build_TermOccurenceTable_v2() is called.
	 */
	const std::vector<size_t> &Get_TermOccurenceList_v2() const
	{
		return m_occurence_list_v2;
	}

	/**
	 *	@brief gets list of OpenCL kernel passes for processing current chunk
	 *	@return Returns kernel pass list.
	 *	@note This list is not filled until Build_TermOccurenceTable_v2() is called.
	 */
	const std::vector<TPass> &Get_PassList_v2() const
	{
		return m_pass_list;
	}

	/**
	 *	@brief Gets v2 occurence list size limit, in items (not in bytes).
	 *	@return Returns maximal v2 occurence list size (equal to the maximal chunk size).
	 */
	size_t n_Max_OccurenceList_v2_Size() const
	{
		return m_n_max_chunk_size;
	}

	/**
	 *	@brief Gets v2 work-item list size limit, in items (not in bytes).
	 *	@return Returns maximal v2 work-item list size (equal to the maximal chunk size).
	 */
	size_t n_Max_WorkItemList_v2_Size() const
	{
		return m_n_max_chunk_size;
	}

	/**
	 *	@brief Gets pass list size limit, in items (not in bytes).
	 *
	 *	The returned value is the maximal chunk size, rounded up to the nearest
	 *	whole multiple of n_max_pass_size.
	 *
	 *	@param[in] n_max_pass_size is limit of slices, running
	 *		in parallel. tasks above this size must be subdivided.
	 *		Must not be zero (it is used as a divisor).
	 *
	 *	@return Returns maximal task pass list size.
	 *
	 *	@note NOTE(review): m_pass_list is documented to hold up to
	 *		chunk size / n_max_pass_size passes, while this returns a multiple of
	 *		n_max_pass_size (i.e. a value no smaller than the chunk size), not a quotient.
	 *		It may be a deliberately conservative bound - confirm before changing,
	 *		callers may size buffers by it.
	 */
	size_t n_Max_PassList_Size(size_t n_max_pass_size) const
	{
		return (m_n_max_chunk_size + n_max_pass_size - 1) -
			(m_n_max_chunk_size + n_max_pass_size - 1) % n_max_pass_size;
	}

protected:
	/**
	 *	@brief helper function for std::accumulate()
	 *
	 *	@param[in] n_length is first summation operand
	 *	@param[in] r_doc is second summation operand
	 *
	 *	@return Returns sum of n_length + r_doc.term_position_list.size().
	 *
	 *	@note This is declared inline but not defined in this header; the definition
	 *		must be visible in every translation unit that uses the function.
	 */
	static inline uint64_t SumDocLengths(uint64_t n_length, const TDocument &r_doc);

	/**
	 *	@brief calculates sum of lengths of all documents
	 *
	 *	@param[in] r_document_list is list of documents
	 *
	 *	@return Returns sum of lengths of all documents in r_document_list.
	 *	@f[
	 *		l_{docs} = \sum_{i=0}^{N_{docs} - 1} l_{doc_i}
	 *	@f]
	 */
	static uint64_t n_LengthOfAllDocuments(const std::vector <TDocument> &r_document_list);

	/**
	 *	@brief calculates length of concatenated document sequence
	 *
	 *	@param[in] n_length_of_all_documents is length of all documents (eg. obtained by call to n_LengthOfAllDocuments())
	 *	@param[in] n_document_num is number of documents
	 *	@param[in] n_halfwindow_size is half-size of window in the semantic term vector calculation algorithm
	 *
	 *	@return Returns length of concatenated document sequence.
	 *	@f[
	 *		l_{concat} = l_{docs} + l_{wnd} \left ( N_{docs} + 1 \right )
	 *	@f]
	 */
	static uint64_t n_LengthOfConcatSequence(uint64_t n_length_of_all_documents, size_t n_document_num, size_t n_halfwindow_size);

	/**
	 *	@brief calculates number of chunks to which documents shall be split for processing
	 *
	 *	@param[in] n_length_of_concat_sequence is length of all documents concatenated
	 *		with dummy spaces between them (eg. obtained by call to n_LengthOfConcatSequence())
	 *	@param[in] n_max_chunk_size is maximal chunk size (in term indices)
	 *	@param[in] n_halfwindow_size is half-size of window in the semantic term vector calculation algorithm
	 *
	 *	@return Returns number of chunks given their length.
	 *	@f[
	 *		N = \left \lceil
	 *				\frac{l_{concat} - 2l_{wnd}}{l_{chunk_{max}} - 2l_{wnd}}
	 *			\right \rceil
	 *	@f]
	 */
	static size_t n_Chunk_Num(uint64_t n_length_of_concat_sequence, size_t n_max_chunk_size, size_t n_halfwindow_size);

	/**
	 *	@brief gets term id from term-frequency pair
	 *
	 *	@param[in] t_pair is element of term frequency map in TDocument
	 *
	 *	@return Returns term id from t_pair, converted to int.
	 *
	 *	@note The id is narrowed to int; ids above INT_MAX are only caught
	 *		by the assertion (i.e. in builds where _ASSERTE is active).
	 */
	static inline int n_GetTerm_Id(std::map<size_t, size_t>::value_type t_pair)
	{
		_ASSERTE(t_pair.first <= INT_MAX);
		return int(t_pair.first);
	}

	/**
	 *	@brief greater-than comparator for sorting term occurence table by frequency
	 *
	 *	@param[in] a is first compared term occurence table item
	 *	@param[in] b is second compared term occurence table item
	 *
	 *	@return Returns true if first term has greater frequency than the other one, otherwise returns false.
	 */
	static inline bool b_HasGreaterFrequency(const TTermOccurence &a, const TTermOccurence &b)
	{
		return a.second.size() > b.second.size();
	}

	/**
	 *	@brief greater-than comparator for sorting term occurence permutation table by frequency
	 *
	 *	@param[in] a is first compared term occurence permutation table item
	 *	@param[in] b is second compared term occurence permutation table item
	 *
	 *	@return Returns true if first term has greater frequency than the other one, otherwise returns false.
	 *
	 *	@note Only the second member of each pair is compared (the frequency,
	 *		as generated by CGetIndexFrequencyPair).
	 */
	static inline bool b_HasGreaterFrequency_Perm(const std::pair<size_t, size_t> &a, const std::pair<size_t, size_t> &b)
	{
		return a.second > b.second;
	}

	/**
	 *	@brief greater-than comparator for std::lower_bound on work-item list
	 *
	 *	@param[in] r_t_work_item is work-item whose length is compared
	 *	@param[in] n_thresh is threshold
	 *
	 *	@return Returns true if r_t_work_item.n_length > n_thresh, otherwise returns false.
	 *
	 *	@note In debug builds, b_WorkItemLenght_Above is a preprocessor macro expanding
	 *		to a temporary CCompareWorkItemLenght function object (which accepts the arguments
	 *		in both orders); in other builds it is a plain static function. Being a macro,
	 *		the debug variant is not scoped to this class and stays defined for the rest
	 *		of the translation unit.
	 */
#ifdef _DEBUG
	class CCompareWorkItemLenght { /**< vc80 stl tests predicate sanity, need to have reverse operator as well */
	public:
		inline bool operator ()(size_t n_thresh, const TWorkItem &r_t_work_item) const /** @brief refer to CDocumentSplitter::b_WorkItemLenght_Above() */
		{
			return n_thresh > r_t_work_item.n_length;
		}

		inline bool operator ()(const TWorkItem &r_t_work_item, size_t n_thresh) const /** @brief refer to CDocumentSplitter::b_WorkItemLenght_Above() */
		{
			return r_t_work_item.n_length > n_thresh;
		}

		inline bool operator ()(const TWorkItem &r_t_work_item, const TWorkItem &r_t_work_item2) const /** @brief refer to CDocumentSplitter::b_WorkItemLenght_Above() */
		{
			return r_t_work_item.n_length > r_t_work_item2.n_length;
		}
	};
#define b_WorkItemLenght_Above CCompareWorkItemLenght()
#else //_DEBUG
	static inline bool b_WorkItemLenght_Above(const TWorkItem &r_t_work_item, size_t n_thresh)
	{
		return r_t_work_item.n_length > n_thresh;
	}
#endif //_DEBUG

	/**
	 *	@brief generates primary passes, processing given range of work-items
	 *
	 *	@param[in] n_first_work_item is index of the first work-item
	 *	@param[in] n_last_work_item is 1 + index of the last work-item
	 *	@param[in] n_max_pass_size is maximal number of work-items in a single pass
	 *	@param[in] b_slice_aligned_pass is set if all the work-items have length n_max_slice_length
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Generate_PrimaryPasses(size_t n_first_work_item, size_t n_last_work_item,
		size_t n_max_pass_size, bool b_slice_aligned_pass);
};

/**
 *	@brief simple test suite for CDocumentSplitter
 *
 *	@note size_t in this class is the global ::size_t, which is not the same type
 *		as CDocumentSplitter::size_t if __x64__ is defined.
 */
class CSplitterTester {
protected:
	std::vector<TDocument> m_document_list; /**< @brief test documents (sequence of contiguous terms) */
	size_t m_n_dummy_term; /**< @brief dummy term id (0) */
	size_t m_n_term_num; /**< @brief number of document terms (equals length of all documents) */
	bool m_b_contiguous_docs; /**< @brief for some tests, it's useful if documents contain repeating terms */

public:
	/**
	 *	@brief prepares documents for testing CDocumentSplitter
	 *
	 *	@param[in] n_doc_num is number of documents
	 *	@param[in] n_doc_size is minimal document size
	 *	@param[in] n_doc_size_variation is document size variation (positive only)
	 *	@param[in] b_contiguous_docs decides whether documents contain
	 *		contiguous terms (true), or whether they contain repeating terms as well (false)
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool PrepareDocs(size_t n_doc_num, size_t n_doc_size, size_t n_doc_size_variation, bool b_contiguous_docs = true);

	/**
	 *	@brief tests a single configuration of CDocumentSplitter
	 *
	 *	@param[in] n_chunk_size is size of chunks, documents are split to
	 *	@param[in] n_halfwindow_size is half-size of window in the semantic term vector calculation algorithm
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Test_SplitterIntegrity(size_t n_chunk_size, size_t n_halfwindow_size) const;

	/**
	 *	@brief runs some basic tests, shows progress in stdout
	 *	@return Returns true if all tests succeeded, otherwise returns false.
	 */
	static bool DoTests();

	/**
	 *	@brief runs timing test, displays results on stdout
	 *
	 *	@param[in] n_chunk_size is size of chunks, documents are split to
	 *	@param[in] n_halfwindow_size is half-size of window in the semantic term vector calculation algorithm
	 *	@param[in] n_repeat_num is number of trials over which timing is averaged
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Test_SplitterTiming(size_t n_chunk_size, size_t n_halfwindow_size, int n_repeat_num = 10) const;

	/**
	 *	@brief runs some timing tests, displays results on stdout
	 *
	 *	@param[in] n_repeat_num is number of trials over which timing is averaged
	 *
	 *	@return Returns true on success, false on failure.
	 */
	static bool DoSpeedTests(int n_repeat_num = 10);
};

#endif //__DOCUMENT_SPLITTER_INCLUDED
