/*
								+---------------------------------+
								|                                 |
								|   ***  Document splitter  ***   |
								|                                 |
								|  Copyright   -tHE SWINe- 2010  |
								|                                 |
								|          DocSplitter.h          |
								|                                 |
								+---------------------------------+
*/

/**
 *	@file DocSplitter.h
 *	@author -tHE SWINe-
 *	@brief Document stream splitter
 *	@date 2010-11-16
 */

#ifndef __DOCUMENT_SPLITTER_INCLUDED
#define __DOCUMENT_SPLITTER_INCLUDED

#include "../../UberLame_src/Integer.h"

/**
 *	@def DOC_SPLITTER2_TRACK_CHUNK_DOCS
 *	@brief if defined, CDocStreamSplitter tracks first and last document present in current chunk
 *	@deprecated This is deprecated as it's easier to track documents using
 *		information from CLucene. This feature is no longer implemented.
 */
//#define DOC_SPLITTER2_TRACK_CHUNK_DOCS

/**
 *	@def DOC_SPLITTER2_BUILD_CHUNK_TERM_LIST
 *	@brief if defined, CDocStreamSplitter builds list of terms, present in current chunk
 *	@note DOC_SPLITTER2_TRACK_CHUNK_DOCS is required
 *	@deprecated This is deprecated as CLucene already provides this list. This feature is no longer implemented.
 */
//#define DOC_SPLITTER2_BUILD_CHUNK_TERM_LIST

/**
 *	@def DOC_SPLITTER2_USE_TERM_OCCURENCE_PERMUTATION_TABLE
 *	@brief if defined, CDocStreamSplitter employs permutation table to avoid sorting large term occurence table directly
 *	@note DOC_SPLITTER2_BUILD_CHUNK_TERM_LIST overrides this
 *	@deprecated This is deprecated as CLucene already provides this list. This feature is always implemented.
 */
#define DOC_SPLITTER2_USE_TERM_OCCURENCE_PERMUTATION_TABLE

/**
 *	@def DOC_SPLITTER2_ALIGNED_SLICES_IN_FIRST_PASS
 *	@brief if defined, terms with frequency equal to a whole multiple of slice length are processed in the first pass, potentially increasing slice length variance in that pass and thus lowering computation efficiency
 *	@note This is mostly insignificant, there are almost no such terms.
 *	@note In case DOC_SPLITTER2_SLICE_REMAINDERS_TO_THE_FIRST_PASS is not defined, this only applies to terms with frequency equal to slice length (i.e. not to its multiples).
 */
//#define DOC_SPLITTER2_ALIGNED_SLICES_IN_FIRST_PASS

/**
 *	@def DOC_SPLITTER2_SLICE_REMAINDERS_TO_THE_FIRST_PASS
 *	@brief if defined, slices with size less than maximal slice length are placed in the first pass (otherwise they would be in all the passes), so all the next chunks have slices with exactly the same size
 *	@note Order of processing term occurences is not changed, ie. last slices aren't moved to the first pass, instead first slices are made shorter as required. DOC_SPLITTER2_ALIGNED_SLICES_IN_FIRST_PASS affects some of those slices.
 */
#define DOC_SPLITTER2_SLICE_REMAINDERS_TO_THE_FIRST_PASS

/**
 *	@def DOC_SPLITTER2_USE_HASH_CONTAINERS
 *	@brief uses stdext::hash_map instead of std::map (specifically for building the term occurence table)
 */
//#define DOC_SPLITTER2_USE_HASH_CONTAINERS

/**
 *	@brief splits document streams into chunks
 *
 *	This is a slightly cannibalized version of the original CDocStreamSplitter, designed to work with a generic stream of documents (contrary to the std::vector<TDocument> its predecessor worked with).
 *
 *	Chunks contain indices of terms in the order they appear in consecutive documents.
 *	Individual documents are separated by a number of occurrences of the dummy term to prevent
 *	the last term of one document having effect on the first term of the next document.
 *	In order to make processing code simple, each chunk has enough (half-window size)
 *	leading and trailing padding (dummy terms) so the code reading terms under the
 *	window doesn't need to check for array boundaries. Consequently, in case all the documents
 *	do not fit in a single chunk, part of the last chunk is repeated in the next chunk.
 *	This repeated part is called restart interval and it is two half-window sizes long.
 *
 *	Naive processing code then looks like this:
 *
 *@code
 *	CDocStreamSplitter::CDocumentStorage *docs = ...;
 *	CDocStreamSplitter::term_id_t dummy_term_id = ...; // eg. terms.size()
 *	size_t chunk_size = ...; // eg. cl_max_allocation_size / sizeof(float)
 *	size_t halfwindow_size = 10;
 *
 *	CDocStreamSplitter splitter(*docs, dummy_term_id, chunk_size, halfwindow_size);
 *
 *	splitter.Prepare_FirstChunk(); // todo - mind some error-checking
 *	do {
 *		const std::vector<CDocStreamSplitter::term_id_t> &chunk = splitter.Get_Chunk();
 *		for(size_t i = halfwindow_size; i < chunk.size() - halfwindow_size; ++ i) {
 *			size_t focused_term_id = chunk[i];
 *			for(size_t wnd = i - halfwindow_size; wnd <= i + halfwindow_size; ++ wnd) {
 *				if(wnd == i)
 *					continue;
 *				size_t co_occuring_term_id = chunk[wnd];
 *
 *				// todo - update term vectors for focused_term_id, based on co_occuring_term_id
 *			}
 *		}
 *	} while(splitter.Prepare_NextChunk());@endcode
 *
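 *	Note the effective window size is twice half-window size + 1.
 *
 *	@note The example above calls Prepare_FirstChunk() and Prepare_NextChunk(); this class
 *		declares neither of them. The chunk-building function it declares is Build_NextChunk()
 *		(together with the b_Have_NextChunk() query), so the loop needs to be adapted accordingly.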
 *
 *	@note There are functions to build lists of work-items, which makes more sophisticated
 *		parallel processing viable. Specifically Build_TermOccurenceTable() and Build_TermOccurenceTable_v2(),
 *		which are most effective for term vector element-parallel (long seed vectors, eg. retraining) and
 *		term-parallel (sparse seed vectors) processing, respectively.
 *
 */
class CDocStreamSplitter {
public:
	typedef uint32_t term_id_t; /**< @brief primitive type for storing term id's (will be used on GPU) */
	typedef uint32_t chunk_off_t; /**< @brief primitive data type for indexing the chunk (will be used on GPU) */

/**
 *	@def CHUNK_OFF_MAX
 *	@brief maximal value of the chunk_off_t type
 *	@note This is a preprocessor macro; although it is defined inside the class body,
 *		the name is not scoped to the class.
 */
#define CHUNK_OFF_MAX UINT32_MAX

	/**
	 *	@brief document storage virtual class
	 *
	 *	Abstract interface through which the splitter reads documents, one at a time.
	 *	All the member functions except for the destructor are pure virtual.
	 */
	class CDocumentStorage {
	public:
		/**
		 *	@brief virtual destructor
		 *
		 *	This class is a polymorphic interface; the virtual destructor makes it well-defined
		 *	to destroy an implementation through a pointer to CDocumentStorage.
		 */
		virtual ~CDocumentStorage()
		{}

		/**
		 *	@brief resets index of the current document to zero
		 */
		virtual void Reset() = 0;

		/**
		 *	@brief determines whether there are more documents
		 *	@return Returns true if document index is less than document count, otherwise returns false.
		 */
		virtual bool b_HaveMoreDocuments() const = 0;

		/**
		 *	@brief gets list of terms in the current document, and increments the document counter
		 *
		 *	@param[out] r_term_vector is list of term id's of the current document
		 *		(to be filled by the implementation)
		 *
		 *	@return The meaning of the return value is not specified in this header; presumably
		 *		true on success and false on failure (todo - confirm with the implementations).
		 */
		virtual bool GetNextDocument(std::vector<term_id_t> &r_term_vector) = 0;
	};

	/**
	 *	@brief generic range of generic data
	 *
	 *	A Build_TermOccurenceTable_v2() structure.
	 */
	struct TRange {
		chunk_off_t n_offset; /**< @brief zero-based index of first data-item */
		chunk_off_t n_length; /**< @brief number of data-items */

		/**
		 *	@brief default constructor; constructs empty range
		 */
		TRange();

		/**
		 *	@brief constructor
		 *
		 *	@param[in] _n_offset is zero-based index of first data-item
		 *	@param[in] _n_length is number of data-items
		 */
		TRange(chunk_off_t _n_offset, chunk_off_t _n_length);

		/**
		 *	@brief less-than operator for sorting ranges by their length
		 *
		 *	@param[in] r_t_other is the other range, this range is compared to
		 *
		 *	@return Returns true if this range is shorter than r_t_other.
		 */
		bool operator <(const TRange &r_t_other) const;

		/**
		 *	@brief greater-than operator for sorting ranges by their length
		 *
		 *	@param[in] r_t_other is the other range, this range is compared to
		 *
		 *	@return Returns true if this range is longer than r_t_other.
		 */
		bool operator >(const TRange &r_t_other) const;
	};

	/**
	 *	@brief generic range of term-assigned data
	 *
	 *	A Build_TermOccurenceTable_v2() structure.
	 */
	struct TTermRange : public TRange {
		term_id_t n_term_id; /**< @brief term this range is assigned to */

		/**
		 *	@brief default constructor; constructs empty range
		 */
		TTermRange();

		/**
		 *	@brief constructor
		 *
		 *	@param[in] _n_offset is zero-based index of first data-item
		 *	@param[in] _n_length is number of data-items
		 *	@param[in] _n_term_id is term this range is assigned to
		 */
		TTermRange(chunk_off_t _n_offset, chunk_off_t _n_length, term_id_t _n_term_id);
	};

	/**
	 *	@brief build term vectors algorithm work-item (range of occurences of specific term)
	 *
	 *	A Build_TermOccurenceTable_v2() structure.
	 */
	typedef TTermRange TWorkItem;

	/**
	 *	@brief description of one pass of build term vectors algorithm
	 *
	 *	The inherited TRange part is a range of work-items (see the constructor parameters).
	 *
	 *	A Build_TermOccurenceTable_v2() structure.
	 */
	struct TPass : public TRange {
		/**
		 *	@brief summation step range (points to dummy vector banks)
		 *
		 *	A Build_TermOccurenceTable_v2() structure.
		 */
		typedef TTermRange TSummationStep;

		bool b_primary; /**< @brief primary task passes work on real vectors; secondary task passes subdivide few long vectors and work on dummy places which are later summed and added to real vectors */
#ifdef DOC_SPLITTER2_SLICE_REMAINDERS_TO_THE_FIRST_PASS
		bool b_slice_aligned; /**< @brief if set, all work-items in this pass are n_max_slice_length long (requires DOC_SPLITTER2_SLICE_REMAINDERS_TO_THE_FIRST_PASS) */
#endif //DOC_SPLITTER2_SLICE_REMAINDERS_TO_THE_FIRST_PASS
		std::vector<TSummationStep> summation_list; /**< @brief list of summations performed by secondary task passes (those are data-parallel and are executed one by one, do not need to reside in an array to be run in a single OpenCL batch) */

		/**
		 *	@brief default constructor; constructs empty pass
		 */
		TPass();

		/**
		 *	@brief constructor
		 *
		 *	@param[in] _n_offset is zero-based index of first work-item (points to CDocStreamSplitter::m_work_item_list_v2)
		 *	@param[in] _n_length is number of work-items
		 *	@param[in] _b_primary is primary pass flag
		 */
		TPass(chunk_off_t _n_offset, chunk_off_t _n_length, bool _b_primary = true);
	};

	typedef std::pair<term_id_t, std::vector<chunk_off_t> > TTermOccurence; /**< @brief term occurence (term and list of its positions in the chunk) */

protected:
	class CGetIndexFrequencyPair; /**< @brief a small function object for building the term occurence permutation table */

	CDocumentStorage &m_r_doc_storage; /**< @brief reference to the list of documents (being a reference member, it also makes the splitter non-assignable) */
	chunk_off_t m_n_max_chunk_size; /**< @brief target chunk size */
	chunk_off_t m_n_window_size; /**< @brief half-size of window in the semantic term vector calculation algorithm */
	term_id_t m_n_dummy_term; /**< @brief index of dummy term vector */

	size_t m_n_current_chunk; /**< @brief zero-based index of the current chunk */
	std::vector<term_id_t> m_current_doc; /**< @brief term vector of the current document */
	size_t m_n_current_doc_off; /**< @brief offset into the current document, and its padding */
	std::vector<term_id_t> m_chunk; /**< @brief buffer with current chunk */

	std::vector<TTermOccurence> m_term_occurence_table; /**< @brief table of term occurences */

	std::vector<TPass> m_pass_list; /**< @brief list of build term vector algorithm passes; contains up to chunk size / n_max_pass_size task passes */
	std::vector<TWorkItem> m_work_item_list_v2; /**< @brief list of build term vector algorithm work-items; contains up to chunk size work-items */
	std::vector<chunk_off_t> m_occurence_list_v2; /**< @brief list of raw term occurences in the current chunk; contains up to chunk size items */

public:
	/**
	 *	@brief default constructor
	 *
	 *	@param[in] r_doc_storage is document storage provider
	 *	@param[in] n_dummy_term is index of dummy term vector (vector, containing nulls, used to separate documents)
	 *	@param[in] n_max_chunk_size is maximal chunk size (in term indices)
	 *	@param[in] n_halfwindow_size is half-size of window in the semantic term vector calculation algorithm
	 *
	 *	@note The splitter stores a reference to the document storage (m_r_doc_storage);
	 *		presumably it refers to r_doc_storage, which then must outlive the splitter
	 *		(todo - confirm in the implementation).
	 *	@note The sizes are passed as size_t, but the corresponding members are chunk_off_t
	 *		(32-bit); how out-of-range values are handled is not visible in this header.
	 */
	CDocStreamSplitter(CDocumentStorage &r_doc_storage,
		term_id_t n_dummy_term, size_t n_max_chunk_size, size_t n_halfwindow_size);

	/**
	 *	@brief resets the document splitter
	 *
	 *	@note This function is declared void; it does not report success
	 *		or failure by a return value.
	 */
	void Reset();

	/**
	 *	@brief determines whether there are going to be more chunks
	 *
	 *	There are more chunks if the offset into the current document is still less than the
	 *	size of the current document plus the (half) window size (the offset also covers
	 *	the padding of the document), or if the document storage has more documents.
	 *
	 *	@return Returns true if there are documents to build the next chunk from, otherwise returns false.
	 */
	inline bool b_Have_NextChunk() const
	{
		_ASSERTE(m_n_current_doc_off <= m_current_doc.size() + m_n_window_size);
		return m_n_current_doc_off < m_current_doc.size() + m_n_window_size || m_r_doc_storage.b_HaveMoreDocuments();
	}

	/**
	 *	@brief prepares the next chunk
	 *
	 *	@return Returns true on success, or false if there are no more chunks.
	 *
	 *	@note Current chunk may be accessed using Get_Chunk().
	 */
	bool Build_NextChunk();

	/**
	 *	@brief gets current chunk
	 *
	 *	@return Returns const reference to current chunk.
	 *
	 *	@note Contents of returned buffer are invalid unless a chunk
	 *		was successfully built by Build_NextChunk().
	 */
	inline const std::vector<term_id_t> &Get_Chunk() const
	{
		return m_chunk;
	}

	/**
	 *	@brief builds the v2 term occurence table
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool Build_TermOccurenceTable_v2();

	/**
	 *	@brief gets table of term occurences in the current chunk
	 *
	 *	@return Returns term occurence table.
	 *
	 *	@note This list is not filled until Build_TermOccurenceTable_v2() is called.
	 */
	inline const std::vector<TTermOccurence> &Get_TermOccurenceTable() const
	{
		return m_term_occurence_table;
	}

	/**
	 *	@brief builds v2 structures for feeding the low-level algorithm
	 *
	 *	This specifically fills the low-level (raw) term occurence list, builds the work-item list
	 *	which contains slice-sized work-items, and fills the pass list.
	 *
	 *	This function introduces the following hierarchy of steps and structures:
	 *		- algorithm input is list of occurences of each term. Processing window is placed over each occurence of each individual term, adding up contributions of co-occuring terms to focused term. This input is impractical to be processed directly, because there may be too many threads required to process it, or there may be too much work to be done by a single thread (processing too many occurences would cause thread to time-out on GPU). That's why it needs to be subdivided to more fine-grained work-items.
	 *		- <b>slice</b> is list of parts of term occurences, where no part may be longer than maximal slice length. Slices may be processed in parallel because each part of the slice contributes to different terms and thus each such part may be processed by a single thread without memory write race conditions.
	 *		- <b>work-item</b> is list, containing (part of) term occurences for a single term. Slices are made-up of work-items.
	 *		- <b>pass</b> is a single pass of the algorithm, building term vectors. It processes (part of) a single slice. There are primary passes, which process many work-items of different terms and secondary passes, which process many work-items of a few repeating terms.
	 *		- <b>dummy vector</b>, also dummy vector bank is place where secondary passes store results. Different threads cannot work on a single term, so each secondary pass work-item is assigned dummy term vector to store output.
	 *		- <b>summation step</b> is final step of secondary pass, when dummy vectors are added to the original vectors.
	 *
	 *	@param[in] n_max_slice_length is number of term occurences, processed by one GPU thread in one pass (suggested value 256)
	 *	@param[in] n_max_pass_size is maximal number of GPU threads, running in parallel in one pass (suggested value 10000)
	 *	@param[in] n_min_primary_pass_size is minimal number of GPU threads, running in primary pass (suggested value 1000)
	 *	@param[in] n_min_last_primary_pass_size is minimal number of GPU threads, running in the last primary pass (suggested value 500)
	 *	@param[in] n_dummy_vector_bank_num is number of dummy term vector banks; this should be no more than n_max_pass_size, otherwise all the banks above that number will be left unused (suggested value 8192)
	 *
	 *	@return Returns true on success, false on failure.
	 *
	 *	@note This must be called after Build_TermOccurenceTable_v2().
	 *	@note The declaration has no default arguments; the suggested values
	 *		above need to be passed explicitly.
	 */
	bool Build_PassList(chunk_off_t n_max_slice_length,
		chunk_off_t n_max_pass_size, chunk_off_t n_min_primary_pass_size,
		chunk_off_t n_min_last_primary_pass_size, size_t n_dummy_vector_bank_num);

	/**
	 *	@brief gets list of work-items
	 *
	 *	@return Returns the work-item list.
	 *
	 *	@note According to the documentation of Build_PassList(), this list is built by
	 *		Build_PassList() (which in turn requires Build_TermOccurenceTable_v2()
	 *		to be called first).
	 */
	inline const std::vector<TWorkItem> &Get_WorkItemList_v2() const
	{
		return m_work_item_list_v2;
	}

	/**
	 *	@brief gets v2 work-item list size limit, in items (not in bytes)
	 *	@return Returns maximal v2 work-item list size (equal to the maximal chunk size).
	 */
	inline chunk_off_t n_Max_WorkItemList_v2_Size() const
	{
		return m_n_max_chunk_size;
	}

	/**
	 *	@brief gets list of term occurences for low-level algorithm
	 *
	 *	Term occurence list contains raw list of occurences (unlike its predecessor).
	 *	The ranges into this list are presumably given by the work-items in
	 *	Get_WorkItemList_v2() (todo - confirm; the Get_TermOccurence_OffsetList_v2()
	 *	function, originally referred to here, is not declared in this class).
	 *
	 *	@return Returns term occurence list.
	 *
	 *	@note According to the documentation of Build_PassList(), this list is filled by
	 *		Build_PassList() (which in turn requires Build_TermOccurenceTable_v2()
	 *		to be called first).
	 */
	inline const std::vector<chunk_off_t> &Get_TermOccurenceList_v2() const
	{
		return m_occurence_list_v2;
	}

	/**
	 *	@brief gets v2 occurence list size limit, in items (not in bytes)
	 *	@return Returns maximal v2 occurence list size (equal to the maximal chunk size).
	 */
	inline chunk_off_t n_Max_OccurenceList_v2_Size() const
	{
		return m_n_max_chunk_size;
	}

	/**
	 *	@brief gets list of OpenCL kernel passes for processing current chunk
	 *
	 *	@return Returns kernel pass list.
	 *
	 *	@note According to the documentation of Build_PassList(), this list is filled by
	 *		Build_PassList() (which in turn requires Build_TermOccurenceTable_v2()
	 *		to be called first).
	 */
	inline const std::vector<TPass> &Get_PassList_v2() const
	{
		return m_pass_list;
	}

	/**
	 *	@brief gets pass list size limit, in items (not in bytes)
	 *
	 *	@param[in] n_max_pass_size is limit of slices, running
	 *		in parallel. Tasks above this size must be subdivided.
	 *
	 *	@return Returns maximal task pass list size, calculated as
	 *		n_Align_Up(maximal chunk size, n_max_pass_size).
	 *
	 *	@note NOTE(review): m_pass_list is documented to contain up to
	 *		chunk size / n_max_pass_size passes; n_Align_Up() is defined outside of this
	 *		file and presumably rounds up to a multiple of n_max_pass_size rather than
	 *		dividing - confirm that the returned limit is the intended one.
	 */
	inline size_t n_Max_PassList_v2_Size(size_t n_max_pass_size) const
	{
		return n_Align_Up(size_t(m_n_max_chunk_size), n_max_pass_size);
	}

protected:
	/**
	 *	@brief comparison predicate for pairs of chunk_off_t
	 *
	 *	@param[in] a is the first pair
	 *	@param[in] b is the second pair
	 *
	 *	@return Judging by the name, presumably returns true if a has greater frequency
	 *		than b (used with the term occurence permutation table; todo - confirm
	 *		which element of the pair is the frequency in the implementation).
	 */
	static inline bool b_HasGreaterFrequency_Perm(const std::pair<chunk_off_t, chunk_off_t> &a,
		const std::pair<chunk_off_t, chunk_off_t> &b);
#ifdef _DEBUG
	class CCompareWorkItemLenght; /**< vc80 stl tests predicate sanity, need to have reverse operator as well */
#else //_DEBUG
	/**
	 *	@brief work-item length predicate (release builds only, debug builds use CCompareWorkItemLenght)
	 *
	 *	@param[in] r_t_work_item is the work-item
	 *	@param[in] n_thresh is length threshold
	 *
	 *	@return Judging by the name, presumably returns true if length of r_t_work_item
	 *		is above n_thresh (todo - confirm in the implementation).
	 */
	static inline bool b_WorkItemLenght_Above(const TWorkItem &r_t_work_item, chunk_off_t n_thresh);
#endif //_DEBUG

	/**
	 *	@brief generates primary passes for a range of work-items
	 *
	 *	@param[in] n_first_work_item is index of the first work-item of the range
	 *	@param[in] n_last_work_item is index of the last work-item of the range
	 *		(todo - confirm whether this bound is inclusive)
	 *	@param[in] n_max_pass_size is maximal number of work-items in one pass
	 *	@param[in] b_slice_aligned_pass is slice-aligned pass flag
	 *		(presumably stored to TPass::b_slice_aligned - confirm)
	 *
	 *	@return Presumably returns true on success, false on failure
	 *		(todo - confirm in the implementation).
	 */
	bool Generate_PrimaryPasses(size_t n_first_work_item, size_t n_last_work_item,
		size_t n_max_pass_size, bool b_slice_aligned_pass);
};

#endif //__DOCUMENT_SPLITTER_INCLUDED
