/*
								+---------------------------------+
								|                                 |
								| *** Lucene doc reader iface *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2010  |
								|                                 |
								|           DocReader.h           |
								|                                 |
								+---------------------------------+
*/

/**
 *	@file DocReader.h
 *	@author -tHE SWINe-
 *	@brief CLucene document reader interface
 *	@date 2010-11-06
 */

#ifndef __LUCENE_READER_INCLUDED
#define __LUCENE_READER_INCLUDED

/**
 *	@def __LUCENE_READER_EXPECT_GIGAWORD_DOCS
 *	@brief we want to process documents with gigaword-specific fields
 */
#define __LUCENE_READER_EXPECT_GIGAWORD_DOCS

/**
 *	@def __LUCENE_READER_USE_HASH_CONTAINERS
 *	@brief decides whether to use stdext::hash_map and stdext::hash_set
 *		instead of std::map and std::set
 */
//#define __LUCENE_READER_USE_HASH_CONTAINERS

#include "CLucene.h"
#include <string>
#include <vector>
#include <map>
#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
#include <hash_map>

#if defined(_MSC_VER) && !defined(__MWERKS__) && _MSC_VER < 1400 && !defined(stdext)
#define stdext std
#endif //_MSC_VER && !__MWERKS__ && _MSC_VER < 1400
// msvc60 doesn't have stdext::hash_map, it provides std::hash_map instead

#endif //__LUCENE_READER_USE_HASH_CONTAINERS

/**
 *	@brief class implementing basic read operations on CLucene index
 *
 *	Holds a lucene::index::IndexReader, the list of index terms with their
 *	frequencies (optionally mapped to lemmas) and the term vector of the
 *	last document retrieved using Get_Document().
 */
class CLuceneIndexReaderInterface {
public:
	typedef std::basic_string<TCHAR> wstring; /**< string of TCHAR characters (originally described as utf-16; presumably TCHAR is a 16-bit wchar_t in the intended build - confirm) */

	// special term id values; the enum form is kept for reference only, the macros below are what gets compiled
#if 0 // so large values in enum cause problems in vc2008. what's worse, it doesn't even report any errors
	enum {
		term_Dummy = UINT32_MAX,
		term_RemoveOp = UINT32_MAX - 1,
		term_MaxTermId = UINT32_MAX - 1
	};
#else
// note that these are preprocessor macros: they do not obey class scope and are
// visible to any code that follows the inclusion of this header

// dummy term id (m_lemmatized_term_id_list stores it for rejected terms)
#define term_Dummy (UINT32_MAX)
// presumably marks term ids that are to be dropped from a term vector
// (cf. the n_remove_term_id parameter of CConditionalCopy) - confirm against the implementation
#define term_RemoveOp (UINT32_MAX - 1)
// has the same value as term_RemoveOp; presumably the upper bound of term ids - confirm whether inclusive
#define term_MaxTermId (UINT32_MAX - 1)
#endif

protected:
	/**
	 *	@brief hash function and ordering predicate for null-terminated TCHAR strings
	 *
	 *	The one-argument call operator hashes a key, the two-argument call operator
	 *	orders two keys, so the same class can parametrize either variant
	 *	of CWideCStringIndexMap. Keys are always compared by the characters
	 *	they point to, never by the pointer values.
	 */
	class CWideCStringHasher {
	public:
		/**
		 *	@brief hash table tuning constants
		 *	@note Presumably these are the traits constants stdext::hash_map reads
		 *		from its hasher class - confirm against the MSVC hash_compare documentation.
		 */
		enum {
			bucket_size = 256,
			min_buckets = 16384 // we expect *lots* of terms
		};

		/**
		 *	@brief default constructor (has no effect)
		 */
		inline CWideCStringHasher()
		{}

		/**
		 *	@brief hash function
		 *
		 *	@param[in] p_key is null-terminated string (key in hash-map), must not be null
		 *
		 *	@return Returns hash of p_key (a multiplicative hash, seeded with 13,
		 *		multiplied by 23 for every character).
		 */
		inline size_t operator ()(const TCHAR *p_key) const
		{
			size_t hash_value = 13;
			for(; *p_key; ++ p_key)
				hash_value = hash_value * 23 + *p_key;
			return hash_value;
		}

		/**
		 *	@brief less-than ordering function
		 *
		 *	@param[in] p_left is null-terminated string, must not be null
		 *	@param[in] p_right is null-terminated string, must not be null
		 *
		 *	@return Returns true if p_left is lexicographically before p_right,
		 *		otherwise returns false.
		 */
		inline bool operator ()(const TCHAR *p_left, const TCHAR *p_right) const
		{
			// compare element by element instead of calling wcscmp(), which only accepts
			// wchar_t strings; this works for whichever character type TCHAR happens to be
			// and does not depend on <wchar.h> being included by someone else
			while(*p_left && *p_left == *p_right) {
				++ p_left;
				++ p_right;
			}
			// either the first difference, or the terminating null of p_left was reached

			return *p_left < *p_right;
		}
	};

	/**
	 *	@brief logarithmic scale histogram (bin size increases exponentially)
	 *
	 *	Bin i counts the values in the half-open interval [2^i - 1, 2^(i + 1) - 1).
	 *	Values that fall in no bin (too large, or negative) are counted in n_off_scale.
	 *	There is no constructor, the counters are indeterminate until Reset() is called.
	 *
	 *	@tparam n_bin_num is number of histogram bins (the bin bounds are calculated
	 *		using int shifts, so this must be smaller than the number of bits in int)
	 *
	 *	@note This is more of a debug / stats functionality.
	 */
	template <const int n_bin_num>
	struct TLogScaleHistogram {
		size_t p_freq[n_bin_num]; /**< numbers of values counted in the individual bins */
		size_t n_off_scale; /**< number of values that did not fit in any of the bins */

		/**
		 *	@brief gets lower bound of a bin
		 *	@param[in] n_bin is zero-based index of the bin
		 *	@return Returns the smallest value that belongs to the given bin (2^n_bin - 1).
		 */
		static inline int n_Low(int n_bin)
		{
			return (1 << (n_bin)) - 1;
		}

		/**
		 *	@brief gets upper bound of a bin
		 *	@param[in] n_bin is zero-based index of the bin
		 *	@return Returns the smallest value that is above the given bin
		 *		(exclusive upper bound, equal to n_Low(n_bin + 1)).
		 */
		static inline int n_High(int n_bin)
		{
			return n_Low(n_bin) + (1 << (n_bin));
		}

		/**
		 *	@brief sets all the counters (including the off-scale one) to zero
		 */
		void Reset()
		{
			for(int i = 0; i < n_bin_num; ++ i)
				p_freq[i] = 0;
			n_off_scale = 0;
		}

		/**
		 *	@brief counts a single value
		 *	@param[in] n_value is the value to be counted (the counter of the bin
		 *		containing it is incremented, or the off-scale counter if there is none)
		 */
		void Count(int n_value)
		{
			for(int i = 0; i < n_bin_num; ++ i) {
				if(n_value >= n_Low(i) && n_value < n_High(i)) {
					++ p_freq[i];
					return;
				}
			}
			++ n_off_scale;
		}

		/**
		 *	@brief gets number of counted values
		 *	@return Returns sum of all the counters, including the off-scale one.
		 */
		size_t n_Sum() const
		{
			size_t n_result = n_off_scale;
			for(int i = 0; i < n_bin_num; ++ i)
				n_result += p_freq[i];
			return n_result;
		}

		/**
		 *	@brief gets the highest counter value
		 *	@return Returns maximum of all the counters, including the off-scale one.
		 */
		size_t n_Max() const
		{
			// plain comparisons are used instead of max(), so that this does not depend
			// on a max macro or a using directive being supplied by whoever includes the header
			size_t n_result = n_off_scale;
			for(int i = 0; i < n_bin_num; ++ i) {
				if(n_result < p_freq[i])
					n_result = p_freq[i];
			}
			return n_result;
		}

		/**
		 *	@brief prints the histogram to stdout
		 *
		 *	There is one line per bin and one for the off-scale counter, each contains
		 *	a bar of asterisks; the bars are scaled so that the highest counter
		 *	gets 50 asterisks. An empty histogram prints empty bars.
		 */
		void Print() const
		{
			const size_t n_max = n_Max();
			const double f_renorm = (n_max)? 50.0 / n_max : 0; // avoid division by zero if nothing was counted
			for(int i = 0; i < n_bin_num; ++ i) {
				printf("%5d - %-5d: %5lu: %s\n", n_Low(i), n_High(i),
					(unsigned long)p_freq[i], p_s_Bar(p_freq[i], f_renorm)); // size_t must not be passed as %d
			}
			printf("  off-scale  : %5lu: %s\n", (unsigned long)n_off_scale,
				p_s_Bar(n_off_scale, f_renorm));
		}

	private:
		/**
		 *	@brief gets a bar of asterisks for a counter value
		 *
		 *	@param[in] n_count is value of the counter
		 *	@param[in] f_renorm is scaling factor (asterisks per unit of count)
		 *
		 *	@return Returns pointer to a null-terminated string of ceil(f_renorm * n_count)
		 *		asterisks, clamped to the range of 0 to 50.
		 */
		static const char *p_s_Bar(size_t n_count, double f_renorm)
		{
			static const char p_s_stars[] = "**********" "**********"
				"**********" "**********" "**********";
			const int n_max_length = int(sizeof(p_s_stars) - 1);

			int n_length = int(ceil(f_renorm * n_count));
			if(n_length < 0)
				n_length = 0;
			if(n_length > n_max_length)
				n_length = n_max_length;
			// the product can round slightly above the bar width, clamp it
			// so that the pointer below can never leave the array

			return p_s_stars + (n_max_length - n_length);
		}
	};

	/**
	 *	@brief map with null-terminated TCHAR string keys and size_t values
	 *
	 *	This is a hash-map if __LUCENE_READER_USE_HASH_CONTAINERS is defined, otherwise
	 *	it is an ordered std::map (CWideCStringHasher then serves as the ordering predicate).
	 *
	 *	@note The keys are raw pointers, the map does not copy the characters; the strings
	 *		the keys point to must stay valid for as long as the entries exist.
	 */
#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
	typedef stdext::hash_map<const TCHAR*, size_t, CWideCStringHasher> CWideCStringIndexMap;
#else
	typedef std::map<const TCHAR*, size_t, CWideCStringHasher> CWideCStringIndexMap;
#endif

	lucene::index::IndexReader *m_p_index_reader; /**< CLucene index reader */

	std::vector<std::pair<wstring, size_t> > m_term_list; /**< list of terms present in the index, and of their frequencies */
	CWideCStringIndexMap m_term_global_index_map; /**< maps terms to indices to m_term_list (presumably the keys point to the strings stored in m_term_list - confirm) */

	std::vector<std::pair<wstring, size_t> > m_lemma_list; /**< list of lemma names and numbers of references */
	std::vector<uint32_t> m_lemmatized_term_id_list; /**< same size as m_term_list, only it contains id's of lemmas / dummy term id for rejected terms */

	std::vector<uint32_t> m_document_term_list; /**< list of terms, present in the last document retrieved using Get_Document(); those are lemmatized */
	std::vector<uint32_t> m_temp_term_vector; /**< unfiltered version of m_document_term_list (only updated if filtering n_remove_term_id's) */

	TLogScaleHistogram<16> m_t_docu_tr_hist; /**< log-scale histogram with 16 bins (presumably of the numbers of terms removed per document, printed by Dump_TermRemovalHistogram() - confirm); has no constructor, Reset() needs to be called before use */

public:
	/**
	 *	@brief constructor
	 *
	 *	@param[in] p_s_input_index is null-terminated string identifying the index to read
	 *		(presumably a path to the index - confirm against the implementation)
	 *
	 *	@note The definition is not in this header; presumably the result
	 *		is to be checked by calling b_Status() - confirm.
	 *	@note NOTE(review): this constructor is not explicit, const char* converts implicitly.
	 */
	CLuceneIndexReaderInterface(const char *p_s_input_index);

	/**
	 *	@brief destructor
	 *
	 *	@note NOTE(review): no copy-constructor or copy-operator is declared, so copying
	 *		an instance copies the raw m_p_index_reader pointer; if the destructor
	 *		releases the reader (not visible in this header), copies must be avoided.
	 */
	~CLuceneIndexReaderInterface();

	/**
	 *	@brief gets status of the object
	 *	@return Presumably returns true if the index was opened successfully,
	 *		otherwise returns false - confirm against the implementation.
	 */
	bool b_Status() const;

	/**
	 *	@brief gets term vector of the last document
	 *	@return Returns const reference to m_document_term_list (the list of terms,
	 *		present in the last document retrieved using Get_Document()).
	 */
	inline const std::vector<uint32_t> &r_Document_TermVector() const
	{
		return m_document_term_list;
	}

	/**
	 *	@brief gets list of terms or lemmas, and of their counts
	 *	@return Returns const reference to m_lemma_list if it is not empty,
	 *		otherwise returns const reference to m_term_list.
	 */
	inline const std::vector<std::pair<wstring, size_t> > &r_TermList() const
	{
		return (m_lemma_list.empty())? m_term_list : m_lemma_list; // return lemma list directly, if used
	}

	/**
	 *	@brief reads list of terms
	 *
	 *	@param[in] b_verbose is verbose flag (presumably enables progress output)
	 *
	 *	@return Presumably returns true on success, false on failure - confirm.
	 *
	 *	@note Presumably this fills m_term_list and m_term_global_index_map - confirm.
	 */
	bool Get_TermList(bool b_verbose = false);

	/**
	 *	@brief maps terms to lemmas
	 *
	 *	The meanings of the parameters below are inferred from their names only,
	 *	they need to be confirmed against the implementation.
	 *
	 *	@param[in] p_s_lemma_file is name of file with lemmas
	 *	@param[in] n_min_term_frequency is minimal term frequency threshold
	 *	@param[in] n_min_lemma_frequency is minimal lemma frequency threshold
	 *	@param[in] b_remove_less_frequent_terms is flag for removal of terms below the threshold
	 *	@param[in] b_remove_lemma_less_terms is flag for removal of terms that have no lemma
	 *	@param[in] b_use_dummy_for_lemma_less_terms is flag for replacing terms
	 *		that have no lemma by a dummy term
	 *	@param[in] b_verbose is verbose flag
	 *
	 *	@return Presumably returns true on success, false on failure - confirm.
	 */
	bool Lemmatize(const char *p_s_lemma_file, size_t n_min_term_frequency, size_t n_min_lemma_frequency,
		bool b_remove_less_frequent_terms, bool b_remove_lemma_less_terms,
		bool b_use_dummy_for_lemma_less_terms, bool b_verbose = false);

	void Delete_LemmaList(); /**< the lemmas (the token texts) aren't normally needed for the processing, they take up some memory so it may be good thing to free them */

	/**
	 *	@brief gets number of documents
	 *	@return Returns the value of numDocs() of the CLucene index reader.
	 *	@note This dereferences m_p_index_reader without checking it for null.
	 */
	inline size_t n_Document_Num() const
	{
		return m_p_index_reader->numDocs();
	}

	/**
	 *	@brief retrieves a document
	 *
	 *	The meanings of the parameters below are inferred from their names only,
	 *	they need to be confirmed against the implementation.
	 *
	 *	@param[in] n_document is zero-based index of the document
	 *	@param[in] b_lemmatize is lemmatization flag
	 *	@param[out] r_term_vector is filled with ids of terms in the document
	 *	@param[out] r_s_doc_file is filled with file name of the document
	 *	@param[out] r_s_doc_modified is filled with modification date of the document
	 *
	 *	@return Presumably returns true on success, false on failure - confirm.
	 */
	bool Get_Document(size_t n_document, bool b_lemmatize, std::vector<uint32_t> &r_term_vector,
		wstring &r_s_doc_file, wstring &r_s_doc_modified);

	/**
	 *	@brief dumps term removal histogram
	 *	@note Presumably this prints m_t_docu_tr_hist - confirm against the implementation.
	 */
	void Dump_TermRemovalHistogram();

protected:
	/**
	 *	@brief function object, copying term ids to a vector while leaving one of the ids out
	 *
	 *	Each call with a term id different from n_remove_term_id writes the id through
	 *	the output iterator and advances it. No bounds checking is done, the destination
	 *	needs to have enough space. Converting the object to size_t yields
	 *	the number of term ids written so far.
	 *
	 *	@tparam n_remove_term_id is id of the term to be skipped
	 */
	template <const uint32_t n_remove_term_id>
	class CConditionalCopy {
	protected:
		std::vector<uint32_t>::iterator m_p_out_it; /**< position of the next write */
		std::vector<uint32_t>::iterator m_p_begin_it; /**< position of the first write */

	public:
		/**
		 *	@brief default constructor
		 *	@param[in] p_out_it is iterator to where the first term id is to be written
		 */
		inline CConditionalCopy(std::vector<uint32_t>::iterator p_out_it)
			:m_p_out_it(p_out_it), m_p_begin_it(p_out_it)
		{}

		/**
		 *	@brief copies a single term id, unless it is the removed one
		 *	@param[in] n_term_id is term id to be processed
		 */
		inline void operator ()(uint32_t n_term_id)
		{
			if(n_term_id == n_remove_term_id)
				return;
			// this one is filtered out

			*m_p_out_it ++ = n_term_id;
		}

		// note that a conversion to std::vector<uint32_t>::iterator (returning m_p_out_it)
		// was attempted here originally and did not work, hence the conversion to a count below

		/**
		 *	@brief gets number of copied term ids
		 *	@return Returns distance between the first and the next write position.
		 */
		inline operator size_t() const
		{
			return m_p_out_it - m_p_begin_it;
		}
	};

	/**
	 *	@brief comparison predicate for (term, frequency) pairs
	 *
	 *	@param[in] a is the first pair
	 *	@param[in] b is the second pair
	 *
	 *	@return Presumably returns true if frequency of a is higher than that of b
	 *		(for sorting by descending frequency) - confirm against the implementation.
	 *
	 *	@note NOTE(review): this is declared inline, but there is no definition in this
	 *		header; the definition needs to be visible wherever the function is used.
	 */
	static inline bool b_HigherTermFreq(const std::pair<wstring, size_t> &a, const std::pair<wstring, size_t> &b);

	/**
	 *	@brief trims whitespace from a string, in place
	 *	@param[in,out] r_s_string is the string to be trimmed (presumably at both ends - confirm)
	 */
	static void TrimSpaceW(wstring &r_s_string);

	/**
	 *	@brief reads a line from a file
	 *
	 *	@param[in] p_fr is the file to read from
	 *	@param[out] r_s_line is filled with the line
	 *	@param[in,out] r_n_cur_line is line counter (presumably incremented
	 *		with each line read - confirm against the implementation)
	 *
	 *	@return Presumably returns true on success, false on failure
	 *		or end of file - confirm against the implementation.
	 */
	static bool ReadLineW(FILE *p_fr, wstring &r_s_line, size_t &r_n_cur_line);
};

#endif //__LUCENE_READER_INCLUDED
