/*
								+---------------------------------+
								|                                 |
								| *** Lucene doc reader iface *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2010  |
								|                                 |
								|          DocReader.cpp          |
								|                                 |
								+---------------------------------+
*/

#include "../../UberLame_src/CallStack.h"
#include <stdio.h>
#include <algorithm>
#include <numeric>
#include "../../UberLame_src/Integer.h"
#include "../../UberLame_src/Dir.h" // PRIsizeB
#include "../../UberLame_src/StlUtils.h"
#include "../../UberLame_src/Timer.h"
#include "DocReader.h"

/*
 *								=== CLuceneIndexReaderInterface ===
 */

/**
 *	@brief opens a Lucene index for reading
 *
 *	@param[in] p_s_input_index is the index location, handed over
 *		to lucene::store::FSDirectory::getDirectory()
 *
 *	@note CLucene errors do not propagate out of the constructor; the message
 *		is written to stderr and the reader pointer is left null, which is
 *		what b_Status() reports.
 */
CLuceneIndexReaderInterface::CLuceneIndexReaderInterface(const char *p_s_input_index)
{
	m_p_index_reader = 0;
	// there is no reader until the index is opened successfully

	try {
		m_p_index_reader = lucene::index::MultiReader::open(
			lucene::store::FSDirectory::getDirectory(p_s_input_index, false));
	} catch(CLuceneError &r_error) {
		fprintf(stderr, "error: CLucene exception: %s\n", r_error.what());
		// the assignment above did not take place, the reader pointer is still null
	}
	// try to open the index

	m_t_docu_tr_hist.Reset();
	// start with an empty term removal histogram
}

/**
 *	@brief destructor; disposes of the index reader opened by the constructor
 *
 *	@note NOTE(review): the reader pointer is null if opening the index failed;
 *		_CLDELETE is a CLucene macro not visible in this file and is presumably
 *		null-safe - confirm.
 */
CLuceneIndexReaderInterface::~CLuceneIndexReaderInterface()
{
	_CLDELETE(m_p_index_reader);
}

/**
 *	@brief gets the state of the index reader
 *	@return Returns true if the reader pointer is not null (the constructor
 *		sets it to null when opening the index fails), otherwise returns false.
 */
bool CLuceneIndexReaderInterface::b_Status() const
{
	if(!m_p_index_reader)
		return false;
	// no reader, the index could not be opened

	return true;
}

/**
 *	@brief enumerates terms of the "contents" field, sorts them by frequency
 *		and builds the map of term text to term id
 *
 *	Fills m_term_list with (term text, frequency) pairs, where the frequency is the sum
 *	of the in-document frequencies over all the documents (saturated at SIZE_MAX).
 *	The list is stable-sorted by decreasing frequency and the position of a term
 *	in the sorted list becomes its id, stored in m_term_global_index_map.
 *
 *	@param[in] b_verbose enables progress output to stdout
 *
 *	@return Returns true on success, false on failure (the index is not open,
 *		not enough memory, or there are more than term_MaxTermId terms).
 *
 *	@note The map is keyed by the c_str() pointers of the strings stored in m_term_list.
 *		NOTE(review): this presumably requires m_term_list to stay unmodified for
 *		as long as the map is in use - confirm against CWideCStringIndexMap.
 */
bool CLuceneIndexReaderInterface::Get_TermList(bool b_verbose)
{
	m_term_list.clear();
	m_term_global_index_map.clear();

	if(!m_p_index_reader)
		return false;
	// the index failed to open (b_Status() returns false), there are no terms to read

	if(b_verbose)
		printf("getting terms ...\n");

	CTimer timer; // verbose/perf timer
	double f_next_verbose_time = 1;

#ifdef _DEBUG
	bool b_dont_expecting_more_terms = false;
#endif //_DEBUG
	lucene::index::TermEnum *p_term_enum = m_p_index_reader->terms();
	while(p_term_enum->next()) {
		lucene::index::Term *p_term = p_term_enum->term(); // @note do not free this (original author's note)
		const TCHAR *p_s_field = p_term->field();

		if(wcscmp(p_s_field, L"contents")) {
#ifdef _DEBUG
			b_dont_expecting_more_terms = true;
			continue;
			// debug builds keep enumerating so that the assertion below can
			// verify that no "contents" term follows a term of another field
#else //_DEBUG
			break;
			// release builds stop at the first term of another field
#endif //_DEBUG
		}
		// just want terms for "contents"

		_ASSERTE(!b_dont_expecting_more_terms); // if(b_dont_expecting_more_terms) fprintf(stderr, "warning: terms for individual fields aren't stored in contiguous blocks!\n");
		// see if it's possible to just break once "contents" terms stop ... that would save a lot of time

		size_t n_term_freq = 0;
		{
			lucene::index::TermDocs *p_term_docs = m_p_index_reader->termDocs(p_term);
			while(p_term_docs->next()) {
				uint32_t n_doc_freq = unsigned(p_term_docs->freq());
				if(n_term_freq >= SIZE_MAX - n_doc_freq)
					n_term_freq = SIZE_MAX; // saturate
				else
					n_term_freq += n_doc_freq;
			}
			_CLDELETE(p_term_docs);
		}
		// calculate term frequency across all the documents

		const TCHAR *p_s_text = p_term->text();

		try {
			m_term_list.push_back(std::make_pair(wstring(p_s_text), n_term_freq));
		} catch(std::bad_alloc&) {
			p_term_enum->close();
			_CLDELETE(p_term_enum);
			// do not leak the enumerator on the error path

			return false;
		}
		// add the term and its frequency to the list

		if(b_verbose && !(m_term_list.size() & 1023) && timer.f_Time() > f_next_verbose_time) {
			f_next_verbose_time = timer.f_Time() + 1;
			printf("%lu\r", static_cast<unsigned long>(m_term_list.size()));
		}
		// verbose (at most once per second, only checked every 1024 terms)
	}
	p_term_enum->close();
	_CLDELETE(p_term_enum);
	// go through the terms of the index

	if(b_verbose) {
		printf("%16s\rdone. ", "");
		printf("there is %lu terms (" PRIsizeB "B assuming 1024 long float vectors)\n",
			static_cast<unsigned long>(m_term_list.size()),
			PRIsizeBparams(uint64_t(m_term_list.size()) * 1024 * sizeof(float)));

		printf("sorting the terms by frequency ...\n");
	}
	// verbose

	try {
		std::stable_sort(m_term_list.begin(), m_term_list.end(), b_HigherTermFreq);
	} catch(std::bad_alloc&) {
		return false;
	}
	// sort the terms by frequency (max freq first)

	if(m_term_list.size() > term_MaxTermId)
		return false;
	// number of terms must not exceed term_MaxTermId

	if(b_verbose) {
		if(m_term_list.size() > 2) {
			wprintf(L"the first three term frequencies are %lu (\'%ls\'), %lu (\'%ls\'), %lu (\'%ls\')\n",
				static_cast<unsigned long>(m_term_list[0].second), m_term_list[0].first.c_str(),
				static_cast<unsigned long>(m_term_list[1].second), m_term_list[1].first.c_str(),
				static_cast<unsigned long>(m_term_list[2].second), m_term_list[2].first.c_str());
			// size_t is not an int and %s means a narrow string in standard wprintf(),
			// hence the explicit casts and %ls
		}
		printf("building global terms index map ...\n");
	}

	try {
		for(size_t i = 0, n = m_term_list.size(); i < n; ++ i) {
			m_term_global_index_map[m_term_list[i].first.c_str()] = i;
			// the id of a term is its position in the sorted list

			if(b_verbose && !(i & 1023) && timer.f_Time() > f_next_verbose_time) {
				f_next_verbose_time = timer.f_Time() + 1;
				printf("%lu\r", static_cast<unsigned long>(i + 1));
			}
			// verbose
		}
	} catch(std::bad_alloc&) {
		return false;
	}
	// build global index map

	return true;
}

bool CLuceneIndexReaderInterface::Lemmatize(const char *p_s_lemma_file,
	size_t n_min_term_frequency, size_t n_min_lemma_frequency,
	bool b_remove_less_frequent_terms, bool b_remove_lemma_less_terms,
	bool b_use_dummy_for_lemma_less_terms, bool b_verbose)
{
	m_lemma_list.clear();
	m_lemmatized_term_id_list.clear();

	CTimer timer; // verbose/perf timer
	double f_next_verbose_time = 1;
	// verbose

	size_t n_removed_terms = 0;
	size_t n_lemma_miss_rate = 0;
	// stats

	{
#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
		stdext::hash_map<wstring, size_t> lemma_map, rev_lemma_map;
#else //__LUCENE_READER_USE_HASH_CONTAINERS
		std::map<wstring, size_t> lemma_map, rev_lemma_map;
#endif //__LUCENE_READER_USE_HASH_CONTAINERS

		if(p_s_lemma_file) {
			FILE *p_fr;
			if(!(p_fr = fopen(p_s_lemma_file, "rb")))
				return false;

			if(b_verbose)
				printf("reading mapping file ...\n");

			wstring s_line, s_lemma;
			size_t n_line = 0;
			while(ReadLineW(p_fr, s_line, n_line)) {
				size_t n_pos;
				if((n_pos = s_line.find(';')) == std::string::npos) {
					fprintf(stderr, "warning: there is no semicolon on line %d\n", n_line);
					continue;
				}
				if(s_line.find(';', n_pos + 1) != std::string::npos)
					fprintf(stderr, "warning: there is more than one semicolon on line %d\n", n_line);
				// find semicolon

				try {
					s_lemma.erase();
					s_lemma.insert(s_lemma.begin(), s_line.begin() + n_pos + 1, s_line.end());
					s_line.erase(n_pos);
					// split line to term/lemma

					const wstring &s_term = s_line; // the line contains the term
					// just a different name for the sake of clarity

					if(m_term_global_index_map.find(s_term.c_str()) == m_term_global_index_map.end())
						continue;
					// do not load lemmas that wont be referenced anyway

#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
					stdext::hash_map<wstring, size_t>::iterator p_lemma_it;
#else //__LUCENE_READER_USE_HASH_CONTAINERS
					std::map<wstring, size_t>::iterator p_lemma_it;
#endif //__LUCENE_READER_USE_HASH_CONTAINERS
					if((p_lemma_it = rev_lemma_map.find(s_lemma)) == rev_lemma_map.end()) {
						size_t n_new_lemma_id = m_lemma_list.size();
						if(n_new_lemma_id >= term_MaxTermId) {
							fclose(p_fr);
							return false;
						}
						lemma_map.insert(std::make_pair(s_term, n_new_lemma_id));
						rev_lemma_map.insert(std::make_pair(s_lemma, n_new_lemma_id));
						m_lemma_list.push_back(std::make_pair(s_lemma, size_t(0)));
					} else
						lemma_map.insert(std::make_pair(s_term, (*p_lemma_it).second));
					// use reverse map to detect whether there is such a lemma
					// (a simple but stupid fileformat designed we ...)
				} catch(std::bad_alloc&) {
					fclose(p_fr);
					return false;
				}
				// add to the list/map

				if(b_verbose && !(n_line & 1023) && timer.f_Time() > f_next_verbose_time) {
					f_next_verbose_time = timer.f_Time() + 1;
					printf("%d\r", lemma_map.size());
				}
				// verbose
			}

			if(ferror(p_fr)) {
				fclose(p_fr);
				return false;
			}
			fclose(p_fr);

			if(b_verbose)
				printf("done. have %d lemmas associated with %d terms\n", m_lemma_list.size(), lemma_map.size());
		}
		// load lemma list

		if(!stl_ut::Resize_To_N(m_lemmatized_term_id_list, m_term_list.size()))
			return false;
		// alloc term_id list

		if(b_verbose)
			printf("filtering terms ...\n");

#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
		const stdext::hash_map<wstring, size_t>::const_iterator p_lmap_end_it = lemma_map.end();
#else //__LUCENE_READER_USE_HASH_CONTAINERS
		const std::map<wstring, size_t>::const_iterator p_lmap_end_it = lemma_map.end();
#endif //__LUCENE_READER_USE_HASH_CONTAINERS
		for(size_t i = 0, n = m_term_list.size(); i < n; ++ i) {
			if(m_term_list[i].second < n_min_term_frequency) {
				m_lemmatized_term_id_list[i] = (b_remove_less_frequent_terms)? term_RemoveOp : term_Dummy;
				// the term doesn't have sufficient frequency

				++ n_removed_terms;
			} else if(p_s_lemma_file) {
#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
				stdext::hash_map<wstring, size_t>::const_iterator p_lemma_it;
#else //__LUCENE_READER_USE_HASH_CONTAINERS
				std::map<wstring, size_t>::const_iterator p_lemma_it;
#endif //__LUCENE_READER_USE_HASH_CONTAINERS
				if((p_lemma_it = lemma_map.find(m_term_list[i].first)) != lemma_map.end()) {
					size_t n_lemma_index = (*p_lemma_it).second;
					// get index of the lemma

					_ASSERTE(n_lemma_index < term_MaxTermId);
					_ASSERTE(n_lemma_index < UINT32_MAX);
					m_lemmatized_term_id_list[i] = uint32_t(n_lemma_index);
					// use the lemma

					m_lemma_list[n_lemma_index].second += m_term_list[i].second;
					// calculate lemma reference frequency
				} else if(b_remove_lemma_less_terms) {
					m_lemmatized_term_id_list[i] = term_RemoveOp;
					// mark the term for removal

					++ n_lemma_miss_rate;
					// stats
				} else if(b_use_dummy_for_lemma_less_terms) {
					m_lemmatized_term_id_list[i] = term_Dummy;
					// mark the term for removal

					++ n_lemma_miss_rate;
					// stats
				} else {
					const wstring &s_lemma = m_term_list[i].first; // the term will be it's own lemma
					const wstring &s_term = m_term_list[i].first;
					// just different names for the sake of clarity

					size_t n_lemma_index;

#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
					stdext::hash_map<wstring, size_t>::iterator p_lemma_it;
#else //__LUCENE_READER_USE_HASH_CONTAINERS
					std::map<wstring, size_t>::iterator p_lemma_it;
#endif //__LUCENE_READER_USE_HASH_CONTAINERS
					if((p_lemma_it = rev_lemma_map.find(s_lemma)) == rev_lemma_map.end()) {
						size_t n_new_lemma_id = m_lemma_list.size();
						if(n_new_lemma_id >= term_MaxTermId)
							return false;
						n_lemma_index = n_new_lemma_id; // ...
						lemma_map.insert(std::make_pair(s_term, n_new_lemma_id));
						rev_lemma_map.insert(std::make_pair(s_lemma, n_new_lemma_id));
						m_lemma_list.push_back(std::make_pair(s_lemma, size_t(0)));
					} else {
						n_lemma_index = (*p_lemma_it).second; // ...
						lemma_map.insert(std::make_pair(s_term, (*p_lemma_it).second));
					}
					// add term to the lemma map as it's own lemma
					// use reverse map to detect whether there is such a lemma
					// (a simple but stupid fileformat designed we ...)

					_ASSERTE(n_lemma_index < term_MaxTermId);
					_ASSERTE(n_lemma_index < UINT32_MAX);
					m_lemmatized_term_id_list[i] = uint32_t(n_lemma_index);
					// use the new lemma

					++ n_lemma_miss_rate;
					// stats
				}
				// find lemma for the term
			} else {
				size_t n_lemma_index = m_lemma_list.size();
				_ASSERTE(n_lemma_index < term_MaxTermId); // this should be checked when getting list of terms
				_ASSERTE(n_lemma_index < UINT32_MAX);
				m_lemmatized_term_id_list[i] = uint32_t(n_lemma_index);
				try {
					m_lemma_list.push_back(m_term_list[i]); // involves copying of the strings
				} catch(std::bad_alloc&) {
					return false;
				}
				// just use term id then; filter used terms to lemma list
			}

			if(b_verbose && !(i & 1023) && timer.f_Time() > f_next_verbose_time) {
				f_next_verbose_time = timer.f_Time() + 1;
				printf("%d\r", i + 1);
			}
			// verbose
		}
		// find the lemma indices ...
	}
	// braces limit lifetime of lemma_map (no longer needed)

	size_t n_removed_lemmas = 0;
	size_t n_shifted_lemmas = 0;
	// stats

	if(b_verbose)
		printf("purging terms ...\n");

	if(!m_lemma_list.empty()) {
		std::vector<uint32_t> index_correction_vector;
		{
			std::vector<std::pair<wstring, size_t> > new_lemma_list;
			if(!stl_ut::Resize_To_N(index_correction_vector, m_lemma_list.size()) ||
			   !stl_ut::Resize_To_N(new_lemma_list, m_lemma_list.size()))
				return false;
			// will contain a new index table // @todo-need to reverse order of the loop below

			size_t n_lemma_index = 0;
			for(size_t i = 0, n = m_lemma_list.size(); i < n; ++ i) {
				if(m_lemma_list[i/*n_lemma_index*/].second < n_min_lemma_frequency) {
					index_correction_vector[i] = (b_remove_less_frequent_terms)? term_RemoveOp : term_Dummy;
					// use dummy / the remove opcode instead of this lemma

					++ n_removed_lemmas;
					// stats
				} else {
					_ASSERTE(n_lemma_index < term_MaxTermId);
					_ASSERTE(n_lemma_index < UINT32_MAX);
					index_correction_vector[i] = uint32_t(n_lemma_index);
					// store a new index ...

					/*_ASSERTE(new_lemma_list.capacity() >= n_lemma_index + 1);
					_ASSERTE(new_lemma_list.size() == n_lemma_index);
					//try {
						new_lemma_list.resize(n_lemma_index + 1);*/ // do not resize at runtime
					/*} catch(std::bad_alloc&) {
						return false; // just in case
					}*/
					_ASSERTE(new_lemma_list.size() > n_lemma_index);
					std::pair<wstring, size_t> &r_src = m_lemma_list[i]; // not const, we're using swap()
					std::pair<wstring, size_t> &r_dest = new_lemma_list[n_lemma_index];
					r_dest.first.swap(r_src.first);
					r_dest.second = r_src.second;
					// filter lemmas to the new list, without reallocating
					
					++ n_lemma_index;
					// update lemma indexing ...

					++ n_shifted_lemmas;
					// stats
				}

				if(b_verbose && !(i & 1023) && timer.f_Time() > f_next_verbose_time) {
					f_next_verbose_time = timer.f_Time() + 1;
					printf("%d\r", i + 1);
				}
				// verbose
			}
			// remove less-referenced lemmas, prepare reindexing table

			new_lemma_list.resize(n_lemma_index);
			new_lemma_list.swap(m_lemma_list);
			// put the new lemma list in place
		}
		// limit lifetime of new_lemma_list (containing the old m_lemma_list at the end)

		if(b_verbose)
			printf("remapping terms ...\n");

		for(size_t i = 0, n = m_lemmatized_term_id_list.size(); i < n; ++ i) {
			uint32_t n_lemma_id = m_lemmatized_term_id_list[i];
			// get lemma id ...

			if(n_lemma_id == term_Dummy || n_lemma_id == term_RemoveOp)
				continue;
			// skip specials

			_ASSERTE(n_lemma_id < index_correction_vector.size());
			m_lemmatized_term_id_list[i] = index_correction_vector[n_lemma_id];
			// get a new lemma id from the reindexing table

			if(b_verbose && !(i & 1023) && timer.f_Time() > f_next_verbose_time) {
				f_next_verbose_time = timer.f_Time() + 1;
				printf("%d\r", i + 1);
			}
			// verbose
		}
		// update the lemmatized term id list
	}
	// remove the less-frequent lemmas

	if(b_verbose) {
		if(!p_s_lemma_file)
			n_removed_lemmas += n_removed_terms;
		printf("done. lemma miss rate: %d, removed terms: %d, shifted terms: %d, remaining terms: %d\n",
			n_lemma_miss_rate, n_removed_lemmas, n_shifted_lemmas, m_lemma_list.size());
	}
	// verbose

	if(b_verbose) {
		if(m_lemmatized_term_id_list.size() > 2) {
			wprintf(L"the first three term frequencies are %d (\'%s\'), %d (\'%s\'), %d (\'%s\')\n",
				m_lemma_list[m_lemmatized_term_id_list[0]].second, m_lemma_list[m_lemmatized_term_id_list[0]].first.c_str(),
				m_lemma_list[m_lemmatized_term_id_list[1]].second, m_lemma_list[m_lemmatized_term_id_list[1]].first.c_str(),
				m_lemma_list[m_lemmatized_term_id_list[2]].second, m_lemma_list[m_lemmatized_term_id_list[2]].first.c_str());
		}
	}

	return true;
}

/**
 *	@brief frees the list of lemmas (the lemma texts and their frequencies)
 *
 *	The list is swapped with an empty temporary, so that the storage
 *	is handed over to the temporary and released together with it.
 *
 *	@note NOTE(review): Get_Document() consults m_lemma_list; verify how it treats
 *		an empty list before calling this while documents are still being read.
 */
void CLuceneIndexReaderInterface::Delete_LemmaList()
{
	std::vector<std::pair<wstring, size_t> >().swap(m_lemma_list);
	// the temporary takes over the contents and is destroyed at the end of the statement
}

bool CLuceneIndexReaderInterface::Get_Document(size_t n_document, bool b_lemmatize,
	std::vector<uint32_t> &r_term_vector, wstring &r_s_doc_file, wstring &r_s_doc_modified)
{
	b_lemmatize = b_lemmatize && !m_lemma_list.empty();
	// can only lemmatize if the lemmas were actually loaded

	_ASSERTE(n_document <= INT32_MAX);
	lucene::document::Document *p_document = _CLNEW lucene::document::Document();
	if(!m_p_index_reader->document(int32_t(n_document), p_document)) {
		_CLDELETE(p_document);
		return false;
	}
	// get the document

	lucene::document::Field *p_path = p_document->getField(_T("path")); // @note do not free this
	lucene::document::Field *p_date = p_document->getField(_T("modified")); // @note do not free this
#ifdef __LUCENE_READER_EXPECT_GIGAWORD_DOCS
	lucene::document::Field *p_gw_id = p_document->getField(_T("gw-id")); // @note do not free this
	if(!p_path || !p_gw_id || !p_date) {
#else //__LUCENE_READER_EXPECT_GIGAWORD_DOCS
	if(!p_path || !p_date) {
#endif //__LUCENE_READER_EXPECT_GIGAWORD_DOCS
		_CLDELETE(p_document);
		return false;
	}
	// get document fields

	if(!stl_ut::AssignWCStr(r_s_doc_file, p_path->stringValue()) ||
#ifdef __LUCENE_READER_EXPECT_GIGAWORD_DOCS
	   !stl_ut::AppendWCStr(r_s_doc_file, L"#") ||
	   !stl_ut::AppendWCStr(r_s_doc_file, p_gw_id->stringValue()) ||
#endif //__LUCENE_READER_EXPECT_GIGAWORD_DOCS
	   !stl_ut::AssignWCStr(r_s_doc_modified, p_date->stringValue())) {
		_CLDELETE(p_document);
		return false;
	}
	// copy document fields

	lucene::index::TermPositionVector *p_term_pos;
	_ASSERTE(n_document <= INT32_MAX);
	if(!(p_term_pos = (m_p_index_reader->getTermFreqVector(int32_t(n_document), _T("contents")))->__asTermPositionVector())) {
		_CLDELETE(p_document);
		return false;
	}
	// get term position vector

	const Array<int32_t> &term_frequencies = *p_term_pos->getTermFrequencies(); // @note do not free this
	size_t n_word_num = term_frequencies.length;
	size_t n_document_length = std::accumulate(term_frequencies.values,
		term_frequencies.values + term_frequencies.length, size_t(0));
	// calculate document length

	bool b_have_remove_terms = false;
	{
		const TCHAR **p_term_name_list = p_term_pos->getTerms(); // @note do not free this
		// get local terms

		try {
			m_document_term_list.resize(n_word_num);
		} catch(std::bad_alloc&) {
			_CLDELETE(p_term_pos);
			_CLDELETE(p_document);
			return false;
		}
		const CWideCStringIndexMap::const_iterator p_end_it = m_term_global_index_map.end();
		for(size_t i = 0; i < n_word_num; ++ i) {
			_ASSERTE(p_term_pos->indexOf(p_term_name_list[i]) == i); // this returns local index (useless), it should match i (it needs to)
			CWideCStringIndexMap::const_iterator p_hash_it = m_term_global_index_map.find(p_term_name_list[i]);
			if(p_hash_it == p_end_it) {
				_CLDELETE(p_term_pos);
				_CLDELETE(p_document);
				fprintf(stderr, "error: document introduces unknown term\n");
				return false;
			}
			size_t n_term_id = (*p_hash_it).second; // contains the term id
			_ASSERTE(n_term_id < term_MaxTermId); // !!
			if(b_lemmatize) {
				_ASSERTE(n_term_id < m_lemmatized_term_id_list.size());
				n_term_id = m_lemmatized_term_id_list[n_term_id];
				if(n_term_id == term_RemoveOp)
					b_have_remove_terms = true;
				m_document_term_list[i] = uint32_t(n_term_id);
			} else
				m_document_term_list[i] = uint32_t(n_term_id);
		}
		// alloc and fill the vector (using hashmap)
	}
	// find global indices

	{
		try {
#ifdef _DEBUG
			r_term_vector.clear(); // !!
			r_term_vector.resize(n_document_length, -1);
#else //_DEBUG
			r_term_vector.resize(n_document_length); // don't need to explicitly initialize, it's just for debugging
#endif //_DEBUG
		} catch(std::bad_alloc&) {
			_CLDELETE(p_term_pos);
			_CLDELETE(p_document);
			return false;
		}
		// allocate term index vector

		for(size_t i = 0, n = p_term_pos->size(); i < n; ++ i) {
			_ASSERTE(i <= INT32_MAX);
			const Array<int32_t> &term_positions = *p_term_pos->getTermPositions(int32_t(i)); // @note do not free this

			/*bool b_is_sorted = true;
			for(size_t j = 1, m = term_positions.length; j < m; ++ j) {
				if(term_positions[j - 1] > term_positions[j]) {
					b_is_sorted = false;
					break;
				}
			}
			if(!b_is_sorted) {
				_CLDELETE(p_term_pos);
				_CLDELETE(p_document);
				fprintf(stderr, "error: term positions aren't sorted\n");
				return false;
			}*/ // just checked this for some crazy idea with chunking

			for(size_t j = 0, m = term_positions.length; j < m; ++ j) {
				_ASSERTE(term_positions[j] >= 0 && unsigned(term_positions[j]) < n_document_length); // make sure position is within document
				_ASSERTE(r_term_vector[term_positions[j]] == uint32_t(-1)); // make sure positions do not collide
				r_term_vector[term_positions[j]] = m_document_term_list[i];
			}
			// fill the positions
		}
	}
	// fill terms in document term array

	if(b_have_remove_terms) {
		try {
			m_temp_term_vector.resize(n_document_length);
		} catch(std::bad_alloc&) {
			_CLDELETE(p_term_pos);
			_CLDELETE(p_document);
			return false;
		}
		// allocate term index vector

		size_t n_new_size = std::for_each(r_term_vector.begin(),
			r_term_vector.end(), CConditionalCopy<term_RemoveOp>(m_temp_term_vector.begin()));
		_ASSERTE(n_new_size < n_document_length); // < because b_have_remove_terms
		m_temp_term_vector.resize(n_new_size);
		// transform and erase the unused end of the vector

		size_t n_shorten = r_term_vector.size() - m_temp_term_vector.size();
		m_t_docu_tr_hist.Count(int(n_shorten));
		// debug

		m_temp_term_vector.swap(r_term_vector);
		// swap it with the output
	} else
		m_t_docu_tr_hist.Count(0); // !!

	_CLDELETE(p_term_pos);
	_CLDELETE(p_document);

	return true;
}

void CLuceneIndexReaderInterface::Dump_TermRemovalHistogram()
{
	m_t_docu_tr_hist.Print();
}

/**
 *	@brief ordering predicate for (term text, frequency) pairs; greater frequency goes first
 *
 *	@param[in] a is the first pair
 *	@param[in] b is the second pair
 *
 *	@return Returns true if the frequency of a is strictly greater than the frequency of b
 *		(the texts are not compared), otherwise returns false.
 */
bool CLuceneIndexReaderInterface::b_HigherTermFreq(const std::pair<wstring, size_t> &a, const std::pair<wstring, size_t> &b)
{
	return b.second < a.second;
}

/**
 *	@brief removes whitespace from the beginning and from the end of a string, in place
 *
 *	@param[in,out] r_s_string is the string to be trimmed
 *
 *	@note Only characters with codes below 255 are ever passed to isspace(),
 *		characters with greater codes are never treated as whitespace
 *		(isspace triggers an assertion in vs2008 if over 255).
 */
void CLuceneIndexReaderInterface::TrimSpaceW(wstring &r_s_string)
{
	size_t n_end = r_s_string.length();
	for(; n_end > 0; -- n_end) {
		const wstring::value_type n_last = r_s_string[n_end - 1];
		if(!(n_last < 255 && isspace(n_last)))
			break;
	}
	// find one past the last character to keep

	size_t n_begin = 0;
	for(; n_begin < n_end; ++ n_begin) {
		const wstring::value_type n_first = r_s_string[n_begin];
		if(!(n_first < 255 && isspace(n_first)))
			break;
	}
	// find the first character to keep

	r_s_string.erase(n_end);
	r_s_string.erase(0, n_begin);
	// cut the end first, so that the begin index stays valid
}

bool CLuceneIndexReaderInterface::ReadLineW(FILE *p_fr, wstring &r_s_line, size_t &r_n_cur_line)
{
	while(!feof(p_fr)) {
		r_s_line.erase();
		try {
			for(int c = fgetc(p_fr); c != EOF; c = fgetc(p_fr)) {
				unsigned short n_char = c;
				c = fgetc(p_fr);
				if(c == EOF)
					return false; // this is error actually
				n_char |= c << 8;

				if(n_char == '\n')
					break;

				r_s_line += n_char;
			}
		} catch(std::bad_alloc&) {
			return false;
		}
		// read line

		++ r_n_cur_line;
		// line counter for file debugging

		/*if(r_s_line.find('#') != std::string::npos)
			r_s_line.erase(r_s_line.find('#'));*/
		// throw away line comments

		TrimSpaceW(r_s_line);
		// throw away begin / end whitespace

		if(!r_s_line.length())
			continue;
		// skip empty lines

		return true;
	}

	return false;
}

/*
 *								=== ~CLuceneIndexReaderInterface ===
 */
