/**
 *	@file WriteTermVectors/Main.cpp
 *	@author -tHE SWINe-
 *	@brief Utility for extracting plain term vectors from CLucene positional index.
 *	@date 2010-07-17
 */

#include <../../UberLame_src/CallStack.h>
#include <stdio.h>
#include <ctype.h>
#include <wctype.h>
#include <numeric>
#include <hash_map>
#include <../../UberLame_src/Dir.h>
#include <../../UberLame_src/StlUtils.h>
#include <../../UberLame_src/Timer.h>

#include <CLucene.h>

/**
 *	@def __LUCENE_READER_USE_HASH_CONTAINERS
 *	@brief decides whether to use stdext::hash_map and stdext::hash_set
 *		instead of std::map and std::set
 */
//#define __LUCENE_READER_USE_HASH_CONTAINERS

/**
 *	@brief hashing and ordering functor for null-terminated wide strings
 *
 *	The unary function call operator calculates hash of a string, the binary
 *	one is a strict ordering of two strings (so the same object can be used
 *	as a comparator as well).
 */
class CWideCStringHasher {
public:
	/**
	 *	@brief container parameters, stored as enum
	 *
	 *	@note These are presumably the constants stdext::hash_map expects
	 *		to find in its traits class - confirm against the hash_map documentation.
	 */
	enum {
		bucket_size = 256,
		min_buckets = 16384
	};

	/**
	 *	@brief default constructor; there is no state to initialize
	 */
	CWideCStringHasher()
	{}

	/**
	 *	@brief calculates hash of a wide string
	 *
	 *	The hash starts at 13; for every character it is multiplied
	 *	by 23 and the character code is added (size_t arithmetic, wraps around).
	 *
	 *	@param[in] p_key is null-terminated wide string (key in hash-map)
	 *
	 *	@return Returns hash of p_key.
	 */
	size_t operator ()(const wchar_t *p_key) const
	{
		size_t n_hash = 13;
		while(*p_key) {
			n_hash = n_hash * 23 + *p_key;
			++ p_key;
		}
		return n_hash;
	}

	/**
	 *	@brief less-than comparison of two wide strings
	 *
	 *	@param[in] p_left is null-terminated wide string
	 *	@param[in] p_right is null-terminated wide string
	 *
	 *	@return Returns true if p_left compares lower than p_right
	 *		(using wcscmp()), otherwise returns false.
	 */
	bool operator ()(const wchar_t *p_left, const wchar_t *p_right) const
	{
		const int n_order = wcscmp(p_left, p_right);
		return n_order < 0;
	}
};

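/**
 *	@brief map with wide null-terminated string keys and size_t values
 *		(a hash-map if __LUCENE_READER_USE_HASH_CONTAINERS is defined, otherwise std::map)
 *
 *	@note Only the string pointers are stored as keys, the strings themselves must outlive the map.
 */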
#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
typedef stdext::hash_map<const wchar_t*, size_t, CWideCStringHasher> CWideCStringIndexHashMap;
#else //__LUCENE_READER_USE_HASH_CONTAINERS
typedef std::map<const wchar_t*, size_t, CWideCStringHasher> CWideCStringIndexHashMap;
#endif //__LUCENE_READER_USE_HASH_CONTAINERS

/**
 *	@brief writes wide string to a file
 *
 *	The string is stored as a 32-bit number of characters, followed by
 *	the characters themselves (sizeof(wchar_t) bytes each, no terminating null).
 *
 *	@param[in] r_s_str is string to be written
 *	@param[in] p_fw is output file
 *
 *	@return Returns true on success, false on failure (i/o error,
 *		or the string is too long for the 32-bit length prefix).
 */
bool WriteWString(const std::basic_string<wchar_t> &r_s_str, FILE *p_fw)
{
	if(r_s_str.length() >= UINT32_MAX)
		return false;
	// the length must fit the 32-bit prefix; this is checked in release builds
	// as well, otherwise a truncated length would silently corrupt the file

	const uint32_t n_length = uint32_t(r_s_str.length());
	return fwrite(&n_length, sizeof(uint32_t), 1, p_fw) == 1 &&
		fwrite(r_s_str.data(), sizeof(wchar_t), n_length, p_fw) == n_length;
}

/**
 *	@brief writes wide null-terminated string to the file
 *
 *	The string is stored as a 32-bit number of characters, followed by
 *	the characters themselves (sizeof(wchar_t) bytes each, no terminating null).
 *
 *	@param[in] p_s_str is string to be written
 *	@param[in] p_fw is output file
 *
 *	@return Returns true on success, false on failure (i/o error, p_s_str is null,
 *		or the string is too long for the 32-bit length prefix).
 */
bool WriteWCString(const wchar_t *p_s_str, FILE *p_fw)
{
	if(!p_s_str)
		return false;
	// nothing to measure or write

	const size_t n_char_num = wcslen(p_s_str);
	// measure the string only once

	if(n_char_num >= UINT32_MAX)
		return false;
	// the length must fit the 32-bit prefix (checked in release builds as well)

	const uint32_t n_length = uint32_t(n_char_num);
	return fwrite(&n_length, sizeof(uint32_t), 1, p_fw) == 1 &&
		fwrite(p_s_str, sizeof(wchar_t), n_length, p_fw) == n_length;
}

/**
 *	@brief writes two wide null-terminated strings, connected by a separator to the file
 *
 *	The result is stored as a single string: a 32-bit total number of characters,
 *	followed by the characters of p_s_str, p_s_sep and p_s_str2 (sizeof(wchar_t)
 *	bytes each, no terminating null).
 *
 *	@param[in] p_s_str is the first string to be written
 *	@param[in] p_s_sep is the string separator (can be null, meaning no separator)
 *	@param[in] p_s_str2 is the second string to be written
 *	@param[in] p_fw is output file
 *
 *	@return Returns true on success, false on failure (i/o error, p_s_str or p_s_str2
 *		is null, or the total length does not fit the 32-bit length prefix).
 */
bool WriteWCString2(const wchar_t *p_s_str, const wchar_t *p_s_sep, const wchar_t *p_s_str2, FILE *p_fw)
{
	if(!p_s_str || !p_s_str2)
		return false;
	if(!p_s_sep)
		p_s_sep = L"";
	// only the separator is optional

	const size_t n_first_length = wcslen(p_s_str);
	const size_t n_sep_length = wcslen(p_s_sep);
	const size_t n_second_length = wcslen(p_s_str2);
	// measure each string only once

	if(n_first_length >= UINT32_MAX ||
	   n_sep_length >= UINT32_MAX - n_first_length ||
	   n_second_length >= UINT32_MAX - n_first_length - n_sep_length)
		return false;
	// make sure the sum is below UINT32_MAX, without overflowing while checking
	// (checked in release builds as well, a truncated length would corrupt the file)

	const uint32_t n_length = uint32_t(n_first_length + n_sep_length + n_second_length);
	return fwrite(&n_length, sizeof(uint32_t), 1, p_fw) == 1 &&
		fwrite(p_s_str, sizeof(wchar_t), n_first_length, p_fw) == n_first_length &&
		fwrite(p_s_sep, sizeof(wchar_t), n_sep_length, p_fw) == n_sep_length &&
		fwrite(p_s_str2, sizeof(wchar_t), n_second_length, p_fw) == n_second_length;
}

/**
 *	@brief whitespace test for narrow characters (helper for TrimSpace())
 *
 *	@param[in] n_char is the character to be tested
 *
 *	@return Returns true if n_char is whitespace (as decided by isspace()), otherwise returns false.
 */
static inline bool TrimSpace_IsSpace(char n_char)
{
	return isspace((unsigned char)n_char) != 0;
	// isspace() is only defined for unsigned char values (and EOF), plain char may be negative
}

/**
 *	@brief whitespace test for wide characters (helper for TrimSpace())
 *
 *	@param[in] n_char is the character to be tested
 *
 *	@return Returns true if n_char is whitespace (as decided by iswspace()), otherwise returns false.
 */
static inline bool TrimSpace_IsSpace(wchar_t n_char)
{
	return iswspace(n_char) != 0;
	// isspace() must not be called with wide character codes (undefined above 255)
}

/**
 *	@brief removes leading and trailing whitespace from a string, in-place
 *
 *	@tparam CString is string type (std::basic_string of char or of wchar_t)
 *
 *	@param[in,out] r_s_string is the string to be trimmed
 *
 *	@note Whitespace is decided by isspace() for narrow and by iswspace()
 *		for wide strings; both depend on the current C locale.
 */
template <class CString>
void TrimSpace(CString &r_s_string)
{
	size_t b = 0, e = r_s_string.length();
	while(e > 0 && TrimSpace_IsSpace(r_s_string[e - 1]))
		-- e;
	// find the end of the last non-whitespace character

	while(b < e && TrimSpace_IsSpace(r_s_string[b]))
		++ b;
	// find the first non-whitespace character

	r_s_string.erase(e);
	r_s_string.erase(0, b);
	// erase the tail first so that b stays valid
}

/**
 *	@brief reads the next non-empty line from a text file with 16-bit characters
 *
 *	The characters are assembled from pairs of bytes, low byte first
 *	(no byte order mark handling, no surrogate pairs decoding). Lines are
 *	terminated by '\n'; whitespace at both ends of the line is removed
 *	using TrimSpace() and lines which are then empty are skipped.
 *
 *	@param[in] p_fr is input file (should be opened in binary mode,
 *		the bytes are interpreted here)
 *	@param[out] r_s_line is filled with the line (contents are undefined on failure)
 *	@param[in,out] r_n_cur_line is line counter; incremented for each line
 *		read, including the skipped empty ones
 *
 *	@return Returns true if a non-empty line was read. Returns false at the end
 *		of file, on read error, if the file ends in the middle of a character
 *		or if there is not enough memory.
 */
bool ReadLineW(FILE *p_fr, std::basic_string<wchar_t> &r_s_line, int &r_n_cur_line)
{
	while(!feof(p_fr) && !ferror(p_fr)) {
		// on read error, fgetc() keeps returning EOF while feof() stays false;
		// testing feof() alone would never terminate

		r_s_line.erase();
		try {
			for(int c = fgetc(p_fr); c != EOF; c = fgetc(p_fr)) {
				unsigned short n_char = (unsigned short)c; // low byte
				c = fgetc(p_fr);
				if(c == EOF)
					return false; // odd number of bytes (or read error); this is error actually
				n_char |= c << 8; // high byte

				if(n_char == '\n')
					break;

				r_s_line += wchar_t(n_char);
			}
		} catch(std::bad_alloc&) {
			return false;
		}
		// read line

		if(ferror(p_fr))
			return false;
		// do not pass a line cut short by a read error

		++ r_n_cur_line;
		// line counter for file debugging

		/*if(r_s_line.find('#') != std::string::npos)
			r_s_line.erase(r_s_line.find('#'));*/
		// throw away line comment

		TrimSpace(r_s_line);
		// throw away begin / end whitespace

		if(!r_s_line.length())
			continue;
		// skip empty lines

		return true;
	}

	return false;
}

int main(int n_arg_num, const char **p_arg_list)
{
	const char *p_s_mapping_file = "n:\\downloads\\lemmatized-utf16.txt";
	const char *p_s_input_index = "n:\\downloads\\gw-cna_eng-positional-index";
	const char *p_s_output_file = "n:\\downloads\\gw-cna_eng-term-vectors-lemm-dummy";

	CTimer timer; // verbose/perf timer
	double f_next_verbose_time = 1;

	printf("reading mapping file ...\n");

#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
	stdext::hash_set<std::basic_string<wchar_t> > lemma_list;
	stdext::hash_map<std::basic_string<wchar_t>, std::basic_string<wchar_t> > lemma_map;
#else //__LUCENE_READER_USE_HASH_CONTAINERS
	std::set<std::basic_string<wchar_t> > lemma_list;
	std::map<std::basic_string<wchar_t>, std::basic_string<wchar_t> > lemma_map;
#endif //__LUCENE_READER_USE_HASH_CONTAINERS
	FILE *p_fr;
	if(!(p_fr = fopen(p_s_mapping_file, "rb"))) {
		fprintf(stderr, "error: failed to open \'%s\' ...\n", p_s_mapping_file);
		return -1;
	}
	try {
		std::basic_string<wchar_t> s_line, s_lemma;
		int n_line = 0;
		while(ReadLineW(p_fr, s_line, n_line)) {
			size_t n_pos;
			if((n_pos = s_line.find(';')) == std::string::npos) {
				fprintf(stderr, "warning: there is no semicolon on line %d\n", n_line);
				continue;
			}
			if(s_line.find(';', n_pos + 1) != std::string::npos)
				fprintf(stderr, "warning: there is more than one semicolon on line %d\n", n_line);
			// find semicolon

			s_lemma.erase();
			s_lemma.insert(s_lemma.begin(), s_line.begin() + n_pos + 1, s_line.end());
			s_line.erase(n_pos);
			// split line to term/lemma

			lemma_map[s_line] = s_lemma;
			// add to the map

			if(timer.f_Time() >= f_next_verbose_time) {
				f_next_verbose_time = timer.f_Time() + 1;
				printf("%d\r", lemma_map.size());
			}
			// verbose
		}
	} catch(std::bad_alloc&) {
		fclose(p_fr);
		fprintf(stderr, "error: not enough memory while reading \'%s\' ...\n", p_s_mapping_file);
		return -1;
	}
	printf("%16s\rdone\n", "");
	fclose(p_fr);
	// read the mapping file ...

	printf("opening index ...\n");
	lucene::index::IndexReader *p_index_reader = lucene::index::MultiReader::open(
		lucene::store::FSDirectory::getDirectory(p_s_input_index, false));
	if(!p_index_reader) {
		fprintf(stderr, "error: failed to open index file \'%s\'\n", p_s_input_index);
		return -1;
	}
	// get index reader

	printf("getting terms ...\n");

	int n_term_num = 0, n_lemma_miss_rate = 0;
#ifdef _DEBUG
	bool b_dont_expecting_more_terms = false;
#endif //_DEBUG
	lucene::index::TermEnum *p_term_enum = p_index_reader->terms();
	while(p_term_enum->next()) {
		const lucene::index::Term *p_lucene_term = p_term_enum->term();
		const TCHAR *p_s_field = p_lucene_term->field();

		if(wcscmp(p_s_field, L"contents")) {
#ifdef _DEBUG
			b_dont_expecting_more_terms = true;
			break;//continue;
#else //_DEBUG
			break;
#endif //_DEBUG
		}
		// just want terms for "contents"

		_ASSERTE(!b_dont_expecting_more_terms); // if(b_dont_expecting_more_terms) fprintf(stderr, "warning: terms for individual fields aren't stored in contiguous blocks!\n");
		// see if it's possible to just break once "contents" terms stop ... that would save a lot of time

		const TCHAR *p_s_text = p_lucene_term->text();

		/*if(*p_s_text && !isalpha(*p_s_text))
			continue;*/
		// skip rubbish terms

		++ n_term_num;

		std::basic_string<wchar_t> s_term(p_s_text);
		const std::basic_string<wchar_t> *p_term;

#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
		stdext::hash_map<std::basic_string<wchar_t>, std::basic_string<wchar_t> >::const_iterator p_term_it;
#else //__LUCENE_READER_USE_HASH_CONTAINERS
		std::map<std::basic_string<wchar_t>, std::basic_string<wchar_t> >::const_iterator p_term_it;
#endif //__LUCENE_READER_USE_HASH_CONTAINERS
		if((p_term_it = lemma_map.find(s_term)) != lemma_map.end())
			p_term = &(*p_term_it).second;
		else {
			++ n_lemma_miss_rate;
			p_term = &s_term; // use the original
		}
		// try to find lemma for this string

		if(lemma_list.find(*p_term) == lemma_list.end())
			lemma_list.insert(*p_term);
		else
			continue;
		// add to the term set

		// @todo - calculate total term frequencies, throw away too little ones, see where we can get

		//term_list.push_back();

		//wprintf(_T("\tterm \'%s\' (field \'%s\')\n"), r_term.text(), p_s_filed); // debug
		if(timer.f_Time() >= f_next_verbose_time) {
			f_next_verbose_time = timer.f_Time() + 1;
			printf("%d\r", lemma_list.size());
		}

		//_CLDELETE(p_term);
	}
	p_term_enum->close();
	_CLDELETE(p_term_enum);
	printf("%16s\rdone\n", "");
	// go trough documents terms

	printf("\tthere is %d lemmatized terms (%d terms were read, %d missed lemma map) ("
		PRIsizeB "B assuming 1024 long vectors)\n", lemma_list.size(), n_term_num,
		n_lemma_miss_rate, PRIsizeBparams(uint64_t(lemma_list.size()) * 1024 * sizeof(float)));
	// verbose

	printf("building global terms index map ...\n");

	CWideCStringIndexHashMap term_global_index_map;
	{
		CWideCStringIndexHashMap lemma_global_index_map;
#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
		stdext::hash_set<std::basic_string<wchar_t> >::const_iterator p_lemma_it = lemma_list.begin();
#else //__LUCENE_READER_USE_HASH_CONTAINERS
		std::set<std::basic_string<wchar_t> >::const_iterator p_lemma_it = lemma_list.begin();
#endif //__LUCENE_READER_USE_HASH_CONTAINERS
		for(size_t i = 0, n = lemma_list.size(); i < n; ++ i, ++ p_lemma_it) {
			lemma_global_index_map[(*p_lemma_it).c_str()] = i;
		}
#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
		stdext::hash_map<std::basic_string<wchar_t>, std::basic_string<wchar_t> >::const_iterator p_term_it = lemma_map.begin();
#else //__LUCENE_READER_USE_HASH_CONTAINERS
		std::map<std::basic_string<wchar_t>, std::basic_string<wchar_t> >::const_iterator p_term_it = lemma_map.begin();
#endif //__LUCENE_READER_USE_HASH_CONTAINERS
		for(size_t i = 0, n = lemma_map.size(); i < n; ++ i, ++ p_term_it) {
			_ASSERTE(lemma_global_index_map.find((*p_term_it).second.c_str()) != lemma_global_index_map.end());
			term_global_index_map[(*p_term_it).first.c_str()] = lemma_global_index_map[(*p_term_it).second.c_str()];
		}
	}
	// build global index map

	printf("reading documents ...\n");

	FILE *p_fw;

	if(!(p_fw = fopen(p_s_output_file, "wb"))) {
		fprintf(stderr, "error: failed to open \'%s\' for writing ...\n", p_s_output_file);
		return -1;
	}

	{
		uint32_t n_file_magic = 0xbaadf00dU;
		_ASSERTE(lemma_list.size() <= UINT32_MAX);
		uint32_t n_term_num = uint32_t(lemma_list.size());
		_ASSERTE(p_index_reader->numDocs() <= UINT32_MAX);
		uint32_t n_document_num = uint32_t(p_index_reader->numDocs());
		if(fwrite(&n_file_magic, sizeof(uint32_t), 1, p_fw) != 1 ||
		   fwrite(&n_term_num, sizeof(uint32_t), 1, p_fw) != 1 ||
		   fwrite(&n_document_num, sizeof(uint32_t), 1, p_fw) != 1) {
			fprintf(stderr, "error: i/o error while writing document indices\n");
			fclose(p_fw);
			return -1;
		}
		// write file magic number and number of documents ...
	}
	// write header

	{
#ifdef __LUCENE_READER_USE_HASH_CONTAINERS
		stdext::hash_set<std::basic_string<wchar_t> >::const_iterator p_lemma_it = lemma_list.begin();
#else //__LUCENE_READER_USE_HASH_CONTAINERS
		std::set<std::basic_string<wchar_t> >::const_iterator p_lemma_it = lemma_list.begin();
#endif //__LUCENE_READER_USE_HASH_CONTAINERS
		for(size_t i = 0, n = lemma_list.size(); i < n; ++ i, ++ p_lemma_it) {
			if(!WriteWString(*p_lemma_it, p_fw)) {
				fprintf(stderr, "error: i/o error while writing terms\n");
				fclose(p_fw);
				return -1;
			}
		}
	}
	// write terms

	try {
		for(size_t i = 0, n = p_index_reader->numDocs(); i < n; ++ i) {
			lucene::document::Document *p_document = _CLNEW lucene::document::Document();
			if(!p_index_reader->document(int32_t(i), p_document)) {
				fprintf(stderr, "warning: document %d couldn't be retrieved\n", i);
				continue;
			}

			lucene::document::Field *p_path = p_document->getField(_T("path"));
			lucene::document::Field *p_date = p_document->getField(_T("modified"));
			lucene::document::Field *p_gw_id = p_document->getField(_T("gw-id"));

			if(timer.f_Time() >= f_next_verbose_time) {
				f_next_verbose_time = timer.f_Time() + 1;
				wprintf(_T("\tdocument \'%s#%s\' (last modified %s)         \r"),
					p_path->stringValue(), p_gw_id->stringValue(), p_date->stringValue());
			}
			// show progress from time to time

			lucene::index::TermPositionVector *p_term_pos = (p_index_reader->getTermFreqVector(int32_t(i), _T("contents")))->__asTermPositionVector();
			// get term position vector

			const Array<int32_t> &term_frequencies = *p_term_pos->getTermFrequencies();
			size_t n_word_num = term_frequencies.length;
			size_t n_document_length = std::accumulate(term_frequencies.values,
				term_frequencies.values + term_frequencies.length, size_t(0));
			// calculate document length

			{
				uint32_t n_document_magic = 0xd00cd00cU;
				_ASSERTE(n_word_num <= UINT32_MAX);
				uint32_t n_word_num32 = uint32_t(n_word_num);
				_ASSERTE(n_document_length <= UINT32_MAX);
				uint32_t n_document_length32 = uint32_t(n_document_length);

				if(fwrite(&n_document_magic, sizeof(int32_t), 1, p_fw) != 1 ||
				   fwrite(&n_word_num32, sizeof(int32_t), 1, p_fw) != 1 ||
				   fwrite(&n_document_length32, sizeof(int32_t), 1, p_fw) != 1 ||
				   !WriteWCString2(p_path->stringValue(), L"#", p_gw_id->stringValue(), p_fw) ||
				   !WriteWCString(p_date->stringValue(), p_fw)) {
					fprintf(stderr, "error: i/o error while writing terms\n");
					fclose(p_fw);
					return -1;
				}
			}
			// write document start marker and number of unique terms in this document

			std::vector<size_t> term_global_indices;
			{
				const TCHAR **p_terms = p_term_pos->getTerms();
				// get local terms

				term_global_indices.resize(n_word_num);
				for(size_t i = 0; i < n_word_num; ++ i) {
					CWideCStringIndexHashMap::const_iterator p_hash_it =
						term_global_index_map.find(p_terms[i]);
					if(p_hash_it == term_global_index_map.end()) {
						_ASSERTE(*p_terms[i] && !isalpha(*p_terms[i])); // this term was probably filtered out // @todo - introduce term filter function / object for this
						/*term_global_indices[i] = -1;
						continue;*/
						fprintf(stderr, "error: document introduces unknown term\n");
						return -1;
					}
					term_global_indices[i] = (*p_hash_it).second;

					{
						_ASSERTE(term_frequencies[i] <= UINT32_MAX);
						uint32_t n_frequency = uint32_t(term_frequencies[i]);
						_ASSERTE(term_global_indices[i] <= UINT32_MAX);
						uint32_t n_global_term_id = uint32_t(term_global_indices[i]);

						if(fwrite(&n_global_term_id, sizeof(int32_t), 1, p_fw) != 1 ||
						   fwrite(&n_frequency, sizeof(int32_t), 1, p_fw) != 1) {
							fprintf(stderr, "error: i/o error while writing document indices\n");
							fclose(p_fw);
							return -1;
						}
					}
					// write term id - document frequency pair
				}
				// alloc and fill the vector (using hashmap)
			}
			// find global indices

			std::vector<size_t> terms;
#ifdef _DEBUG
			terms.resize(n_document_length, -1);
#else //_DEBUG
			terms.resize(n_document_length); // don't need to explicitly initialize, it's just for debugging
#endif //_DEBUG
			// allocate term index vector

			//size_t n_skip = 0; // this won't work, we're accessing the document vector randomly in several passes
			for(size_t i = 0, n = p_term_pos->size(); i < n; ++ i) {
				const Array<int32_t> &term_positions = *p_term_pos->getTermPositions(int32_t(i));
				for(size_t j = 0, m = term_positions.length; j < m; ++ j) {
					/*if(term_global_indices[i] == -1) { // term not found - some rejected term
						++ n_skip;
						continue;
					}*/
					_ASSERTE(term_positions[j] >= 0 && unsigned(term_positions[j]) < n_document_length); // make sure position is within document
					_ASSERTE(terms[term_positions[j]] < 0); // make sure positions do not collide
					terms[term_positions[j]/* - n_skip*/] = term_global_indices[i];
				}
			}
			//terms.resize(n_document_length - n_skip); // constract the document vector
			// fill terms in document term array

			for(size_t i = 0; i < terms.size(); ++ i) {
				int32_t n_term_id;

				if(terms[i] < 0) {
					fprintf(stderr, "error: term(s) missing\n");
					n_term_id = -1;
				} else {
					_ASSERTE(terms[i] <= INT32_MAX);
					n_term_id = int32_t(terms[i]);
					//wprintf((i)? _T(" %s") : _T("%s"), term_list[n_term_id].c_str());
				}
				// get term id

				if(fwrite(&n_term_id, sizeof(int32_t), 1, p_fw) != 1) {
					fprintf(stderr, "error: i/o error while writing document indices\n");
					fclose(p_fw);
					return -1;
				}
				// write term id to a file
			}
			//printf("\n");
			// print document (debug)

			_CLDELETE(p_term_pos);
			/*_CLDELETE(p_path);
			_CLDELETE(p_date);
			_CLDELETE(p_gw_id);*/
			_CLDELETE(p_document);
		}
		// list documents

	} catch(CLuceneError &err) {
		printf("lucene error: %s\n", err.what());
	} catch(exception &err) {
		printf("general error: %s\n", err.what());
	}

	_CLDELETE(p_index_reader);
	// free index reader

	fclose(p_fw);
	// close the output file

	printf("\ndone (it took " PRItime ")\n", PRItimeparams(timer.f_Time()));

	return 0;
}
