/*
								+---------------------------------+
								|                                 |
								| *** CLucene raw data feeder *** |
								|                                 |
								|  Copyright   -tHE SWINe- 2010  |
								|                                 |
								|            Main.cpp             |
								|                                 |
								+---------------------------------+
*/

/*#ifdef WIN32
#include <crtdbg.h>
#undef _ASSERTE
#endif*/
// win32 memory debugger

#include "../../UberLame_src/NewFix.h"
#include "../../UberLame_src/CallStack.h"
#include <string>
#include <vector>
#include <algorithm>
#include <string.h>
#include <stdio.h>
#include <wchar.h>
#include <wctype.h>
#include "../../UberLame_src/MinMax.h"
#include "../../UberLame_src/StlUtils.h"
#include "../../UberLame_src/Dir.h" // PRIsizeB
#include "../../UberLame_src/Unused.h"
#include "../../UberLame_src/Timer.h"

#include <CLucene.h>

#if defined(_MSC_VER) && !defined(__MWERKS__)
#define for if(0) {} else for
#endif
// msvc 'for' scoping hack: every 'for' statement below expands to the else-branch of a dead 'if',
// so a variable declared in the loop header stays local to the loop statement (presumably a workaround
// for old msvc versions, which leak such variables to the enclosing scope - confirm it is still needed).
// note that it is defined after all the #includes, so it only affects the code in this file.

/**
 *	@brief decides whether a text contains at least one word worth indexing
 *
 *	The text is split to words at whitespace. A word qualifies if it contains
 *	a run of at least five consecutive alphabetic characters; any other
 *	character (digit, punctuation, ...) inside a word restarts the run.
 *
 *	@param[in] p_s_text is null-terminated wide-character string (must not be null)
 *
 *	@return Returns true if the text contains a qualifying word, otherwise
 *		returns false (always false for an empty string).
 *
 *	@note The characters are classified using iswspace() and iswalpha(). The narrow
 *		isspace() and isalpha() must not be used here, as their behavior is undefined
 *		for values that are not representable as unsigned char (and not EOF).
 */
bool b_ContainsWords(const TCHAR *p_s_text)
{
	const size_t n_min_word_length = 5;
	// a word needs at least this many consecutive letters

	size_t b = 0, e = wcslen(p_s_text);
	if(!e)
		return false;
	// no text, no words

	do {
		while(b < e && iswspace(p_s_text[b]))
			++ b;
		// skip whitespace

		size_t n_word_length = 0;
		while(b < e && !iswspace(p_s_text[b])) {
			if(iswalpha(p_s_text[b])) {
				if(++ n_word_length >= n_min_word_length)
					return true;
			} else
				n_word_length = 0; // the run of letters was interrupted, start counting again
			++ b;
		}
		// scan the word, counting consecutive letters
	} while(b < e);
	// try to find long enough word

	return false;
}

int main(int UNUSED(n_arg_num), const char **UNUSED(p_arg_list))
{
	const char *p_s_output_index = "E:\\gigaword\\gw-positional-index";//"n:\\downloads\\gw-cna_eng-positional-index";
	// clucene index directory

	const char *p_s_input_file = "E:\\gigaword\\gw.raw";//"n:\\downloads\\gw-cna_eng.raw";
	// unicode file, containing all the documents' fields as 32-bit length, followed by utf-16 contents. plain and simple.
	// the fields go in the following order: L"gw-id", L"gw-type", L"gw-headline", L"gw-dateline", L"contents"

	{
		lucene::analysis::standard::StandardAnalyzer analyzer;
		lucene::index::IndexWriter writer(lucene::store::FSDirectory::getDirectory(p_s_output_index, true),
			&analyzer, true, true);
		writer.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH);
		// prepare CLucene

		printf("reading documents ...\n");

		uint64_t n_total_size = 0;
		CTimer timer;
		double f_start_time = timer.f_Time();

		FILE *p_fr;
		if(fopen_s(&p_fr, p_s_input_file, "rb")) {
			fprintf(stderr, "error: error opening input file (\'%s\')\n", p_s_input_file);
			return -1;
		}
		// open input file ...

		std::basic_string<TCHAR> p_field[7]; // to keep strings allocated
		for(size_t n_doc = 0;; ++ n_doc) {
			const TCHAR *p_field_name_list[] = {L"path", L"modified", L"gw-id", L"gw-type", L"gw-headline", L"gw-dateline", L"contents"};
			const int n_nonparsed_text_field = lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_UNTOKENIZED;
			const int n_parsed_contents_field = lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED |
				lucene::document::Field::TERMVECTOR_YES | lucene::document::Field::TERMVECTOR_WITH_POSITIONS;
			const int p_field_flag_list[] = {n_nonparsed_text_field, n_nonparsed_text_field, n_nonparsed_text_field,
				n_nonparsed_text_field, n_nonparsed_text_field, n_nonparsed_text_field, n_parsed_contents_field};
			const size_t n_field_num = sizeof(p_field_name_list) / sizeof(p_field_name_list[0]);
			_ASSERTE(n_field_num == sizeof(p_field_flag_list) / sizeof(p_field_flag_list[0])); // arrays must have the same length
			// description of fields, saved to the raw file

			uint32_t n_length;
			int n_result;
			if((n_result = fread(&n_length, sizeof(uint32_t), 1, p_fr)) == 0)
				break; // end of file
			else if(n_result != 1) {
				fprintf(stderr, "error: error reading input file (\'%s\')\n", p_s_input_file);
				fclose(p_fr);
				return -1;
			}
			// read the first field length here to see if we're at the end of the file

			for(int i = 0; i < n_field_num; ++ i) {
				if(i) {
					if(fread(&n_length, sizeof(uint32_t), 1, p_fr) != 1) {
						fprintf(stderr, "error: error reading input file (\'%s\')\n", p_s_input_file);
						fclose(p_fr);
						return -1;
					}
				}
				// read field length (only the first field length is read above the cycle)

				p_field[i].clear();
				if(!stl_ut::Resize_To_N(p_field[i], n_length)) {
					fprintf(stderr, "error: not enough memory while reading input file (\'%s\')\n", p_s_input_file);
					fclose(p_fr);
					return -1;
				}
				if(fread(&p_field[i][0], sizeof(TCHAR), n_length, p_fr) != n_length) {
					fprintf(stderr, "error: error reading input file (\'%s\')\n", p_s_input_file);
					fclose(p_fr);
					return -1;
				}
				// alloc & read field contents
			}
			// read all the document fields

			lucene::document::Document *p_docu = _CLNEW lucene::document::Document();
			// create a new document instance

			/*if(n_doc == 35) {
				const TCHAR *p_s_text = p_field[6].c_str();
				wprintf(L"document %d: %s\n\nunicode: ", n_doc, p_s_text);
				for(size_t i = 0, n = wcslen(p_s_text); i < n; ++ i)
					printf(" %d", p_s_text[i]);
				// print the culprit document

				continue;
			}*/ // cna_eng contains an empty document (no text). that screws clucene

			if(!b_ContainsWords(p_field[6].c_str())) {
				fwprintf(stderr, L"warning: document %d (\'%s\'): doesn't contain terms\n", n_doc, p_field[2].c_str());
				continue;
			}
			// skip empty documents

			for(int i = 0; i < n_field_num; ++ i) {
				/*if(p_field[i].empty())
					continue; // !!*/
				try {
					lucene::document::Field *pf = _CLNEW lucene::document::Field(
						p_field_name_list[i], p_field[i].c_str(), p_field_flag_list[i]);
					p_docu->add(*pf);
				} catch(CLuceneError &err) {
					printf("lucene error: %s\n", err.what());
					fclose(p_fr);
					return -1;
				}
				// create the field
			}
			// specify document contents

			try {
				writer.addDocument(p_docu); // @t_odo - this crashes, god knows why ... do something! (it doesn't crash the first time, it just crunches some documents and crashes when doing something. it doesn't depend on wheter running multithreaded, or in a single (the original one) thread)
				_CLDELETE(p_docu);
			} catch(CLuceneError &err) {
				printf("lucene error: %s\n", err.what());

				const TCHAR *p_s_text = p_field[6].c_str();
				wprintf(L"document %d: %s\n\nunicode: ", n_doc, p_s_text);
				for(size_t i = 0, n = wcslen(p_s_text); i < n; ++ i)
					printf(" %d", p_s_text[i]);
				// print the culprit document

				fclose(p_fr);
				return -1;
			}
			// add the document to the database, delete document instance

			size_t n_doc_size = 0;
			for(int i = 0; i < n_field_num; ++ i)
				n_doc_size += sizeof(uint32_t) + p_field[i].length() * sizeof(TCHAR); // documents are small, this won't overflow
			if(n_total_size <= UINT64_MAX - n_doc_size)
				n_total_size += n_doc_size;
			else
				n_total_size = UINT64_MAX;
			// count document sizes

			wprintf(L"adding \'%s#%s\' (doc %d) ...    \r", p_field[0].c_str(), p_field[2].c_str(), n_doc);
			// verbose
		}

		fclose(p_fr);
		// close input file

		printf("%79s\rdone\n", "");
		// verbose

		double f_total_time = timer.f_Time() - f_start_time;
		printf("read " PRIsizeB "B @ " PRIsizeB "B/sec (it took " PRItime ")\n", PRIsizeBparams(n_total_size),
			PRIsizeBparams(n_total_size / f_total_time), PRItimeparams(f_total_time));

		printf("optimizing index ...\n");

		writer.optimize();

		printf("closing index ...\n");

		writer.close();

		printf("finished\n");
	}

	//DumpMemoryLeaks();

	return 0;
}
