/**
 *	@file BuildPositionalIndex/Main.cpp
 *	@author -tHE SWINe-
 *	@brief Utility for creating CLucene positional index from plain-English documents.
 *	@date 2010-07-04
 */

#include <../../UberLame_src/CallStack.h>
#include <stdio.h>
#include <../../UberLame_src/Dir.h>
#include <../../UberLame_src/StlUtils.h>

#include <CLucene.h>

/**
 *	@brief function object for directory traversal, calling CLucene on found documents
 */
class CDocumentIndexer {
protected:
	lucene::index::IndexWriter &m_r_writer; /**< @brief reference to CLucene index writer, specified in constructor */

public:
	/**
	 *	@brief default constructor
	 *
	 *	@param[in] r_writer is CLucene index writer, which will process found documents
	 */
	CDocumentIndexer(lucene::index::IndexWriter &r_writer)
		:m_r_writer(r_writer)
	{}

	/**
	 *	@brief directory traversal callback function
	 *
	 *	@param[in] r_t_file is file information, in case it is document file, it is indexed here
	 *
	 *	@return Returns true on success, false on failure.
	 */
	bool operator ()(const TFileInfo &r_t_file)
	{
		if(r_t_file.b_directory)
			return true;
		// ignore directories

		lucene::document::Document *p_docu = _CLNEW lucene::document::Document();
		// create a new document instance

		printf("\t%s   \r", r_t_file.p_s_Path());

		{
			TCHAR tf[CL_MAX_DIR];
			STRCPY_AtoT(tf, r_t_file.p_s_Path(), CL_MAX_DIR);
			p_docu->add(*_CLNEW lucene::document::Field(_T("path"), tf,
				lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_UNTOKENIZED));
		}
		// specify document path

		{
			std::string s_date;
			stl_ut::Format(s_date, "%04d-%02d-%02dT%02d:%02d:%02d",
				r_t_file.p_time[TFileInfo::time_LastWrite].n_year,
				r_t_file.p_time[TFileInfo::time_LastWrite].n_month,
				r_t_file.p_time[TFileInfo::time_LastWrite].n_day,
				r_t_file.p_time[TFileInfo::time_LastWrite].n_hour,
				r_t_file.p_time[TFileInfo::time_LastWrite].n_minute,
				r_t_file.p_time[TFileInfo::time_LastWrite].n_second);
			TCHAR dbuf[32];
			STRCPY_AtoT(dbuf, s_date.c_str(), s_date.length()+1);
			p_docu->add(*_CLNEW lucene::document::Field(_T("modified"), dbuf,
				lucene::document::Field::STORE_YES | lucene::document::Field::INDEX_UNTOKENIZED));
		}
		// specify date modified

		{
			FILE *p_fr;
			if(!(p_fr = fopen(r_t_file.p_s_Path(), "r")))
				return false;

			lucene::util::StringBuffer str;
			fseek(p_fr, 0, SEEK_END);
			str.reserve(ftell(p_fr));
			fseek(p_fr, 0, SEEK_SET);

			char abuf[1024];
			TCHAR tbuf[1024];
			for(;;) {
				size_t r = fread(abuf, 1, 1023, p_fr);
				if(!r)
					break;
				abuf[r] = 0;
				STRCPY_AtoT(tbuf, abuf, r);
				tbuf[r] = 0;
				str.append(tbuf);
			}
			fclose(p_fr);

			try {
				lucene::document::Field *pf = _CLNEW lucene::document::Field(_T("contents"), str.getBuffer(),
					lucene::document::Field::STORE_NO | lucene::document::Field::INDEX_TOKENIZED |
					lucene::document::Field::TERMVECTOR_YES | lucene::document::Field::TERMVECTOR_WITH_POSITIONS);
				p_docu->add(*pf);
			} catch(CLuceneError &err) {
				printf("lucene error: %s\n", err.what());
			}
		}
		// specify document contents

		m_r_writer.addDocument(p_docu);
		_CLDELETE(p_docu);

		return true;
	}
};

int main(int n_arg_num, const char **p_arg_list)
{
	const char *p_s_output_index = "..\\example-positional-index";
	const char *p_s_input_directory = "kjbible";

	lucene::analysis::standard::StandardAnalyzer analyzer;

	lucene::index::IndexWriter writer(lucene::store::FSDirectory::getDirectory(p_s_output_index, true),
		&analyzer, true, true);
	writer.setMaxFieldLength(lucene::index::IndexWriter::DEFAULT_MAX_FIELD_LENGTH);

	printf("reading documents ...\n");

	if(!CDirTraversal::Traverse2(p_s_input_directory, CDocumentIndexer(writer))) {
		fprintf(stderr, "error(s) occured while indexing ...\n");
		return -1;
	}

	printf("\noptimizing index ...\n");

	writer.optimize();

	printf("closing index ...\n");

	writer.close();

	printf("finished\n");

	return 0;
}

/*

/ *------------------------------------------------------------------------------
* Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
* 
* Distributable under the terms of either the Apache License (Version 2.0) or 
* the GNU Lesser General Public License, as specified in the COPYING file.
------------------------------------------------------------------------------* /
#include "stdafx.h"

#include "CLucene.h"
#include "CLucene/util/Reader.h"
#include "CLucene/util/Misc.h"
#include "CLucene/util/dirent.h"
#include <iostream>
#include <fstream>

using namespace std;
using namespace lucene::index;
using namespace lucene::analysis;
using namespace lucene::util;
using namespace lucene::store;
using namespace lucene::document;

Document* FileDocument(const char* f){
	// make a new, empty document
	Document* doc = _CLNEW Document();

	// Add the path of the file as a field named "path".  Use a Text field, so
	// that the index stores the path, and so that the path is searchable
   TCHAR tf[CL_MAX_DIR];
   STRCPY_AtoT(tf,f,CL_MAX_DIR);
   doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );

	// Add the last modified date of the file as a field named "modified".  Use a
	// Keyword field, so that it's searchable, but so that no attempt is made
	// to tokenize the field into words.
	//doc->add( *Field.Keyword("modified", DateField.timeToString(f->lastModified())));

	// Add the contents of the file as a field named "contents".  Use a Text
	// field, specifying a Reader, so that the text of the file is tokenized.

    //read the data without any encoding. if you want to use special encoding
    //see the contrib/jstreams - they contain various types of stream readers
    FILE* fh = fopen(f,"r");
	if ( fh != NULL ){
		StringBuffer str;
		// use fstat for portability
		int fn = fileno(fh);
		struct stat filestat;
		fstat(fn, &filestat);
		str.reserve(filestat.st_size);
		//str.reserve(fileSize(fh->_file));
		char abuf[1024];
		TCHAR tbuf[1024];
		size_t r;
		do{
			r = fread(abuf,1,1023,fh);
			abuf[r]=0;
			STRCPY_AtoT(tbuf,abuf,r);
			tbuf[r]=0;
			str.append(tbuf);
		}while(r>0);
		fclose(fh);

		doc->add( *_CLNEW Field(_T("contents"),str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED) );
	}

	//_tprintf(_T("%s\n"),doc->toString());
	// return the document
	return doc;
}

void indexDocs(IndexWriter* writer, char* directory) {
	DIR* dir = opendir(directory);
	if ( dir != NULL ){
		struct dirent* fl;
		
		struct fileStat buf;

		char path[CL_MAX_DIR];
		strcpy(path,directory);
		strcat(path,PATH_DELIMITERA);
		char* pathP = path + strlen(path);

		fl = readdir(dir);
		while ( fl != NULL ){
			if ( (strcmp(fl->d_name, ".")) && (strcmp(fl->d_name, "..")) ) {
			pathP[0]=0;
			strcat(pathP,fl->d_name);
			int32_t ret = fileStat(path,&buf);
			if ( buf.st_mode & S_IFDIR ) {
				indexDocs(writer, path );
			}else{
				printf( "adding: %s\n", fl->d_name );

				Document* doc = FileDocument( path );
				writer->addDocument( doc );
				_CLDELETE(doc);
			}
		}
		fl = readdir(dir);

		}
		closedir(dir);
	}else{
		    printf( "adding: %s\n", directory);

		    Document* doc = FileDocument( directory );
		    writer->addDocument( doc );
		    _CLDELETE(doc);
	}
}
void IndexFiles(char* path, char* target, const bool clearIndex){
	IndexWriter* writer = NULL;
	//lucene::analysis::SimpleAnalyzer* an = *_CLNEW lucene::analysis::SimpleAnalyzer();
	lucene::analysis::standard::StandardAnalyzer an;
	
	if ( !clearIndex && IndexReader::indexExists(target) ){
		if ( IndexReader::isLocked(target) ){
			printf("Index was locked... unlocking it.\n");
			IndexReader::unlock(target);
		}

		writer = _CLNEW IndexWriter( target, &an, false);
	}else{
		writer = _CLNEW IndexWriter( target ,&an, true);
	}
	writer->setMaxFieldLength(IndexWriter::DEFAULT_MAX_FIELD_LENGTH);
	/ *printf("Set MaxFieldLength: ");
	char mfl[250];
	fgets(mfl,250,stdin);
	mfl[strlen(mfl)-1] = 0;
	if ( mfl[0] != 0 )
		writer->setMaxFieldLength(atoi(mfl));* /
	//writer->infoStream = cout; //TODO: infoStream - unicode

	uint64_t str = lucene::util::Misc::currentTimeMillis();

	indexDocs(writer, path);
	writer->optimize();
	writer->close();
	_CLDELETE(writer);

	printf("Indexing took: %d ms.\n\n", lucene::util::Misc::currentTimeMillis() - str);
}

*/
