1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7 
8 #include <stdio.h>
9 #include <iostream>
10 #include <fstream>
11 #include <sys/stat.h>
12 #include <cctype>
13 #include <string.h>
14 #include <algorithm>
15 
16 #include "CLucene/StdHeader.h"
17 #include "CLucene/_clucene-config.h"
18 
19 #include "CLucene.h"
20 #include "CLucene/util/CLStreams.h"
21 #include "CLucene/util/dirent.h"
22 #include "CLucene/config/repl_tchar.h"
23 #include "CLucene/util/Misc.h"
24 #include "CLucene/util/StringBuffer.h"
25 
26 using namespace std;
27 using namespace lucene::index;
28 using namespace lucene::analysis;
29 using namespace lucene::util;
30 using namespace lucene::store;
31 using namespace lucene::document;
32 
FileDocument(const char * f,Document * doc)33 void FileDocument(const char* f, Document* doc){
34 
35     // Add the path of the file as a field named "path".  Use an indexed and stored field, so
36     // that the index stores the path, and so that the path is searchable.
37     TCHAR tf[CL_MAX_DIR];
38     STRCPY_AtoT(tf,f,CL_MAX_DIR);
39     doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );
40 
41     // Add the last modified date of the file a field named "modified". Again, we make it
42     // searchable, but no attempt is made to tokenize the field into words.
43     //doc->add( *_CLNEW Field(_T("modified"), DateTools::timeToString(f->lastModified()), Field::STORE_YES | Field::INDEX_NO));
44 
45     // Add the contents of the file a field named "contents".  This time we use a tokenized
46 	// field so that the text can be searched for words in it.
47 
48     // Here we read the data without any encoding. If you want to use special encoding
49     // see the contrib/jstreams - they contain various types of stream readers
50     FILE* fh = fopen(f,"r");
51 	if ( fh != NULL ){
52 		StringBuffer str;
53 		char abuf[1024];
54 		TCHAR tbuf[1024];
55 		size_t r;
56 		do{
57 			r = fread(abuf,1,1023,fh);
58 			abuf[r]=0;
59 			STRCPY_AtoT(tbuf,abuf,r);
60 			tbuf[r]=0;
61 			str.append(tbuf);
62 		}while(r>0);
63 		fclose(fh);
64 
65 		doc->add( *_CLNEW Field(_T("contents"), str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED) );
66 	}
67 }
68 
indexDocs(IndexWriter * writer,const char * directory)69 void indexDocs(IndexWriter* writer, const char* directory) {
70     vector<string> files;
71     std::sort(files.begin(),files.end());
72     Misc::listFiles(directory,files,true);
73     vector<string>::iterator itr = files.begin();
74 
75     // Re-use the document object
76     Document doc;
77     int i=0;
78     while ( itr != files.end() ){
79         const char* path = itr->c_str();
80         printf( "adding file %d: %s\n", ++i, path );
81 
82         doc.clear();
83         FileDocument( path, &doc );
84         writer->addDocument( &doc );
85         ++itr;
86     }
87 }
IndexFiles(const char * path,const char * target,const bool clearIndex)88 void IndexFiles(const char* path, const char* target, const bool clearIndex){
89 	IndexWriter* writer = NULL;
90 	lucene::analysis::WhitespaceAnalyzer an;
91 
92 	if ( !clearIndex && IndexReader::indexExists(target) ){
93 		if ( IndexReader::isLocked(target) ){
94 			printf("Index was locked... unlocking it.\n");
95 			IndexReader::unlock(target);
96 		}
97 
98 		writer = _CLNEW IndexWriter( target, &an, false);
99 	}else{
100 		writer = _CLNEW IndexWriter( target ,&an, true);
101 	}
102 
103     //writer->setInfoStream(&std::cout);
104 
105     // We can tell the writer to flush at certain occasions
106     //writer->setRAMBufferSizeMB(0.5);
107     //writer->setMaxBufferedDocs(3);
108 
109     // To bypass a possible exception (we have no idea what we will be indexing...)
110     writer->setMaxFieldLength(0x7FFFFFFFL); // LUCENE_INT32_MAX_SHOULDBE
111 
112     // Turn this off to make indexing faster; we'll turn it on later before optimizing
113     writer->setUseCompoundFile(false);
114 
115 	uint64_t str = Misc::currentTimeMillis();
116 
117 	indexDocs(writer, path);
118 
119     // Make the index use as little files as possible, and optimize it
120     writer->setUseCompoundFile(true);
121     writer->optimize();
122 
123     // Close and clean up
124     writer->close();
125 	_CLLDELETE(writer);
126 
127 	printf("Indexing took: %d ms.\n\n", (int32_t)(Misc::currentTimeMillis() - str));
128 }
129