1 /*------------------------------------------------------------------------------
2 * Copyright (C) 2003-2006 Ben van Klinken and the CLucene Team
3 *
4 * Distributable under the terms of either the Apache License (Version 2.0) or
5 * the GNU Lesser General Public License, as specified in the COPYING file.
6 ------------------------------------------------------------------------------*/
7
8 #include <stdio.h>
9 #include <iostream>
10 #include <fstream>
11 #include <sys/stat.h>
12 #include <cctype>
13 #include <string.h>
14 #include <algorithm>
15
16 #include "CLucene/StdHeader.h"
17 #include "CLucene/_clucene-config.h"
18
19 #include "CLucene.h"
20 #include "CLucene/util/CLStreams.h"
21 #include "CLucene/util/dirent.h"
22 #include "CLucene/config/repl_tchar.h"
23 #include "CLucene/util/Misc.h"
24 #include "CLucene/util/StringBuffer.h"
25
26 using namespace std;
27 using namespace lucene::index;
28 using namespace lucene::analysis;
29 using namespace lucene::util;
30 using namespace lucene::store;
31 using namespace lucene::document;
32
FileDocument(const char * f,Document * doc)33 void FileDocument(const char* f, Document* doc){
34
35 // Add the path of the file as a field named "path". Use an indexed and stored field, so
36 // that the index stores the path, and so that the path is searchable.
37 TCHAR tf[CL_MAX_DIR];
38 STRCPY_AtoT(tf,f,CL_MAX_DIR);
39 doc->add( *_CLNEW Field(_T("path"), tf, Field::STORE_YES | Field::INDEX_UNTOKENIZED ) );
40
41 // Add the last modified date of the file a field named "modified". Again, we make it
42 // searchable, but no attempt is made to tokenize the field into words.
43 //doc->add( *_CLNEW Field(_T("modified"), DateTools::timeToString(f->lastModified()), Field::STORE_YES | Field::INDEX_NO));
44
45 // Add the contents of the file a field named "contents". This time we use a tokenized
46 // field so that the text can be searched for words in it.
47
48 // Here we read the data without any encoding. If you want to use special encoding
49 // see the contrib/jstreams - they contain various types of stream readers
50 FILE* fh = fopen(f,"r");
51 if ( fh != NULL ){
52 StringBuffer str;
53 char abuf[1024];
54 TCHAR tbuf[1024];
55 size_t r;
56 do{
57 r = fread(abuf,1,1023,fh);
58 abuf[r]=0;
59 STRCPY_AtoT(tbuf,abuf,r);
60 tbuf[r]=0;
61 str.append(tbuf);
62 }while(r>0);
63 fclose(fh);
64
65 doc->add( *_CLNEW Field(_T("contents"), str.getBuffer(), Field::STORE_YES | Field::INDEX_TOKENIZED) );
66 }
67 }
68
indexDocs(IndexWriter * writer,const char * directory)69 void indexDocs(IndexWriter* writer, const char* directory) {
70 vector<string> files;
71 std::sort(files.begin(),files.end());
72 Misc::listFiles(directory,files,true);
73 vector<string>::iterator itr = files.begin();
74
75 // Re-use the document object
76 Document doc;
77 int i=0;
78 while ( itr != files.end() ){
79 const char* path = itr->c_str();
80 printf( "adding file %d: %s\n", ++i, path );
81
82 doc.clear();
83 FileDocument( path, &doc );
84 writer->addDocument( &doc );
85 ++itr;
86 }
87 }
IndexFiles(const char * path,const char * target,const bool clearIndex)88 void IndexFiles(const char* path, const char* target, const bool clearIndex){
89 IndexWriter* writer = NULL;
90 lucene::analysis::WhitespaceAnalyzer an;
91
92 if ( !clearIndex && IndexReader::indexExists(target) ){
93 if ( IndexReader::isLocked(target) ){
94 printf("Index was locked... unlocking it.\n");
95 IndexReader::unlock(target);
96 }
97
98 writer = _CLNEW IndexWriter( target, &an, false);
99 }else{
100 writer = _CLNEW IndexWriter( target ,&an, true);
101 }
102
103 //writer->setInfoStream(&std::cout);
104
105 // We can tell the writer to flush at certain occasions
106 //writer->setRAMBufferSizeMB(0.5);
107 //writer->setMaxBufferedDocs(3);
108
109 // To bypass a possible exception (we have no idea what we will be indexing...)
110 writer->setMaxFieldLength(0x7FFFFFFFL); // LUCENE_INT32_MAX_SHOULDBE
111
112 // Turn this off to make indexing faster; we'll turn it on later before optimizing
113 writer->setUseCompoundFile(false);
114
115 uint64_t str = Misc::currentTimeMillis();
116
117 indexDocs(writer, path);
118
119 // Make the index use as little files as possible, and optimize it
120 writer->setUseCompoundFile(true);
121 writer->optimize();
122
123 // Close and clean up
124 writer->close();
125 _CLLDELETE(writer);
126
127 printf("Indexing took: %d ms.\n\n", (int32_t)(Misc::currentTimeMillis() - str));
128 }
129