1 //
2 // WordListOne.cc
3 //
4 // Part of the ht://Dig package <http://www.htdig.org/>
5 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
6 // For copyright details, see the file COPYING in your distribution
7 // or the GNU General Public License version 2 or later
8 // <http://www.gnu.org/copyleft/gpl.html>
9 //
10 // $Id: WordListOne.cc,v 1.23 2001/06/29 14:14:08 loic Exp $
11 //
12
13 #ifdef HAVE_CONFIG_H
14 #include "config.h"
15 #endif /* HAVE_CONFIG_H */
16
17 #include "WordListOne.h"
18 #include "WordReference.h"
19 #include "WordRecord.h"
20 #include "WordType.h"
21 #include "WordContext.h"
22 #include "Configuration.h"
23 #include "htString.h"
24 #include "HtTime.h"
25 #include "WordDBCompress.h"
26 #include "WordDBCache.h"
27 #include "WordDead.h"
28 #include "WordMeta.h"
29
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <unistd.h>
33 #include <ctype.h>
34 #include <errno.h>
35
36 // *****************************************************************************
37 //
WordListOne(WordContext * ncontext)38 WordListOne::WordListOne(WordContext* ncontext)
39 {
40 context = ncontext;
41 db = new WordDB(ncontext->GetDBInfo());
42 dict = new WordDict();
43 dict->Initialize(this);
44 meta = new WordMeta();
45 meta->Initialize(this);
46 dead = new WordDead();
47 dead->Initialize(this);
48
49 // The database itself hasn't been opened yet
50 isopen = 0;
51 Configuration& config = context->GetConfiguration();
52 extended = config.Boolean("wordlist_extend");
53 verbose = config.Value("wordlist_verbose");
54 compressor = 0;
55 caches = 0;
56 flags = 0;
57 }
58
59 // *****************************************************************************
60 //
~WordListOne()61 WordListOne::~WordListOne()
62 {
63 BatchEnd();
64 Close();
65 delete dead;
66 delete meta;
67 delete dict;
68 delete db;
69 }
70
word_db_qcmp(WordContext * context,const WordDBCacheEntry * a,const WordDBCacheEntry * b)71 static int word_db_qcmp(WordContext* context, const WordDBCacheEntry *a, const WordDBCacheEntry *b)
72 {
73 return WordKey::Compare(context, (const unsigned char*)a->key, a->key_size, (const unsigned char*)b->key, b->key_size);
74 }
75
76 // *****************************************************************************
77 //
Open(const String & nfilename,int mode)78 int WordListOne::Open(const String& nfilename, int mode)
79 {
80 filename = nfilename;
81
82 int usecompress = 0;
83 Configuration& config = context->GetConfiguration();
84
85 if(config.Boolean("wordlist_compress") == 1) {
86 usecompress = DB_COMPRESS;
87 WordDBCompress* compressor = new WordDBCompress(context);
88 // compressor->debug = config.Value("wordlist_compress_debug");
89 SetCompressor(compressor);
90
91 context->GetDBInfo().dbenv->mp_cmpr_info = compressor->CmprInfo();
92 context->GetDBInfo().dbenv->flags |= DB_ENV_CMPR;
93 }
94
95 flags = (mode & O_RDWR) ? DB_CREATE : DB_RDONLY;
96 flags |= usecompress;
97 if(mode & O_TRUNC) {
98 if(mode & O_RDWR) {
99 unlink((char*)filename);
100 } else
101 fprintf(stderr, "WordListOne::Open: O_TRUNC | O_RDONLY is meaningless\n");
102 }
103
104 WordLock* lock;
105 Meta()->Lock("open", lock);
106
107 db->set_bt_compare(word_db_cmp, (void*)context);
108
109 if(config.Boolean("wordlist_cache_inserts", 0)) {
110 int size = config.Value("wordlist_cache_size", 0);
111 if(size / 2 < WORD_DB_CACHE_MINIMUM)
112 size = 0;
113 else
114 size /= 2;
115
116 db->CacheOn(context, size);
117 db->CacheCompare(word_db_qcmp);
118 }
119
120 db->set_pagesize(Pagesize());
121
122 int ret = db->Open(filename, "index", DB_BTREE, flags, 0666, WORD_DB_INDEX) == 0 ? OK : NOTOK;
123 if(ret == NOTOK) return ret;
124 if(dict->Open() != OK) return NOTOK;
125 if(meta->Open() != OK) return NOTOK;
126 if(dead->Open() != OK) return NOTOK;
127
128 isopen = 1;
129
130 Meta()->Unlock("open", lock);
131
132 return ret;
133 }
134
135 // *****************************************************************************
136 //
Close()137 int WordListOne::Close()
138 {
139 if(isopen) {
140 if(db->Close() != 0) return NOTOK;
141 if(dict->Close() != 0) return NOTOK;
142 if(meta->Close() != 0) return NOTOK;
143 if(dead->Close() != 0) return NOTOK;
144 isopen = 0;
145 }
146
147 {
148 WordDBCompress* compressor = GetCompressor();
149 if(compressor) {
150 delete compressor;
151 SetCompressor(0);
152 }
153 delete context->GetDBInfo().dbenv->mp_cmpr_info;
154 context->GetDBInfo().dbenv->mp_cmpr_info = 0;
155 context->GetDBInfo().dbenv->flags &= ~DB_ENV_CMPR;
156 }
157
158 return OK;
159 }
160
161 // ****************************************************************************
162 //
Size() const163 unsigned int WordListOne::Size() const
164 {
165 return db->Size();
166 }
167
168 // ****************************************************************************
169 //
Override(const WordReference & arg)170 int WordListOne::Override(const WordReference& arg)
171 {
172 if (arg.GetWord().length() == 0) {
173 fprintf(stderr, "WordListOne::Override(%s) word is zero length\n", (char*)arg.Get());
174 return NOTOK;
175 }
176 if (!arg.Key().Filled()) {
177 fprintf(stderr, "WordListOne::Override(%s) key is not fully defined\n", (char*)arg.Get());
178 return NOTOK;
179 }
180
181 WordType& wtype = context->GetType();
182 WordReference wordRef(arg);
183 String word = wordRef.GetWord();
184 if(wtype.Normalize(word) & WORD_NORMALIZE_NOTOK)
185 return NOTOK;
186 wordRef.SetWord(word);
187 unsigned int wordid = 0;
188 if(dict->SerialRef(word, wordid) != OK) return NOTOK;
189 wordRef.Key().Set(WORD_KEY_WORD, wordid);
190
191 int ret = NOTOK;
192
193 if(caches) {
194 String key;
195 String record;
196 if(wordRef.Pack(key, record) != OK)
197 return NOTOK;
198 ret = caches->Add(key.get(), key.length(), record.get(), record.length()) == 0 ? OK : NOTOK;
199 if(caches->Full()) caches->Merge(*db);
200 } else {
201 ret = db->Put(wordRef, 0) == 0 ? OK : NOTOK;
202 }
203
204 return ret;
205 }
206
207
208 // *****************************************************************************
209 //
operator [](const WordReference & wordRef)210 List *WordListOne::operator [] (const WordReference& wordRef)
211 {
212 return Collect(wordRef);
213 }
214
215 // *****************************************************************************
216 //
Prefix(const WordReference & prefix)217 List *WordListOne::Prefix (const WordReference& prefix)
218 {
219 List* result = new List();
220 WordDictCursor* cursor = Dict()->CursorPrefix(prefix.GetWord());
221 String word;
222 WordDictRecord record;
223 WordReference prefix2(prefix);
224 while(Dict()->NextPrefix(cursor, word, record) == 0) {
225 prefix2.Key().Set(WORD_KEY_WORD, record.Id());
226 List* tmp_result = Collect(prefix2);
227 while(tmp_result->Count() > 0) {
228 WordReference* entry = (WordReference*)tmp_result->Shift(LIST_REMOVE_RELEASE);
229 entry->SetWord(word);
230 result->Push(entry);
231 }
232 delete tmp_result;
233 }
234 return result;
235 }
236
237 // *****************************************************************************
238 //
WordRefs()239 List *WordListOne::WordRefs()
240 {
241 return Collect(WordReference(context));
242 }
243
244 // *****************************************************************************
245 //
Collect(const WordReference & wordRef)246 List *WordListOne::Collect(const WordReference& wordRef)
247 {
248 WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR);
249 if(search->Walk() != OK) return 0;
250 List* result = search->GetResults();
251 delete search;
252 return result;
253 }
254
255 // *****************************************************************************
256 //
257 int
Read(FILE * f)258 WordListOne::Read(FILE* f)
259 {
260 WordReference wordRef(context);
261 #define WORD_BUFFER_SIZE 1024
262 char buffer[WORD_BUFFER_SIZE + 1];
263 String line;
264 int line_number = 0;
265 int inserted = 0;
266
267 BatchStart();
268
269 String key;
270 String record;
271
272 while(fgets(buffer, WORD_BUFFER_SIZE, f)) {
273 line_number++;
274 int buffer_length = strlen(buffer);
275 int eol = buffer[buffer_length - 1] == '\n';
276
277 if(eol) buffer[--buffer_length] = '\0';
278
279 line.append(buffer, buffer_length);
280 //
281 // Join big lines
282 //
283 if(!eol) continue;
284 //
285 // If line ends with a \ continue
286 //
287 if(line.last() == '\\') {
288 line.chop(1);
289 continue;
290 }
291
292 if(!line.empty()) {
293 StringList fields(line, "\t ");
294
295 //
296 // Convert the word to a wordid
297 //
298 String* word = (String*)fields.Get_First();
299 unsigned int wordid;
300 if(dict->SerialRef(*word, wordid) != OK) return NOTOK;
301 word->trunc();
302 (*word) << wordid;
303
304 if(wordRef.SetList(fields) != OK) {
305 fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
306 fprintf(stderr, " cannot build WordReference (ignored)\n");
307 } else {
308 if(wordRef.Pack(key, record) != OK) {
309 fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
310 fprintf(stderr, " pack failed (ignored)\n");
311 } else {
312 caches->Add(key.get(), key.length(), record.get(), record.length());
313 inserted++;
314 }
315 if(verbose && (inserted % 10000 == 0)) fprintf(stderr, "WordList::Read: inserted %d entries\n", inserted);
316 if(verbose > 1) fprintf(stderr, "WordList::Read: inserting %s\n", (char*)wordRef.Get());
317 }
318
319 line.trunc();
320 }
321 }
322
323 BatchEnd();
324
325 return inserted;
326 }
327
328 // *****************************************************************************
329 //
330 // streaming operators for ascii dumping and reading a list
331 class FileOutData : public Object
332 {
333 public:
334 FILE* f;
335 String word;
FileOutData(FILE * f_arg)336 FileOutData(FILE* f_arg) : f(f_arg) { }
337 };
338
339 // *****************************************************************************
340 //
341 static int
wordlist_walk_callback_file_out(WordList *,WordDBCursor &,const WordReference * wordRef,Object & ndata)342 wordlist_walk_callback_file_out(WordList *, WordDBCursor& , const WordReference *wordRef, Object &ndata)
343 {
344 FileOutData& data = (FileOutData&)ndata;
345 ((WordReference*)wordRef)->SetWord(data.word);
346 fprintf(data.f, "%s\n", (char*)wordRef->Get());
347 return OK;
348 }
349
Write(FILE * f)350 int WordListOne::Write(FILE* f)
351 {
352 FileOutData data(f);
353 WordDictCursor* cursor = dict->Cursor();
354 int ret;
355 String word;
356 WordDictRecord wordinfo;
357 while((ret = dict->Next(cursor, word, wordinfo)) == 0) {
358 WordKey key(context);
359 key.Set(WORD_KEY_WORD, wordinfo.Id());
360 data.word = word;
361 WordCursor *search = Cursor(key, wordlist_walk_callback_file_out, (Object *)&data);
362 search->Walk();
363 delete search;
364 }
365 return ret == DB_NOTFOUND ? OK : NOTOK;
366 }
367
368
369 // *****************************************************************************
370 //
371 // Callback data dedicated to Dump and dump_word communication
372 //
373 class DeleteWordData : public Object
374 {
375 public:
DeleteWordData()376 DeleteWordData() { count = 0; }
377
378 int count;
379 };
380
381 // *****************************************************************************
382 //
383 //
delete_word(WordList * words,WordDBCursor & cursor,const WordReference * word,Object & data)384 static int delete_word(WordList *words, WordDBCursor &cursor, const WordReference *word, Object &data)
385 {
386 WordListOne *words_one = (WordListOne*)words;
387 if(words_one->DeleteCursor(cursor) == 0) {
388 ((DeleteWordData&)data).count++;
389 return OK;
390 } else {
391 fprintf(stderr, "WordList delete_word: deleting %s failed\n", (char*)word->Get());
392 return NOTOK;
393 }
394 }
395
396 // *****************************************************************************
397 //
398 // Delete all records matching wordRef, return the number of
399 // deleted records.
400 //
WalkDelete(const WordReference & wordRef)401 int WordListOne::WalkDelete(const WordReference& wordRef)
402 {
403 DeleteWordData data;
404 WordKey key = wordRef.Key();
405
406 if(key.IsDefined(WORD_KEY_WORD)) {
407 WordCursor *description = Cursor(key, delete_word, &data);
408 description->Walk();
409 delete description;
410 dict->Decr(wordRef.GetWord(), data.count);
411 } else {
412 WordDictCursor* cursor = dict->Cursor();
413 int ret;
414 String word;
415 WordDictRecord wordinfo;
416 int total = 0;
417 while((ret = dict->Next(cursor, word, wordinfo)) == 0) {
418 key.Set(WORD_KEY_WORD, wordinfo.Id());
419 WordCursor *search = Cursor(key, delete_word, &data);
420 search->Walk();
421 delete search;
422 dict->Decr(word, data.count);
423 total += data.count;
424 data.count = 0;
425 }
426 data.count = total;
427 }
428 return data.count;
429 }
430
431 // *****************************************************************************
432 //
433 // Returns the reference count for word in <count> arg
434 //
Noccurrence(const String & word,unsigned int & noccurrence) const435 int WordListOne::Noccurrence(const String& word, unsigned int& noccurrence) const
436 {
437 return dict->Noccurrence(word, noccurrence);
438 }
439
Key(const String & bufferin)440 WordKey WordListOne::Key(const String& bufferin)
441 {
442 WordKey key(context);
443 StringList fields(bufferin, "\t ");
444 String* field = (String*)fields.Get_First();
445 unsigned int wordid;
446 Dict()->Serial(*field, wordid);
447 field->trunc();
448 (*field) << wordid;
449 key.SetList(fields);
450 return key;
451 }
452
Word(const String & bufferin,int exists)453 WordReference WordListOne::Word(const String& bufferin, int exists /* = 0 */)
454 {
455 WordReference wordRef(context);
456 StringList fields(bufferin, "\t ");
457 String* field = (String*)fields.Get_First();
458 if(context->GetType().Normalize(*field) & WORD_NORMALIZE_NOTOK) {
459 //
460 // If the goal is to build a WordReference object that may not be
461 // in the index, canonicalization failure is not a problem.
462 //
463 if(!exists)
464 fprintf(stderr, "WordListOne::Word: cannot normalize word %s\n", (char*)*field);
465 }
466 String word = *field;
467 unsigned int wordid;
468 if(exists)
469 Dict()->SerialExists(word, wordid);
470 else
471 Dict()->Serial(word, wordid);
472 field->trunc();
473 (*field) << wordid;
474 wordRef.SetList(fields);
475 wordRef.SetWord(word);
476 return wordRef;
477 }
478
479 void
BatchEnd()480 WordListOne::BatchEnd()
481 {
482 if(caches) {
483 caches->Merge(*db);
484 WordList::BatchEnd();
485 }
486 }
487