1 //
2 // WordListOne.cc
3 //
4 // Part of the ht://Dig package   <http://www.htdig.org/>
5 // Copyright (c) 1999, 2000, 2001 The ht://Dig Group
6 // For copyright details, see the file COPYING in your distribution
7 // or the GNU General Public License version 2 or later
8 // <http://www.gnu.org/copyleft/gpl.html>
9 //
10 // $Id: WordListOne.cc,v 1.23 2001/06/29 14:14:08 loic Exp $
11 //
12 
13 #ifdef HAVE_CONFIG_H
14 #include "config.h"
15 #endif /* HAVE_CONFIG_H */
16 
17 #include "WordListOne.h"
18 #include "WordReference.h"
19 #include "WordRecord.h"
20 #include "WordType.h"
21 #include "WordContext.h"
22 #include "Configuration.h"
23 #include "htString.h"
24 #include "HtTime.h"
25 #include "WordDBCompress.h"
26 #include "WordDBCache.h"
27 #include "WordDead.h"
28 #include "WordMeta.h"
29 
30 #include <stdio.h>
31 #include <stdlib.h>
32 #include <unistd.h>
33 #include <ctype.h>
34 #include <errno.h>
35 
36 // *****************************************************************************
37 //
WordListOne(WordContext * ncontext)38 WordListOne::WordListOne(WordContext* ncontext)
39 {
40   context = ncontext;
41   db = new WordDB(ncontext->GetDBInfo());
42   dict = new WordDict();
43   dict->Initialize(this);
44   meta = new WordMeta();
45   meta->Initialize(this);
46   dead = new WordDead();
47   dead->Initialize(this);
48 
49   // The database itself hasn't been opened yet
50   isopen = 0;
51   Configuration& config = context->GetConfiguration();
52   extended = config.Boolean("wordlist_extend");
53   verbose =  config.Value("wordlist_verbose");
54   compressor = 0;
55   caches = 0;
56   flags = 0;
57 }
58 
59 // *****************************************************************************
60 //
~WordListOne()61 WordListOne::~WordListOne()
62 {
63   BatchEnd();
64   Close();
65   delete dead;
66   delete meta;
67   delete dict;
68   delete db;
69 }
70 
word_db_qcmp(WordContext * context,const WordDBCacheEntry * a,const WordDBCacheEntry * b)71 static int word_db_qcmp(WordContext* context, const WordDBCacheEntry *a, const WordDBCacheEntry *b)
72 {
73   return WordKey::Compare(context, (const unsigned char*)a->key, a->key_size, (const unsigned char*)b->key, b->key_size);
74 }
75 
76 // *****************************************************************************
77 //
Open(const String & nfilename,int mode)78 int WordListOne::Open(const String& nfilename, int mode)
79 {
80   filename = nfilename;
81 
82   int usecompress = 0;
83   Configuration& config = context->GetConfiguration();
84 
85   if(config.Boolean("wordlist_compress") == 1) {
86     usecompress = DB_COMPRESS;
87     WordDBCompress* compressor = new WordDBCompress(context);
88     //      compressor->debug = config.Value("wordlist_compress_debug");
89     SetCompressor(compressor);
90 
91     context->GetDBInfo().dbenv->mp_cmpr_info = compressor->CmprInfo();
92     context->GetDBInfo().dbenv->flags |= DB_ENV_CMPR;
93   }
94 
95   flags = (mode & O_RDWR) ? DB_CREATE : DB_RDONLY;
96   flags |= usecompress;
97   if(mode & O_TRUNC) {
98     if(mode & O_RDWR) {
99       unlink((char*)filename);
100     } else
101       fprintf(stderr, "WordListOne::Open: O_TRUNC | O_RDONLY is meaningless\n");
102   }
103 
104   WordLock* lock;
105   Meta()->Lock("open", lock);
106 
107   db->set_bt_compare(word_db_cmp, (void*)context);
108 
109   if(config.Boolean("wordlist_cache_inserts", 0)) {
110     int size = config.Value("wordlist_cache_size", 0);
111     if(size / 2 < WORD_DB_CACHE_MINIMUM)
112       size = 0;
113     else
114       size /= 2;
115 
116     db->CacheOn(context, size);
117     db->CacheCompare(word_db_qcmp);
118   }
119 
120   db->set_pagesize(Pagesize());
121 
122   int ret = db->Open(filename, "index", DB_BTREE, flags, 0666, WORD_DB_INDEX) == 0 ? OK : NOTOK;
123   if(ret == NOTOK) return ret;
124   if(dict->Open() != OK) return NOTOK;
125   if(meta->Open() != OK) return NOTOK;
126   if(dead->Open() != OK) return NOTOK;
127 
128   isopen = 1;
129 
130   Meta()->Unlock("open", lock);
131 
132   return ret;
133 }
134 
135 // *****************************************************************************
136 //
Close()137 int WordListOne::Close()
138 {
139   if(isopen) {
140     if(db->Close() != 0) return NOTOK;
141     if(dict->Close() != 0) return NOTOK;
142     if(meta->Close() != 0) return NOTOK;
143     if(dead->Close() != 0) return NOTOK;
144     isopen = 0;
145   }
146 
147   {
148     WordDBCompress* compressor = GetCompressor();
149     if(compressor) {
150       delete compressor;
151       SetCompressor(0);
152     }
153     delete context->GetDBInfo().dbenv->mp_cmpr_info;
154     context->GetDBInfo().dbenv->mp_cmpr_info = 0;
155     context->GetDBInfo().dbenv->flags &= ~DB_ENV_CMPR;
156   }
157 
158   return OK;
159 }
160 
161 // ****************************************************************************
162 //
Size() const163 unsigned int WordListOne::Size() const
164 {
165   return db->Size();
166 }
167 
168 // ****************************************************************************
169 //
Override(const WordReference & arg)170 int WordListOne::Override(const WordReference& arg)
171 {
172   if (arg.GetWord().length() == 0) {
173     fprintf(stderr, "WordListOne::Override(%s) word is zero length\n", (char*)arg.Get());
174     return NOTOK;
175   }
176   if (!arg.Key().Filled()) {
177     fprintf(stderr, "WordListOne::Override(%s) key is not fully defined\n", (char*)arg.Get());
178     return NOTOK;
179   }
180 
181   WordType& wtype = context->GetType();
182   WordReference	wordRef(arg);
183   String 	word = wordRef.GetWord();
184   if(wtype.Normalize(word) & WORD_NORMALIZE_NOTOK)
185     return NOTOK;
186   wordRef.SetWord(word);
187   unsigned int wordid = 0;
188   if(dict->SerialRef(word, wordid) != OK) return NOTOK;
189   wordRef.Key().Set(WORD_KEY_WORD, wordid);
190 
191   int ret = NOTOK;
192 
193   if(caches) {
194     String key;
195     String record;
196     if(wordRef.Pack(key, record) != OK)
197       return NOTOK;
198     ret = caches->Add(key.get(), key.length(), record.get(), record.length()) == 0 ? OK : NOTOK;
199     if(caches->Full()) caches->Merge(*db);
200   } else {
201     ret = db->Put(wordRef, 0) == 0 ? OK : NOTOK;
202   }
203 
204   return ret;
205 }
206 
207 
208 // *****************************************************************************
209 //
operator [](const WordReference & wordRef)210 List *WordListOne::operator [] (const WordReference& wordRef)
211 {
212   return Collect(wordRef);
213 }
214 
215 // *****************************************************************************
216 //
Prefix(const WordReference & prefix)217 List *WordListOne::Prefix (const WordReference& prefix)
218 {
219   List* result = new List();
220   WordDictCursor* cursor = Dict()->CursorPrefix(prefix.GetWord());
221   String word;
222   WordDictRecord record;
223   WordReference prefix2(prefix);
224   while(Dict()->NextPrefix(cursor, word, record) == 0) {
225     prefix2.Key().Set(WORD_KEY_WORD, record.Id());
226     List* tmp_result = Collect(prefix2);
227     while(tmp_result->Count() > 0) {
228       WordReference* entry = (WordReference*)tmp_result->Shift(LIST_REMOVE_RELEASE);
229       entry->SetWord(word);
230       result->Push(entry);
231     }
232     delete tmp_result;
233   }
234   return result;
235 }
236 
237 // *****************************************************************************
238 //
WordRefs()239 List *WordListOne::WordRefs()
240 {
241   return Collect(WordReference(context));
242 }
243 
244 // *****************************************************************************
245 //
Collect(const WordReference & wordRef)246 List *WordListOne::Collect(const WordReference& wordRef)
247 {
248   WordCursor *search = Cursor(wordRef.Key(), HTDIG_WORDLIST_COLLECTOR);
249   if(search->Walk() != OK) return 0;
250   List* result = search->GetResults();
251   delete search;
252   return result;
253 }
254 
255 // *****************************************************************************
256 //
257 int
Read(FILE * f)258 WordListOne::Read(FILE* f)
259 {
260   WordReference wordRef(context);
261 #define WORD_BUFFER_SIZE	1024
262   char buffer[WORD_BUFFER_SIZE + 1];
263   String line;
264   int line_number = 0;
265   int inserted = 0;
266 
267   BatchStart();
268 
269   String key;
270   String record;
271 
272   while(fgets(buffer, WORD_BUFFER_SIZE, f)) {
273     line_number++;
274     int buffer_length = strlen(buffer);
275     int eol = buffer[buffer_length - 1] == '\n';
276 
277     if(eol) buffer[--buffer_length] = '\0';
278 
279     line.append(buffer, buffer_length);
280     //
281     // Join big lines
282     //
283     if(!eol) continue;
284     //
285     // If line ends with a \ continue
286     //
287     if(line.last() == '\\') {
288       line.chop(1);
289       continue;
290     }
291 
292     if(!line.empty()) {
293       StringList fields(line, "\t ");
294 
295       //
296       // Convert the word to a wordid
297       //
298       String* word = (String*)fields.Get_First();
299       unsigned int wordid;
300       if(dict->SerialRef(*word, wordid) != OK) return NOTOK;
301       word->trunc();
302       (*word) << wordid;
303 
304       if(wordRef.SetList(fields) != OK) {
305 	fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
306 	fprintf(stderr, " cannot build WordReference (ignored)\n");
307       } else {
308 	if(wordRef.Pack(key, record) != OK) {
309 	  fprintf(stderr, "WordList::Read: line %d : %s\n", line_number, (char*)line);
310 	  fprintf(stderr, " pack failed (ignored)\n");
311 	} else {
312 	  caches->Add(key.get(), key.length(), record.get(), record.length());
313 	  inserted++;
314 	}
315 	if(verbose && (inserted % 10000 == 0)) fprintf(stderr, "WordList::Read: inserted %d entries\n", inserted);
316 	if(verbose > 1) fprintf(stderr, "WordList::Read: inserting %s\n", (char*)wordRef.Get());
317       }
318 
319       line.trunc();
320     }
321   }
322 
323   BatchEnd();
324 
325   return inserted;
326 }
327 
328 // *****************************************************************************
329 //
330 // streaming operators for ascii dumping and reading a list
331 class FileOutData : public Object
332 {
333 public:
334   FILE* f;
335   String word;
FileOutData(FILE * f_arg)336   FileOutData(FILE* f_arg) : f(f_arg) { }
337 };
338 
339 // *****************************************************************************
340 //
341 static int
wordlist_walk_callback_file_out(WordList *,WordDBCursor &,const WordReference * wordRef,Object & ndata)342 wordlist_walk_callback_file_out(WordList *, WordDBCursor& , const WordReference *wordRef, Object &ndata)
343 {
344   FileOutData& data = (FileOutData&)ndata;
345   ((WordReference*)wordRef)->SetWord(data.word);
346   fprintf(data.f, "%s\n", (char*)wordRef->Get());
347   return OK;
348 }
349 
Write(FILE * f)350 int WordListOne::Write(FILE* f)
351 {
352   FileOutData data(f);
353   WordDictCursor* cursor = dict->Cursor();
354   int ret;
355   String word;
356   WordDictRecord wordinfo;
357   while((ret = dict->Next(cursor, word, wordinfo)) == 0) {
358     WordKey key(context);
359     key.Set(WORD_KEY_WORD, wordinfo.Id());
360     data.word = word;
361     WordCursor *search = Cursor(key, wordlist_walk_callback_file_out, (Object *)&data);
362     search->Walk();
363     delete search;
364   }
365   return ret == DB_NOTFOUND ? OK : NOTOK;
366 }
367 
368 
369 // *****************************************************************************
370 //
// Callback data dedicated to WalkDelete and delete_word communication
372 //
373 class DeleteWordData : public Object
374 {
375 public:
DeleteWordData()376   DeleteWordData() { count = 0; }
377 
378   int count;
379 };
380 
381 // *****************************************************************************
382 //
383 //
delete_word(WordList * words,WordDBCursor & cursor,const WordReference * word,Object & data)384 static int delete_word(WordList *words, WordDBCursor &cursor, const WordReference *word, Object &data)
385 {
386   WordListOne *words_one = (WordListOne*)words;
387   if(words_one->DeleteCursor(cursor) == 0) {
388     ((DeleteWordData&)data).count++;
389     return OK;
390   } else {
391     fprintf(stderr, "WordList delete_word: deleting %s failed\n", (char*)word->Get());
392     return NOTOK;
393   }
394 }
395 
396 // *****************************************************************************
397 //
398 // Delete all records matching wordRef, return the number of
399 // deleted records.
400 //
WalkDelete(const WordReference & wordRef)401 int WordListOne::WalkDelete(const WordReference& wordRef)
402 {
403   DeleteWordData data;
404   WordKey key = wordRef.Key();
405 
406   if(key.IsDefined(WORD_KEY_WORD)) {
407     WordCursor *description = Cursor(key, delete_word, &data);
408     description->Walk();
409     delete description;
410     dict->Decr(wordRef.GetWord(), data.count);
411   } else {
412     WordDictCursor* cursor = dict->Cursor();
413     int ret;
414     String word;
415     WordDictRecord wordinfo;
416     int total = 0;
417     while((ret = dict->Next(cursor, word, wordinfo)) == 0) {
418       key.Set(WORD_KEY_WORD, wordinfo.Id());
419       WordCursor *search = Cursor(key, delete_word, &data);
420       search->Walk();
421       delete search;
422       dict->Decr(word, data.count);
423       total += data.count;
424       data.count = 0;
425     }
426     data.count = total;
427   }
428   return data.count;
429 }
430 
431 // *****************************************************************************
432 //
433 // Returns the reference count for word in <count> arg
434 //
Noccurrence(const String & word,unsigned int & noccurrence) const435 int WordListOne::Noccurrence(const String& word, unsigned int& noccurrence) const
436 {
437   return dict->Noccurrence(word, noccurrence);
438 }
439 
Key(const String & bufferin)440 WordKey WordListOne::Key(const String& bufferin)
441 {
442   WordKey key(context);
443   StringList fields(bufferin, "\t ");
444   String* field = (String*)fields.Get_First();
445   unsigned int wordid;
446   Dict()->Serial(*field, wordid);
447   field->trunc();
448   (*field) << wordid;
449   key.SetList(fields);
450   return key;
451 }
452 
Word(const String & bufferin,int exists)453 WordReference WordListOne::Word(const String& bufferin, int exists /* = 0 */)
454 {
455   WordReference wordRef(context);
456   StringList fields(bufferin, "\t ");
457   String* field = (String*)fields.Get_First();
458   if(context->GetType().Normalize(*field) & WORD_NORMALIZE_NOTOK) {
459     //
460     // If the goal is to build a WordReference object that may not be
461     // in the index, canonicalization failure is not a problem.
462     //
463     if(!exists)
464       fprintf(stderr, "WordListOne::Word: cannot normalize word %s\n", (char*)*field);
465   }
466   String word = *field;
467   unsigned int wordid;
468   if(exists)
469     Dict()->SerialExists(word, wordid);
470   else
471     Dict()->Serial(word, wordid);
472   field->trunc();
473   (*field) << wordid;
474   wordRef.SetList(fields);
475   wordRef.SetWord(word);
476   return wordRef;
477 }
478 
479 void
BatchEnd()480 WordListOne::BatchEnd()
481 {
482   if(caches) {
483     caches->Merge(*db);
484     WordList::BatchEnd();
485   }
486 }
487