1 //
2 // htmerge.cc
3 //
4 // htmerge: Merges two databases and/or updates databases to remove
5 // old documents and ensures the databases are consistent.
6 // Calls db.cc, docs.cc, and/or words.cc as necessary
7 //
8 // Part of the ht://Dig package <http://www.htdig.org/>
9 // Copyright (c) 1999-2004 The ht://Dig Group
10 // For copyright details, see the file COPYING in your distribution
11 // or the GNU Library General Public License (LGPL) version 2 or later
12 // <http://www.gnu.org/copyleft/lgpl.html>
13 //
14 // $Id: htmerge.cc,v 1.7 2004/05/28 13:15:25 lha Exp $
15 //
16
17 #ifdef HAVE_CONFIG_H
18 #include "htconfig.h"
19 #endif /* HAVE_CONFIG_H */
20
21 #include "WordContext.h"
22 #include "good_strtok.h"
23 #include "defaults.h"
24 #include "DocumentDB.h"
25 #include "HtURLCodec.h"
26 #include "HtWordList.h"
27 #include "HtWordReference.h"
28 #include "htString.h"
29
30 #ifdef HAVE_STD
31 #include <fstream>
32 #ifdef HAVE_NAMESPACES
33 using namespace std;
34 #endif
35 #else
36 #include <fstream.h>
37 #endif /* HAVE_STD */
38
39 #include <stdio.h>
40
41 #ifndef _MSC_VER /* _WIN32 */
42 #include <unistd.h>
43 #endif
44
45 #include <stdlib.h>
46 #include <ctype.h>
47 #include <string.h>
48
49 // If we have this, we probably want it.
50 #ifdef HAVE_GETOPT_H
51 #include <getopt.h>
52 #elif HAVE_GETOPT_LOCAL
53 #include <getopt_local.h>
54 #endif
55
56
//
// Hash intended to keep track of all the document IDs which have to be
// discarded.  It is meant to be generated from the doc database and used
// to prune words from the word db.
// NOTE(review): nothing in the visible part of this file reads or writes
// discard_list -- confirm whether another translation unit still uses it.
//
Dictionary discard_list;


// Configuration describing the databases to merge in.  main() loads it from
// the file given with -m; mergeDB() reads its doc_index, doc_excerpt, doc_db
// and word_db entries.
HtConfiguration merge_config;

// Verbosity level, incremented once per -v option.  mergeDB() prints a line
// per URL when it is greater than 1.
int verbose = 0;
// NOTE(review): never modified in the visible code (the -s option is
// accepted but ignored) -- presumably a leftover statistics flag.
int stats = 0;

// Component procedures
void mergeDB();                 // merge the -m databases into the main ones
void usage();                   // print the help text, then exit(0)
void reportError(char *msg);    // print "htmerge: <msg>", then exit(1)
76
77 //*****************************************************************************
78 // int main(int ac, char **av)
79 //
main(int ac,char ** av)80 int main(int ac, char **av)
81 {
82 int alt_work_area = 0;
83 String configfile = DEFAULT_CONFIG_FILE;
84 String merge_configfile = 0;
85 int c;
86 extern char *optarg;
87
88 while ((c = getopt(ac, av, "svm:c:dwa")) != -1)
89 {
90 switch (c)
91 {
92 case 'd':
93 break;
94 case 'w':
95 break;
96 case 'c':
97 configfile = optarg;
98 break;
99 case 'm':
100 merge_configfile = optarg;
101 break;
102 case 'v':
103 verbose++;
104 break;
105 case 's':
106 break;
107 case 'a':
108 alt_work_area++;
109 break;
110 case '?':
111 usage();
112 break;
113 }
114 }
115
116 HtConfiguration* config= HtConfiguration::config();
117 config->Defaults(&defaults[0]);
118
119 if (access((char*)configfile, R_OK) < 0)
120 {
121 reportError(form("Unable to find configuration file '%s'",
122 configfile.get()));
123 }
124
125 config->Read(configfile);
126
127 //
128 // Check url_part_aliases and common_url_parts for
129 // errors.
130 String url_part_errors = HtURLCodec::instance()->ErrMsg();
131
132 if (url_part_errors.length() != 0)
133 reportError(form("Invalid url_part_aliases or common_url_parts: %s",
134 url_part_errors.get()));
135
136 if (merge_configfile.length())
137 {
138 merge_config.Defaults(&defaults[0]);
139 if (access((char*)merge_configfile, R_OK) < 0)
140 {
141 reportError(form("Unable to find configuration file '%s'",
142 merge_configfile.get()));
143 }
144 merge_config.Read(merge_configfile);
145 }
146
147 if (alt_work_area != 0)
148 {
149 String configValue;
150
151 configValue = config->Find("word_db");
152 if (configValue.length() != 0)
153 {
154 configValue << ".work";
155 config->Add("word_db", configValue);
156 }
157
158 configValue = config->Find("doc_db");
159 if (configValue.length() != 0)
160 {
161 configValue << ".work";
162 config->Add("doc_db", configValue);
163 }
164
165 configValue = config->Find("doc_index");
166 if (configValue.length() != 0)
167 {
168 configValue << ".work";
169 config->Add("doc_index", configValue);
170 }
171
172 configValue = config->Find("doc_excerpt");
173 if (configValue.length() != 0)
174 {
175 configValue << ".work";
176 config->Add("doc_excerpt", configValue);
177 }
178 }
179
180 WordContext::Initialize(*config);
181
182 if (merge_configfile.length())
183 {
184 // Merge the databases specified in merge_configfile into the current
185 // databases. Do this first then update the other databases as usual
186 // Note: We don't have to specify anything, it's all in the config vars
187
188 mergeDB();
189 }
190
191 return 0;
192 }
193
194 //*****************************************************************************
195 // void mergeDB()
196 //
197 void
mergeDB()198 mergeDB()
199 {
200 HtConfiguration* config= HtConfiguration::config();
201 DocumentDB merge_db, db;
202 List *urls;
203 Dictionary merge_dup_ids, db_dup_ids; // Lists of DocIds to ignore
204 int docIDOffset;
205
206 const String doc_index = config->Find("doc_index");
207 if (access(doc_index, R_OK) < 0)
208 {
209 reportError(form("Unable to open document index '%s'", (const char*)doc_index));
210 }
211 const String doc_excerpt = config->Find("doc_excerpt");
212 if (access(doc_excerpt, R_OK) < 0)
213 {
214 reportError(form("Unable to open document excerpts '%s'", (const char*)doc_excerpt));
215 }
216 const String doc_db = config->Find("doc_db");
217 if (db.Open(doc_db, doc_index, doc_excerpt) < 0)
218 {
219 reportError(form("Unable to open/create document database '%s'",
220 (const char*)doc_db));
221 }
222
223
224 const String merge_doc_index = merge_config["doc_index"];
225 if (access(merge_doc_index, R_OK) < 0)
226 {
227 reportError(form("Unable to open document index '%s'", (const char*)merge_doc_index));
228 }
229 const String merge_doc_excerpt = merge_config["doc_excerpt"];
230 if (access(merge_doc_excerpt, R_OK) < 0)
231 {
232 reportError(form("Unable to open document excerpts '%s'", (const char*)merge_doc_excerpt));
233 }
234 const String merge_doc_db = merge_config["doc_db"];
235 if (merge_db.Open(merge_doc_db, merge_doc_index, merge_doc_excerpt) < 0)
236 {
237 reportError(form("Unable to open document database '%s'",
238 (const char*)merge_doc_db));
239 }
240
241 // Start the merging by going through all the URLs that are in
242 // the database to be merged
243
244 urls = merge_db.URLs();
245 // This ensures that every document added from merge_db has a unique ID
246 // in the new database
247 docIDOffset = db.NextDocID();
248
249 urls->Start_Get();
250 String *url;
251 String id;
252 while ((url = (String *) urls->Get_Next()))
253 {
254 DocumentRef *ref = merge_db[url->get()];
255 DocumentRef *old_ref = db[url->get()];
256 if (!ref)
257 continue;
258
259 if (old_ref)
260 {
261 // Oh well, we knew this would happen. Let's get the duplicate
262 // And we'll only use the most recent date.
263
264 if ( old_ref->DocTime() >= ref->DocTime() )
265 {
266 // Cool, the ref we're merging is too old, just ignore it
267 char str[20];
268 sprintf(str, "%d", ref->DocID());
269 merge_dup_ids.Add(str, 0);
270
271 if (verbose > 1)
272 {
273 cout << "htmerge: Duplicate, URL: " << url << " ignoring merging copy \n";
274 cout.flush();
275 }
276 }
277 else
278 {
279 // The ref we're merging is newer, delete the old one and add
280 char str[20];
281 sprintf(str, "%d", old_ref->DocID());
282 db_dup_ids.Add(str, 0);
283 db.Delete(old_ref->DocID());
284 ref->DocID(ref->DocID() + docIDOffset);
285 db.Add(*ref);
286 if (verbose > 1)
287 {
288 cout << "htmerge: Duplicate, URL: ";
289 cout << url->get() << " ignoring destination copy \n";
290 cout.flush();
291 }
292 }
293 }
294 else
295 {
296 // It's a new URL, just add it, making sure to load the excerpt
297 merge_db.ReadExcerpt(*ref);
298 ref->DocID(ref->DocID() + docIDOffset);
299 db.Add(*ref);
300 if (verbose > 1)
301 {
302 cout << "htmerge: Merged URL: " << url->get() << " \n";
303 cout.flush();
304 }
305 }
306 delete ref;
307 delete old_ref;
308 }
309 delete urls;
310
311 // As reported by Roman Dimov, we must update db.NextDocID()
312 // because of all the added records...
313 db.IncNextDocID( merge_db.NextDocID() );
314 merge_db.Close();
315 db.Close();
316
317 // OK, after merging the doc DBs, we do the same for the words
318 HtWordList mergeWordDB(*config), wordDB(*config);
319 List *words;
320 String docIDKey;
321
322 if (wordDB.Open(config->Find("word_db"), O_RDWR) < 0)
323 {
324 reportError(form("Unable to open/create document database '%s'",
325 (const char*)config->Find("word_db")));
326 }
327
328 if (mergeWordDB.Open(merge_config["word_db"], O_RDONLY) < 0)
329 {
330 reportError(form("Unable to open document database '%s'",
331 (const char *)merge_config["word_db"]));
332 }
333
334 // Start the merging by going through all the URLs that are in
335 // the database to be merged
336
337 words = mergeWordDB.WordRefs();
338
339 words->Start_Get();
340 HtWordReference *word;
341 while ((word = (HtWordReference *) words->Get_Next()))
342 {
343 docIDKey = word->DocID();
344 if (merge_dup_ids.Exists(docIDKey))
345 continue;
346
347 word->DocID(word->DocID() + docIDOffset);
348 wordDB.Override(*word);
349 }
350 delete words;
351
352 words = wordDB.WordRefs();
353 words->Start_Get();
354 while ((word = (HtWordReference *) words->Get_Next()))
355 {
356 docIDKey = word->DocID();
357 if (db_dup_ids.Exists(docIDKey))
358 wordDB.Delete(*word);
359 }
360 delete words;
361
362 // Cleanup--just close the two word databases
363 mergeWordDB.Close();
364 wordDB.Close();
365 }
366
367
368 //*****************************************************************************
369 // void usage()
370 // Display program usage information
371 //
usage()372 void usage()
373 {
374 cout << "usage: htmerge [-v][-c configfile][-m merge_configfile]\n";
375 cout << "This program is part of ht://Dig " << VERSION << "\n\n";
376 cout << "Options:\n";
377 cout << "\t-v\tVerbose mode. This increases the verbosity of the\n";
378 cout << "\t\tprogram. Using more than 2 is probably only useful\n";
379 cout << "\t\tfor debugging purposes. The default verbose mode\n";
380 cout << "\t\tgives a progress on what it is doing and where it is.\n\n";
381 cout << "\t-m merge_configfile\n";
382 cout << "\t\tMerge the databases specified into the databases specified\n";
383 cout << "\t\tby -c or the default.\n\n";
384 cout << "\t-c configfile\n";
385 cout << "\t\tUse the specified configuration file instead on the\n";
386 cout << "\t\tdefault.\n\n";
387 cout << "\t-a\tUse alternate work files.\n";
388 cout << "\t\tTells htmerge to append .work to database files causing\n";
389 cout << "\t\ta second copy of the database to be built. This allows\n";
390 cout << "\t\toriginal files to be used by htsearch during the indexing\n";
391 cout << "\t\trun.\n\n";
392 exit(0);
393 }
394
395
396 //*****************************************************************************
397 // Report an error and die
398 //
reportError(char * msg)399 void reportError(char *msg)
400 {
401 cout << "htmerge: " << msg << "\n\n";
402 exit(1);
403 }
404