1 //
2 // htmerge.cc
3 //
4 // htmerge: Merges two databases and/or updates databases to remove
5 //          old documents and ensures the databases are consistent.
6 //          Calls db.cc, docs.cc, and/or words.cc as necessary
7 //
8 // Part of the ht://Dig package   <http://www.htdig.org/>
9 // Copyright (c) 1999-2004 The ht://Dig Group
10 // For copyright details, see the file COPYING in your distribution
11 // or the GNU Library General Public License (LGPL) version 2 or later
12 // <http://www.gnu.org/copyleft/lgpl.html>
13 //
14 // $Id: htmerge.cc,v 1.7 2004/05/28 13:15:25 lha Exp $
15 //
16 
17 #ifdef HAVE_CONFIG_H
18 #include "htconfig.h"
19 #endif /* HAVE_CONFIG_H */
20 
21 #include "WordContext.h"
22 #include "good_strtok.h"
23 #include "defaults.h"
24 #include "DocumentDB.h"
25 #include "HtURLCodec.h"
26 #include "HtWordList.h"
27 #include "HtWordReference.h"
28 #include "htString.h"
29 
30 #ifdef HAVE_STD
31 #include <fstream>
32 #ifdef HAVE_NAMESPACES
33 using namespace std;
34 #endif
35 #else
36 #include <fstream.h>
37 #endif /* HAVE_STD */
38 
39 #include <stdio.h>
40 
41 #ifndef _MSC_VER /* _WIN32 */
42 #include <unistd.h>
43 #endif
44 
45 #include <stdlib.h>
46 #include <ctype.h>
47 #include <string.h>
48 
49 // If we have this, we probably want it.
50 #ifdef HAVE_GETOPT_H
51 #include <getopt.h>
52 #elif HAVE_GETOPT_LOCAL
53 #include <getopt_local.h>
54 #endif
55 
56 
//
// This hash is used to keep track of all the document IDs which have to be
// discarded.
// This is generated from the doc database and is used to prune words
// from the word db.
// NOTE(review): nothing in this file reads or writes discard_list;
// presumably it is referenced from other translation units -- confirm
// before removing it.
//
Dictionary    discard_list;


// Configuration of the databases to merge in (the file given with -m).
// main() loads it; mergeDB() reads the "doc_index", "doc_excerpt", "doc_db"
// and "word_db" entries from it.
HtConfiguration    merge_config;

// Verbosity level: incremented once per -v option.  mergeDB() prints
// per-URL progress messages when it is greater than 1.
int		verbose = 0;
// Initialised to 0 and not otherwise used in this file.
// NOTE(review): possibly referenced from other translation units -- confirm.
int		stats = 0;

// Component procedures
void mergeDB();			// merge the -m databases into the main ones
void usage();			// print the help text and exit(0)
void reportError(char *msg);	// print "htmerge: <msg>" and exit(1)
76 
77 //*****************************************************************************
78 // int main(int ac, char **av)
79 //
main(int ac,char ** av)80 int main(int ac, char **av)
81 {
82     int			alt_work_area = 0;
83     String		configfile = DEFAULT_CONFIG_FILE;
84     String              merge_configfile = 0;
85     int			c;
86     extern char		*optarg;
87 
88     while ((c = getopt(ac, av, "svm:c:dwa")) != -1)
89     {
90 	switch (c)
91 	{
92 	    case 'd':
93 		break;
94 	    case 'w':
95 		break;
96 	    case 'c':
97 		configfile = optarg;
98 		break;
99 	    case 'm':
100 	      	merge_configfile = optarg;
101 	      	break;
102 	    case 'v':
103 		verbose++;
104 		break;
105 	    case 's':
106 		break;
107 	    case 'a':
108 		alt_work_area++;
109 		break;
110 	    case '?':
111 		usage();
112 		break;
113 	}
114     }
115 
116 	HtConfiguration* config= HtConfiguration::config();
117     config->Defaults(&defaults[0]);
118 
119     if (access((char*)configfile, R_OK) < 0)
120     {
121 	reportError(form("Unable to find configuration file '%s'",
122 			 configfile.get()));
123     }
124 
125     config->Read(configfile);
126 
127     //
128     // Check url_part_aliases and common_url_parts for
129     // errors.
130     String url_part_errors = HtURLCodec::instance()->ErrMsg();
131 
132     if (url_part_errors.length() != 0)
133       reportError(form("Invalid url_part_aliases or common_url_parts: %s",
134                        url_part_errors.get()));
135 
136     if (merge_configfile.length())
137     {
138     	merge_config.Defaults(&defaults[0]);
139 	if (access((char*)merge_configfile, R_OK) < 0)
140     	{
141 	reportError(form("Unable to find configuration file '%s'",
142 			 merge_configfile.get()));
143     	}
144 	merge_config.Read(merge_configfile);
145     }
146 
147     if (alt_work_area != 0)
148     {
149 	String	configValue;
150 
151 	configValue = config->Find("word_db");
152 	if (configValue.length() != 0)
153 	{
154 	    configValue << ".work";
155 	    config->Add("word_db", configValue);
156 	}
157 
158 	configValue = config->Find("doc_db");
159 	if (configValue.length() != 0)
160 	{
161 	    configValue << ".work";
162 	    config->Add("doc_db", configValue);
163 	}
164 
165 	configValue = config->Find("doc_index");
166 	if (configValue.length() != 0)
167 	{
168 	    configValue << ".work";
169 	    config->Add("doc_index", configValue);
170 	}
171 
172 	configValue = config->Find("doc_excerpt");
173 	if (configValue.length() != 0)
174 	{
175 	    configValue << ".work";
176 	    config->Add("doc_excerpt", configValue);
177 	}
178     }
179 
180     WordContext::Initialize(*config);
181 
182     if (merge_configfile.length())
183     {
184 	// Merge the databases specified in merge_configfile into the current
185 	// databases. Do this first then update the other databases as usual
186 	// Note: We don't have to specify anything, it's all in the config vars
187 
188 	mergeDB();
189     }
190 
191     return 0;
192 }
193 
194 //*****************************************************************************
195 // void mergeDB()
196 //
197 void
mergeDB()198 mergeDB()
199 {
200 	HtConfiguration* config= HtConfiguration::config();
201     DocumentDB	merge_db, db;
202     List	*urls;
203     Dictionary  merge_dup_ids, db_dup_ids; // Lists of DocIds to ignore
204     int         docIDOffset;
205 
206     const String doc_index = config->Find("doc_index");
207     if (access(doc_index, R_OK) < 0)
208     {
209 	reportError(form("Unable to open document index '%s'", (const char*)doc_index));
210     }
211     const String doc_excerpt = config->Find("doc_excerpt");
212     if (access(doc_excerpt, R_OK) < 0)
213     {
214 	reportError(form("Unable to open document excerpts '%s'", (const char*)doc_excerpt));
215     }
216     const String doc_db = config->Find("doc_db");
217     if (db.Open(doc_db, doc_index, doc_excerpt) < 0)
218     {
219 	reportError(form("Unable to open/create document database '%s'",
220 			 (const char*)doc_db));
221     }
222 
223 
224     const String merge_doc_index = merge_config["doc_index"];
225     if (access(merge_doc_index, R_OK) < 0)
226     {
227 	reportError(form("Unable to open document index '%s'", (const char*)merge_doc_index));
228     }
229     const String merge_doc_excerpt = merge_config["doc_excerpt"];
230     if (access(merge_doc_excerpt, R_OK) < 0)
231     {
232 	reportError(form("Unable to open document excerpts '%s'", (const char*)merge_doc_excerpt));
233     }
234     const String merge_doc_db = merge_config["doc_db"];
235     if (merge_db.Open(merge_doc_db, merge_doc_index, merge_doc_excerpt) < 0)
236     {
237 	reportError(form("Unable to open document database '%s'",
238 			 (const char*)merge_doc_db));
239     }
240 
241     // Start the merging by going through all the URLs that are in
242     // the database to be merged
243 
244     urls = merge_db.URLs();
245     // This ensures that every document added from merge_db has a unique ID
246     // in the new database
247     docIDOffset = db.NextDocID();
248 
249     urls->Start_Get();
250     String		*url;
251     String		id;
252     while ((url = (String *) urls->Get_Next()))
253     {
254 	DocumentRef	*ref = merge_db[url->get()];
255 	DocumentRef     *old_ref = db[url->get()];
256 	if (!ref)
257 	    continue;
258 
259 	if (old_ref)
260 	  {
261 	    // Oh well, we knew this would happen. Let's get the duplicate
262 	    // And we'll only use the most recent date.
263 
264 	    if ( old_ref->DocTime() >= ref->DocTime() )
265 	      {
266 		// Cool, the ref we're merging is too old, just ignore it
267 		char        str[20];
268 		sprintf(str, "%d", ref->DocID());
269 		merge_dup_ids.Add(str, 0);
270 
271 		if (verbose > 1)
272 		  {
273 		    cout << "htmerge: Duplicate, URL: " << url << " ignoring merging copy   \n";
274 		    cout.flush();
275 		  }
276 	      }
277 	    else
278 	      {
279 		// The ref we're merging is newer, delete the old one and add
280 		char        str[20];
281 		sprintf(str, "%d", old_ref->DocID());
282 		db_dup_ids.Add(str, 0);
283 		db.Delete(old_ref->DocID());
284 		ref->DocID(ref->DocID() + docIDOffset);
285 		db.Add(*ref);
286                 if (verbose > 1)
287                   {
288                     cout << "htmerge: Duplicate, URL: ";
289 		    cout << url->get() << " ignoring destination copy   \n";
290                     cout.flush();
291                   }
292 	      }
293 	  }
294 	else
295 	  {
296 	    // It's a new URL, just add it, making sure to load the excerpt
297 	    merge_db.ReadExcerpt(*ref);
298 	    ref->DocID(ref->DocID() + docIDOffset);
299 	    db.Add(*ref);
300 	    if (verbose > 1)
301 	      {
302 		cout << "htmerge: Merged URL: " << url->get() << "    \n";
303 		cout.flush();
304 	      }
305 	  }
306         delete ref;
307 	delete old_ref;
308     }
309     delete urls;
310 
311     // As reported by Roman Dimov, we must update db.NextDocID()
312     // because of all the added records...
313     db.IncNextDocID( merge_db.NextDocID() );
314     merge_db.Close();
315     db.Close();
316 
317     // OK, after merging the doc DBs, we do the same for the words
318     HtWordList	mergeWordDB(*config), wordDB(*config);
319     List	*words;
320     String	docIDKey;
321 
322     if (wordDB.Open(config->Find("word_db"), O_RDWR) < 0)
323     {
324 	reportError(form("Unable to open/create document database '%s'",
325 			 (const char*)config->Find("word_db")));
326     }
327 
328     if (mergeWordDB.Open(merge_config["word_db"], O_RDONLY) < 0)
329     {
330 	reportError(form("Unable to open document database '%s'",
331 			 (const char *)merge_config["word_db"]));
332     }
333 
334     // Start the merging by going through all the URLs that are in
335     // the database to be merged
336 
337     words = mergeWordDB.WordRefs();
338 
339     words->Start_Get();
340     HtWordReference   *word;
341     while ((word = (HtWordReference *) words->Get_Next()))
342     {
343       docIDKey = word->DocID();
344       if (merge_dup_ids.Exists(docIDKey))
345       continue;
346 
347       word->DocID(word->DocID() + docIDOffset);
348       wordDB.Override(*word);
349     }
350     delete words;
351 
352     words = wordDB.WordRefs();
353     words->Start_Get();
354     while ((word = (HtWordReference *) words->Get_Next()))
355     {
356       docIDKey = word->DocID();
357       if (db_dup_ids.Exists(docIDKey))
358       wordDB.Delete(*word);
359     }
360     delete words;
361 
362     // Cleanup--just close the two word databases
363     mergeWordDB.Close();
364     wordDB.Close();
365 }
366 
367 
368 //*****************************************************************************
369 // void usage()
370 //   Display program usage information
371 //
usage()372 void usage()
373 {
374     cout << "usage: htmerge [-v][-c configfile][-m merge_configfile]\n";
375     cout << "This program is part of ht://Dig " << VERSION << "\n\n";
376     cout << "Options:\n";
377     cout << "\t-v\tVerbose mode.  This increases the verbosity of the\n";
378     cout << "\t\tprogram.  Using more than 2 is probably only useful\n";
379     cout << "\t\tfor debugging purposes.  The default verbose mode\n";
380     cout << "\t\tgives a progress on what it is doing and where it is.\n\n";
381     cout << "\t-m merge_configfile\n";
382     cout << "\t\tMerge the databases specified into the databases specified\n";
383     cout << "\t\tby -c or the default.\n\n";
384     cout << "\t-c configfile\n";
385     cout << "\t\tUse the specified configuration file instead on the\n";
386     cout << "\t\tdefault.\n\n";
387     cout << "\t-a\tUse alternate work files.\n";
388     cout << "\t\tTells htmerge to append .work to database files causing\n";
389     cout << "\t\ta second copy of the database to be built.  This allows\n";
390     cout << "\t\toriginal files to be used by htsearch during the indexing\n";
391     cout << "\t\trun.\n\n";
392     exit(0);
393 }
394 
395 
//*****************************************************************************
// void reportError(char *msg)
//   Writes "htmerge: " followed by msg and a blank line to standard
//   output, then ends the program with exit status 1.  Does not return.
//
void reportError(char *msg)
{
    cout << "htmerge: ";
    cout << msg;
    cout << "\n\n";

    exit(1);
}
404