1 /** @file xapian-chert-update.cc
2  * @brief Update a chert database to the new format keys
3  */
4 /* Copyright (C) 2003,2004,2005,2006,2007,2008,2009,2011,2013 Olly Betts
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation; either version 2 of the
9  * License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
19  * USA
20  */
21 
22 #include <config.h>
23 
24 #include "safeerrno.h"
25 
26 #include <iostream>
27 
28 #include <cstdio> // for rename()
29 #include <cstdlib>
30 #include <cstring>
31 #include "safesysstat.h"
32 #include <sys/types.h>
33 #include "utils.h"
34 
35 #include "chert_table.h"
36 
37 #include "flint_table.h"
38 #include "flint_cursor.h"
39 #include "flint_utils.h"
40 #include "pack.h"
41 
42 #include "safeunistd.h"
43 #include "safefcntl.h"
44 
45 #ifdef __WIN32__
46 # include "safewindows.h"
47 #endif
48 
49 #include "stringutils.h"
50 
51 #include <xapian.h>
52 
53 #include "gnu_getopt.h"
54 
55 using namespace std;
56 
// Program name and one-line description.  These stay as macros because they
// are pasted together with adjacent string literals where they are used.
#define PROG_NAME "chert-update"
#define PROG_DESC "Update a chert database to the new format keys"

// Codes returned by the option parser for options which only have a long
// form.  (OPT_NO_RENUMBER isn't referenced by the option table in this file.)
enum {
    OPT_HELP = 1,
    OPT_VERSION = 2,
    OPT_NO_RENUMBER = 3
};
63 
show_usage()64 static void show_usage() {
65     cout << "Usage: " PROG_NAME " [OPTIONS] SOURCE_DATABASE DESTINATION_DATABASE\n\n"
66 "Options:\n"
67 "  -b                Set the blocksize in bytes (e.g. 4096) or K (e.g. 4K)\n"
68 "                    (must be between 2K and 64K and a power of 2, default 8K)\n"
69 "  --help            display this help and exit\n"
70 "  --version         output version information and exit" << endl;
71 }
72 
73 /// Append filename argument arg to command cmd with suitable escaping.
74 static bool
append_filename_argument(string & cmd,const string & arg)75 append_filename_argument(string & cmd, const string & arg) {
76 #ifdef __WIN32__
77     cmd.reserve(cmd.size() + arg.size() + 3);
78     cmd += " \"";
79     for (string::const_iterator i = arg.begin(); i != arg.end(); ++i) {
80 	if (*i == '/') {
81 	    // Convert Unix path separators to backslashes.  C library
82 	    // functions understand "/" in paths, but we are going to
83 	    // call commands like "deltree" or "rd" which don't.
84 	    cmd += '\\';
85 	} else if (*i < 32 || strchr("<>\"|*?", *i)) {
86 	    // Check for illegal characters in filename.
87 	    return false;
88 	} else {
89 	    cmd += *i;
90 	}
91     }
92     cmd += '"';
93 #else
94     // Allow for escaping a few characters.
95     cmd.reserve(cmd.size() + arg.size() + 10);
96 
97     // Prevent a leading "-" on the filename being interpreted as a command
98     // line option.
99     if (arg[0] == '-')
100 	cmd += " ./";
101     else
102 	cmd += ' ';
103 
104     for (string::const_iterator i = arg.begin(); i != arg.end(); ++i) {
105 	// Don't escape a few safe characters which are common in filenames.
106 	if (!C_isalnum(*i) && strchr("/._-", *i) == NULL) {
107 	    cmd += '\\';
108 	}
109 	cmd += *i;
110     }
111 #endif
112     return true;
113 }
114 
#ifdef __WIN32__
/** Report whether we're running on a Windows 95-like system.
 *
 *  The answer is cached in a function-local static once GetVersionEx() has
 *  succeeded.  If GetVersionEx() fails the platform is unknown, and we report
 *  false (i.e. NT-like) rather than letting the "not yet known" sentinel
 *  convert to true.
 *
 *  @return true only if the platform was positively identified as
 *	    VER_PLATFORM_WIN32_WINDOWS.
 */
static bool running_on_win9x() {
    // -1 means "not determined yet"; otherwise 0 or 1.
    static int win9x = -1;
    if (win9x == -1) {
	OSVERSIONINFO info;
	memset(&info, 0, sizeof(OSVERSIONINFO));
	info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
	if (GetVersionEx(&info)) {
	    win9x = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS);
	}
    }
    return win9x == 1;
}
#endif
129 
130 /// Remove a directory and contents, just like the Unix "rm -rf" command.
rm_rf(const string & filename)131 static void rm_rf(const string &filename) {
132     // Check filename exists and is actually a directory
133     struct stat sb;
134     if (filename.empty() || stat(filename, &sb) != 0 || !S_ISDIR(sb.st_mode))
135 	return;
136 
137 #ifdef __WIN32__
138     string cmd;
139     if (running_on_win9x()) {
140 	// For 95-like systems:
141 	cmd = "deltree /y";
142     } else {
143 	// For NT-like systems:
144 	cmd = "rd /s /q";
145     }
146 #else
147     string cmd("rm -rf");
148 #endif
149     if (!append_filename_argument(cmd, filename)) return;
150     system(cmd);
151 }
152 
153 static void
copy_position(FlintTable & in,ChertTable * out)154 copy_position(FlintTable &in, ChertTable *out)
155 {
156     in.open();
157     if (in.empty()) return;
158 
159     FlintCursor cur(&in);
160     cur.find_entry(string());
161 
162     string newkey;
163     while (cur.next()) {
164 	const string & key = cur.current_key;
165 	const char * d = key.data();
166 	const char * e = d + key.size();
167 	Xapian::docid did;
168 	if (!F_unpack_uint_preserving_sort(&d, e, &did) || d == e)
169 	    throw Xapian::DatabaseCorruptError("Bad docid key");
170 	newkey.resize(0);
171 	pack_uint_preserving_sort(newkey, did);
172 	newkey.append(d, e - d);
173 	bool compressed = cur.read_tag(true);
174 	out->add(newkey, cur.current_tag, compressed);
175     }
176 }
177 
178 static void
copy_postlist(FlintTable & in,ChertTable * out)179 copy_postlist(FlintTable &in, ChertTable *out)
180 {
181     const string firstvaluechunk("\0\xd8", 2);
182     const string firstdoclenchunk("\0\xe0", 2);
183     const string firstchunk("\0\xff", 2);
184 
185     in.open();
186     if (in.empty()) return;
187 
188     // Copy metainfo item and valuestats.
189     FlintCursor cur(&in);
190     cur.find_entry(string());
191     while (true) {
192 	if (!cur.next()) return;
193 	if (cur.current_key >= firstvaluechunk) break;
194 	bool compressed = cur.read_tag(true);
195 	out->add(cur.current_key, cur.current_tag, compressed);
196     }
197 
198     // Copy valuestream chunks, adjusting keys.
199     string newkey;
200     do {
201 	const string & key = cur.current_key;
202 	const char * d = key.data();
203 	const char * d_orig = d;
204 	const char * e = d + key.size();
205 	d += 2;
206 	Xapian::valueno slot;
207 	if (!unpack_uint(&d, e, &slot))
208 	    throw Xapian::DatabaseCorruptError("Bad value chunk key (no slot)");
209 	newkey.assign(d_orig, d - d_orig);
210 	Xapian::docid did;
211 	if (!F_unpack_uint_preserving_sort(&d, e, &did))
212 	    throw Xapian::DatabaseCorruptError("Bad value chunk key (no docid)");
213 	if (d != e)
214 	    throw Xapian::DatabaseCorruptError("Bad value chunk key (trailing junk)");
215 	pack_uint_preserving_sort(newkey, did);
216 	bool compressed = cur.read_tag(true);
217 	out->add(newkey, cur.current_tag, compressed);
218 	if (!cur.next()) return;
219     } while (cur.current_key < firstdoclenchunk);
220 
221     // Copy doclen chunks, adjusting keys.
222     do {
223 	const string & key = cur.current_key;
224 	const char * d = key.data();
225 	const char * e = d + key.size();
226 	newkey.assign(d, 2);
227 	d += 2;
228 	if (d != e) {
229 	    Xapian::docid did;
230 	    if (!F_unpack_uint_preserving_sort(&d, e, &did))
231 		throw Xapian::DatabaseCorruptError("Bad doclen chunk key (no docid)");
232 	    if (d != e)
233 		throw Xapian::DatabaseCorruptError("Bad doclen chunk key (trailing junk)");
234 	    pack_uint_preserving_sort(newkey, did);
235 	}
236 	bool compressed = cur.read_tag(true);
237 	out->add(newkey, cur.current_tag, compressed);
238 	if (!cur.next()) return;
239     } while (cur.current_key < firstchunk);
240 
241     do {
242 	const string & key = cur.current_key;
243 	const char * d = key.data();
244 	const char * e = d + key.size();
245 	string term;
246 	if (!F_unpack_string_preserving_sort(&d, e, term))
247 	    throw Xapian::DatabaseCorruptError("Bad postlist key");
248 	if (d == e) {
249 	    // This is an initial chunk for a term.
250 	    newkey = pack_chert_postlist_key(term);
251 	} else {
252 	    // Not an initial chunk.
253 	    Xapian::docid firstdid;
254 	    if (!F_unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
255 		throw Xapian::DatabaseCorruptError("Bad postlist key");
256 	    newkey = pack_chert_postlist_key(term, firstdid);
257 	}
258 	bool compressed = cur.read_tag(true);
259 	out->add(newkey, cur.current_tag, compressed);
260     } while (cur.next());
261 }
262 
263 static void
copy_unchanged(FlintTable & in,ChertTable * out)264 copy_unchanged(FlintTable &in, ChertTable *out)
265 {
266     in.open();
267     if (in.empty()) return;
268 
269     FlintCursor cur(&in);
270     cur.find_entry(string());
271     while (cur.next()) {
272 	bool compressed = cur.read_tag(true);
273 	out->add(cur.current_key, cur.current_tag, compressed);
274     }
275 }
276 
277 static void
copy_termlist(FlintTable & in,ChertTable * out)278 copy_termlist(FlintTable &in, ChertTable *out)
279 {
280     in.open();
281     if (in.empty()) return;
282 
283     FlintCursor cur(&in);
284     cur.find_entry(string());
285 
286     string newkey;
287     while (cur.next()) {
288 	const string & key = cur.current_key;
289 	const char * d = key.data();
290 	const char * e = d + key.size();
291 	Xapian::docid did;
292 	if (!F_unpack_uint_preserving_sort(&d, e, &did))
293 	    throw Xapian::DatabaseCorruptError("Bad termlist key");
294 	newkey.resize(0);
295 	pack_uint_preserving_sort(newkey, did);
296 	if (d != e) {
297 	    // slot keys have a single zero byte suffix.
298 	    if (*d++ != '\0' || d != e)
299 		throw Xapian::DatabaseCorruptError("Bad termlist key");
300 	    newkey.append(1, '\0');
301 	}
302 	bool compressed = cur.read_tag(true);
303 	out->add(newkey, cur.current_tag, compressed);
304     }
305 }
306 
307 static void
copy_docid_keyed(FlintTable & in,ChertTable * out)308 copy_docid_keyed(FlintTable &in, ChertTable *out)
309 {
310     in.open();
311     if (in.empty()) return;
312 
313     FlintCursor cur(&in);
314     cur.find_entry(string());
315 
316     string newkey;
317     while (cur.next()) {
318 	const string & key = cur.current_key;
319 	const char * d = key.data();
320 	const char * e = d + key.size();
321 	Xapian::docid did;
322 	if (!F_unpack_uint_preserving_sort(&d, e, &did) || d != e)
323 	    throw Xapian::DatabaseCorruptError("Bad docid key");
324 	newkey.resize(0);
325 	pack_uint_preserving_sort(newkey, did);
326 	bool compressed = cur.read_tag(true);
327 	out->add(newkey, cur.current_tag, compressed);
328     }
329 }
330 
331 int
main(int argc,char ** argv)332 main(int argc, char **argv)
333 {
334     const char * opts = "b:";
335     const struct option long_opts[] = {
336 	{"help",	no_argument, 0, OPT_HELP},
337 	{"version",	no_argument, 0, OPT_VERSION},
338 	{NULL,		0, 0, 0}
339     };
340 
341     size_t block_size = 8192;
342 
343     int c;
344     while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
345 	switch (c) {
346 	    case 'b': {
347 		char *p;
348 		block_size = strtoul(optarg, &p, 10);
349 		if (block_size <= 64 && (*p == 'K' || *p == 'k')) {
350 		    ++p;
351 		    block_size *= 1024;
352 		}
353 		if (*p || block_size < 2048 || block_size > 65536 ||
354 		    (block_size & (block_size - 1)) != 0) {
355 		    cerr << PROG_NAME ": Bad value '" << optarg
356 			 << "' passed for blocksize, must be a power of 2 between 2K and 64K"
357 			 << endl;
358 		    exit(1);
359 		}
360 		break;
361 	    }
362 	    case OPT_HELP:
363 		cout << PROG_NAME " - " PROG_DESC "\n\n";
364 		show_usage();
365 		exit(0);
366 	    case OPT_VERSION:
367 		cout << PROG_NAME " - " PACKAGE_STRING << endl;
368 		exit(0);
369 	    default:
370 		show_usage();
371 		exit(1);
372 	}
373     }
374 
375     if (argc - optind != 2) {
376 	show_usage();
377 	exit(1);
378     }
379 
380     // Path to the database to create.
381     const char *destdir = argv[argc - 1];
382 
383     try {
384 	const char *srcdir = argv[optind];
385 	// Check destdir isn't the same as the source directory...
386 	if (strcmp(srcdir, destdir) == 0) {
387 	    cout << argv[0]
388 		 << ": destination may not be the same as the source directory."
389 		 << endl;
390 	    exit(1);
391 	}
392 
393 	{
394 	    struct stat sb;
395 	    if (stat(string(srcdir) + "/iamchert", &sb) != 0) {
396 		cout << argv[0] << ": '" << srcdir
397 		     << "' is not a chert database directory" << endl;
398 		exit(1);
399 	    }
400 	    try {
401 		// Will throw an exception for old format chert.
402 		Xapian::Database db(srcdir);
403 		cout << argv[0] << ": '" << srcdir
404 		     << "' is already the latest chert format" << endl;
405 		exit(1);
406 	    } catch (const Xapian::DatabaseVersionError &) {
407 		// If we need to verify the version, e.get_msg() reports:
408 		// <DBDIR>/iamchert: Chert version file is version 200903070 but I only understand 200912150
409 	    }
410 	}
411 
412 	// If the destination database directory doesn't exist, create it.
413 	if (mkdir(destdir, 0755) < 0) {
414 	    // Check why mkdir failed.  It's ok if the directory already
415 	    // exists, but we also get EEXIST if there's an existing file with
416 	    // that name.
417 	    if (errno == EEXIST) {
418 		struct stat sb;
419 		if (stat(destdir, &sb) == 0 && S_ISDIR(sb.st_mode))
420 		    errno = 0;
421 		else
422 		    errno = EEXIST; // stat might have changed it
423 	    }
424 	    if (errno) {
425 		cerr << argv[0] << ": cannot create directory '"
426 		     << destdir << "': " << strerror(errno) << endl;
427 		exit(1);
428 	    }
429 	}
430 
431 	enum table_type {
432 	    POSTLIST, RECORD, TERMLIST, POSITION, SPELLING, SYNONYM
433 	};
434 	struct table_list {
435 	    // The "base name" of the table.
436 	    const char * name;
437 	    // The type.
438 	    table_type type;
439 	    // zlib compression strategy to use on tags.
440 	    int compress_strategy;
441 	    // Create tables after position lazily.
442 	    bool lazy;
443 	};
444 
445 	static const table_list tables[] = {
446 	    // name	    type	compress_strategy	lazy
447 	    { "postlist",   POSTLIST,	DONT_COMPRESS,		false },
448 	    { "record",	    RECORD,	Z_DEFAULT_STRATEGY,	false },
449 	    { "termlist",   TERMLIST,	Z_DEFAULT_STRATEGY,	false },
450 	    { "position",   POSITION,	DONT_COMPRESS,		true },
451 	    { "spelling",   SPELLING,	Z_DEFAULT_STRATEGY,	true },
452 	    { "synonym",    SYNONYM,	Z_DEFAULT_STRATEGY,	true }
453 	};
454 	const table_list * tables_end = tables +
455 	    (sizeof(tables) / sizeof(tables[0]));
456 
457 	for (const table_list * t = tables; t < tables_end; ++t) {
458 	    bool bad_stat = false;
459 	    off_t in_size = 0;
460 	    // The postlist requires an N-way merge, adjusting the headers of
461 	    // various blocks.  The other tables have keys sorted in docid
462 	    // order, so we can merge them by simply copying all the keys from
463 	    // each source table in turn.
464 	    cout << t->name << " ..." << flush;
465 
466 	    string s(srcdir);
467 	    s += '/';
468 	    s += t->name;
469 	    s += '.';
470 	    {
471 		struct stat sb;
472 		if (stat(s + "DB", &sb) == 0) {
473 		    in_size += sb.st_size / 1024;
474 		} else if (errno != ENOENT) {
475 		    // We get ENOENT for an optional table.
476 		    bad_stat = true;
477 		} else if (t->type == TERMLIST) {
478 		    cout << '\r' << t->name << ": doesn't exist" << endl;
479 		    continue;
480 		}
481 	    }
482 
483 	    FlintTable in(t->name, s, true, DONT_COMPRESS, t->lazy);
484 
485 	    string dest = destdir;
486 	    dest += '/';
487 	    dest += t->name;
488 	    dest += '.';
489 
490 	    ChertTable out(t->name, dest, false, t->compress_strategy, t->lazy);
491 	    if (!t->lazy) {
492 		out.create_and_open(block_size);
493 	    } else {
494 		out.erase();
495 		out.set_block_size(block_size);
496 	    }
497 
498 	    out.set_full_compaction(true);
499 	    // if (compaction == FULLER) out.set_max_item_size(1);
500 
501 	    // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
502 	    // on certain systems).
503 
504 	    switch (t->type) {
505 		case POSITION:
506 		    copy_position(in, &out);
507 		    break;
508 		case POSTLIST:
509 		    copy_postlist(in, &out);
510 		    break;
511 		case SPELLING: case SYNONYM:
512 		    copy_unchanged(in, &out);
513 		    break;
514 		case TERMLIST:
515 		    copy_termlist(in, &out);
516 		    break;
517 		default:
518 		    // Record
519 		    copy_docid_keyed(in, &out);
520 		    break;
521 	    }
522 
523 	    // Commit as revision 1.
524 	    out.flush_db();
525 	    out.commit(1);
526 
527 	    cout << '\r' << t->name << ": ";
528 	    off_t out_size = 0;
529 	    if (!bad_stat) {
530 		struct stat sb;
531 		if (stat(dest + "DB", &sb) == 0) {
532 		    out_size = sb.st_size / 1024;
533 		} else {
534 		    bad_stat = (errno != ENOENT);
535 		}
536 	    }
537 	    if (bad_stat) {
538 		cout << "Done (couldn't stat all the DB files)";
539 	    } else {
540 		if (out_size == in_size) {
541 		    cout << "Size unchanged (";
542 		} else if (out_size < in_size) {
543 		    cout << "Reduced by "
544 			 << 100 * double(in_size - out_size) / in_size << "% "
545 			 << in_size - out_size << "K (" << in_size << "K -> ";
546 		} else {
547 		    cout << "INCREASED by "
548 			 << 100 * double(out_size - in_size) / in_size << "% "
549 			 << out_size - in_size << "K (" << in_size << "K -> ";
550 		}
551 		cout << out_size << "K)";
552 	    }
553 	    cout << endl;
554 	}
555 
556 	// Create the version file ("iamchert").
557 	//
558 	// This file contains a UUID, and we want the copy to have a fresh
559 	// UUID since its revision counter is reset to 1.  Currently the
560 	// easiest way to do this is to create a dummy "donor" database and
561 	// harvest its "iamchert" file.
562 	string donor = destdir;
563 	donor += "/donor.tmp";
564 
565 	(void)Xapian::Chert::open(donor, Xapian::DB_CREATE_OR_OVERWRITE);
566 	string from = donor;
567 	from += "/iamchert";
568 	string to(destdir);
569 	to += "/iamchert";
570 	if (rename(from.c_str(), to.c_str()) == -1) {
571 	    cerr << argv[0] << ": cannot rename '" << from << "' to '"
572 		 << to << "': " << strerror(errno) << endl;
573 	    exit(1);
574 	}
575 
576 	rm_rf(donor);
577     } catch (const Xapian::Error &error) {
578 	cerr << argv[0] << ": " << error.get_description() << endl;
579 	exit(1);
580     } catch (const char * msg) {
581 	cerr << argv[0] << ": " << msg << endl;
582 	exit(1);
583     }
584 }
585