1 /** @file xapian-chert-update.cc
2 * @brief Update a chert database to the new format keys
3 */
4 /* Copyright (C) 2003,2004,2005,2006,2007,2008,2009,2011,2013 Olly Betts
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License as
8 * published by the Free Software Foundation; either version 2 of the
9 * License, or (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
19 * USA
20 */
21
22 #include <config.h>
23
24 #include "safeerrno.h"
25
26 #include <iostream>
27
28 #include <cstdio> // for rename()
29 #include <cstdlib>
30 #include <cstring>
31 #include "safesysstat.h"
32 #include <sys/types.h>
33 #include "utils.h"
34
35 #include "chert_table.h"
36
37 #include "flint_table.h"
38 #include "flint_cursor.h"
39 #include "flint_utils.h"
40 #include "pack.h"
41
42 #include "safeunistd.h"
43 #include "safefcntl.h"
44
45 #ifdef __WIN32__
46 # include "safewindows.h"
47 #endif
48
49 #include "stringutils.h"
50
51 #include <xapian.h>
52
53 #include "gnu_getopt.h"
54
55 using namespace std;
56
// Program name used as a prefix in the usage, --help and --version output and
// in the blocksize diagnostic.
#define PROG_NAME "chert-update"
// One-line description of the program, printed by --help.
#define PROG_DESC "Update a chert database to the new format keys"

// Codes returned by gnu_getopt_long() for the long-only options.
#define OPT_HELP 1
#define OPT_VERSION 2
// NOTE(review): OPT_NO_RENUMBER isn't referenced by the option table in
// main() - presumably a leftover; confirm before removing.
#define OPT_NO_RENUMBER 3
63
show_usage()64 static void show_usage() {
65 cout << "Usage: " PROG_NAME " [OPTIONS] SOURCE_DATABASE DESTINATION_DATABASE\n\n"
66 "Options:\n"
67 " -b Set the blocksize in bytes (e.g. 4096) or K (e.g. 4K)\n"
68 " (must be between 2K and 64K and a power of 2, default 8K)\n"
69 " --help display this help and exit\n"
70 " --version output version information and exit" << endl;
71 }
72
73 /// Append filename argument arg to command cmd with suitable escaping.
74 static bool
append_filename_argument(string & cmd,const string & arg)75 append_filename_argument(string & cmd, const string & arg) {
76 #ifdef __WIN32__
77 cmd.reserve(cmd.size() + arg.size() + 3);
78 cmd += " \"";
79 for (string::const_iterator i = arg.begin(); i != arg.end(); ++i) {
80 if (*i == '/') {
81 // Convert Unix path separators to backslashes. C library
82 // functions understand "/" in paths, but we are going to
83 // call commands like "deltree" or "rd" which don't.
84 cmd += '\\';
85 } else if (*i < 32 || strchr("<>\"|*?", *i)) {
86 // Check for illegal characters in filename.
87 return false;
88 } else {
89 cmd += *i;
90 }
91 }
92 cmd += '"';
93 #else
94 // Allow for escaping a few characters.
95 cmd.reserve(cmd.size() + arg.size() + 10);
96
97 // Prevent a leading "-" on the filename being interpreted as a command
98 // line option.
99 if (arg[0] == '-')
100 cmd += " ./";
101 else
102 cmd += ' ';
103
104 for (string::const_iterator i = arg.begin(); i != arg.end(); ++i) {
105 // Don't escape a few safe characters which are common in filenames.
106 if (!C_isalnum(*i) && strchr("/._-", *i) == NULL) {
107 cmd += '\\';
108 }
109 cmd += *i;
110 }
111 #endif
112 return true;
113 }
114
#ifdef __WIN32__
/** Report whether GetVersionEx() identifies the platform as
 *  VER_PLATFORM_WIN32_WINDOWS (i.e. a 95-like system).
 *
 *  The answer is cached in a function-local static after the first call.
 *
 *  @return true for a 95-like system; false otherwise, including when the
 *          version query fails (so callers fall back to the NT-style
 *          command).
 */
static bool running_on_win9x() {
    // -1 means "not determined yet"; otherwise 0 (no) or 1 (yes).
    static int win9x = -1;
    if (win9x == -1) {
        OSVERSIONINFO info;
        memset(&info, 0, sizeof(OSVERSIONINFO));
        info.dwOSVersionInfoSize = sizeof(OSVERSIONINFO);
        if (GetVersionEx(&info)) {
            win9x = (info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS);
        } else {
            // Don't leave the sentinel in place: -1 would convert to true
            // and the query would be retried on every call.
            win9x = 0;
        }
    }
    return win9x != 0;
}
#endif
129
130 /// Remove a directory and contents, just like the Unix "rm -rf" command.
rm_rf(const string & filename)131 static void rm_rf(const string &filename) {
132 // Check filename exists and is actually a directory
133 struct stat sb;
134 if (filename.empty() || stat(filename, &sb) != 0 || !S_ISDIR(sb.st_mode))
135 return;
136
137 #ifdef __WIN32__
138 string cmd;
139 if (running_on_win9x()) {
140 // For 95-like systems:
141 cmd = "deltree /y";
142 } else {
143 // For NT-like systems:
144 cmd = "rd /s /q";
145 }
146 #else
147 string cmd("rm -rf");
148 #endif
149 if (!append_filename_argument(cmd, filename)) return;
150 system(cmd);
151 }
152
153 static void
copy_position(FlintTable & in,ChertTable * out)154 copy_position(FlintTable &in, ChertTable *out)
155 {
156 in.open();
157 if (in.empty()) return;
158
159 FlintCursor cur(&in);
160 cur.find_entry(string());
161
162 string newkey;
163 while (cur.next()) {
164 const string & key = cur.current_key;
165 const char * d = key.data();
166 const char * e = d + key.size();
167 Xapian::docid did;
168 if (!F_unpack_uint_preserving_sort(&d, e, &did) || d == e)
169 throw Xapian::DatabaseCorruptError("Bad docid key");
170 newkey.resize(0);
171 pack_uint_preserving_sort(newkey, did);
172 newkey.append(d, e - d);
173 bool compressed = cur.read_tag(true);
174 out->add(newkey, cur.current_tag, compressed);
175 }
176 }
177
178 static void
copy_postlist(FlintTable & in,ChertTable * out)179 copy_postlist(FlintTable &in, ChertTable *out)
180 {
181 const string firstvaluechunk("\0\xd8", 2);
182 const string firstdoclenchunk("\0\xe0", 2);
183 const string firstchunk("\0\xff", 2);
184
185 in.open();
186 if (in.empty()) return;
187
188 // Copy metainfo item and valuestats.
189 FlintCursor cur(&in);
190 cur.find_entry(string());
191 while (true) {
192 if (!cur.next()) return;
193 if (cur.current_key >= firstvaluechunk) break;
194 bool compressed = cur.read_tag(true);
195 out->add(cur.current_key, cur.current_tag, compressed);
196 }
197
198 // Copy valuestream chunks, adjusting keys.
199 string newkey;
200 do {
201 const string & key = cur.current_key;
202 const char * d = key.data();
203 const char * d_orig = d;
204 const char * e = d + key.size();
205 d += 2;
206 Xapian::valueno slot;
207 if (!unpack_uint(&d, e, &slot))
208 throw Xapian::DatabaseCorruptError("Bad value chunk key (no slot)");
209 newkey.assign(d_orig, d - d_orig);
210 Xapian::docid did;
211 if (!F_unpack_uint_preserving_sort(&d, e, &did))
212 throw Xapian::DatabaseCorruptError("Bad value chunk key (no docid)");
213 if (d != e)
214 throw Xapian::DatabaseCorruptError("Bad value chunk key (trailing junk)");
215 pack_uint_preserving_sort(newkey, did);
216 bool compressed = cur.read_tag(true);
217 out->add(newkey, cur.current_tag, compressed);
218 if (!cur.next()) return;
219 } while (cur.current_key < firstdoclenchunk);
220
221 // Copy doclen chunks, adjusting keys.
222 do {
223 const string & key = cur.current_key;
224 const char * d = key.data();
225 const char * e = d + key.size();
226 newkey.assign(d, 2);
227 d += 2;
228 if (d != e) {
229 Xapian::docid did;
230 if (!F_unpack_uint_preserving_sort(&d, e, &did))
231 throw Xapian::DatabaseCorruptError("Bad doclen chunk key (no docid)");
232 if (d != e)
233 throw Xapian::DatabaseCorruptError("Bad doclen chunk key (trailing junk)");
234 pack_uint_preserving_sort(newkey, did);
235 }
236 bool compressed = cur.read_tag(true);
237 out->add(newkey, cur.current_tag, compressed);
238 if (!cur.next()) return;
239 } while (cur.current_key < firstchunk);
240
241 do {
242 const string & key = cur.current_key;
243 const char * d = key.data();
244 const char * e = d + key.size();
245 string term;
246 if (!F_unpack_string_preserving_sort(&d, e, term))
247 throw Xapian::DatabaseCorruptError("Bad postlist key");
248 if (d == e) {
249 // This is an initial chunk for a term.
250 newkey = pack_chert_postlist_key(term);
251 } else {
252 // Not an initial chunk.
253 Xapian::docid firstdid;
254 if (!F_unpack_uint_preserving_sort(&d, e, &firstdid) || d != e)
255 throw Xapian::DatabaseCorruptError("Bad postlist key");
256 newkey = pack_chert_postlist_key(term, firstdid);
257 }
258 bool compressed = cur.read_tag(true);
259 out->add(newkey, cur.current_tag, compressed);
260 } while (cur.next());
261 }
262
263 static void
copy_unchanged(FlintTable & in,ChertTable * out)264 copy_unchanged(FlintTable &in, ChertTable *out)
265 {
266 in.open();
267 if (in.empty()) return;
268
269 FlintCursor cur(&in);
270 cur.find_entry(string());
271 while (cur.next()) {
272 bool compressed = cur.read_tag(true);
273 out->add(cur.current_key, cur.current_tag, compressed);
274 }
275 }
276
277 static void
copy_termlist(FlintTable & in,ChertTable * out)278 copy_termlist(FlintTable &in, ChertTable *out)
279 {
280 in.open();
281 if (in.empty()) return;
282
283 FlintCursor cur(&in);
284 cur.find_entry(string());
285
286 string newkey;
287 while (cur.next()) {
288 const string & key = cur.current_key;
289 const char * d = key.data();
290 const char * e = d + key.size();
291 Xapian::docid did;
292 if (!F_unpack_uint_preserving_sort(&d, e, &did))
293 throw Xapian::DatabaseCorruptError("Bad termlist key");
294 newkey.resize(0);
295 pack_uint_preserving_sort(newkey, did);
296 if (d != e) {
297 // slot keys have a single zero byte suffix.
298 if (*d++ != '\0' || d != e)
299 throw Xapian::DatabaseCorruptError("Bad termlist key");
300 newkey.append(1, '\0');
301 }
302 bool compressed = cur.read_tag(true);
303 out->add(newkey, cur.current_tag, compressed);
304 }
305 }
306
307 static void
copy_docid_keyed(FlintTable & in,ChertTable * out)308 copy_docid_keyed(FlintTable &in, ChertTable *out)
309 {
310 in.open();
311 if (in.empty()) return;
312
313 FlintCursor cur(&in);
314 cur.find_entry(string());
315
316 string newkey;
317 while (cur.next()) {
318 const string & key = cur.current_key;
319 const char * d = key.data();
320 const char * e = d + key.size();
321 Xapian::docid did;
322 if (!F_unpack_uint_preserving_sort(&d, e, &did) || d != e)
323 throw Xapian::DatabaseCorruptError("Bad docid key");
324 newkey.resize(0);
325 pack_uint_preserving_sort(newkey, did);
326 bool compressed = cur.read_tag(true);
327 out->add(newkey, cur.current_tag, compressed);
328 }
329 }
330
331 int
main(int argc,char ** argv)332 main(int argc, char **argv)
333 {
334 const char * opts = "b:";
335 const struct option long_opts[] = {
336 {"help", no_argument, 0, OPT_HELP},
337 {"version", no_argument, 0, OPT_VERSION},
338 {NULL, 0, 0, 0}
339 };
340
341 size_t block_size = 8192;
342
343 int c;
344 while ((c = gnu_getopt_long(argc, argv, opts, long_opts, 0)) != -1) {
345 switch (c) {
346 case 'b': {
347 char *p;
348 block_size = strtoul(optarg, &p, 10);
349 if (block_size <= 64 && (*p == 'K' || *p == 'k')) {
350 ++p;
351 block_size *= 1024;
352 }
353 if (*p || block_size < 2048 || block_size > 65536 ||
354 (block_size & (block_size - 1)) != 0) {
355 cerr << PROG_NAME ": Bad value '" << optarg
356 << "' passed for blocksize, must be a power of 2 between 2K and 64K"
357 << endl;
358 exit(1);
359 }
360 break;
361 }
362 case OPT_HELP:
363 cout << PROG_NAME " - " PROG_DESC "\n\n";
364 show_usage();
365 exit(0);
366 case OPT_VERSION:
367 cout << PROG_NAME " - " PACKAGE_STRING << endl;
368 exit(0);
369 default:
370 show_usage();
371 exit(1);
372 }
373 }
374
375 if (argc - optind != 2) {
376 show_usage();
377 exit(1);
378 }
379
380 // Path to the database to create.
381 const char *destdir = argv[argc - 1];
382
383 try {
384 const char *srcdir = argv[optind];
385 // Check destdir isn't the same as the source directory...
386 if (strcmp(srcdir, destdir) == 0) {
387 cout << argv[0]
388 << ": destination may not be the same as the source directory."
389 << endl;
390 exit(1);
391 }
392
393 {
394 struct stat sb;
395 if (stat(string(srcdir) + "/iamchert", &sb) != 0) {
396 cout << argv[0] << ": '" << srcdir
397 << "' is not a chert database directory" << endl;
398 exit(1);
399 }
400 try {
401 // Will throw an exception for old format chert.
402 Xapian::Database db(srcdir);
403 cout << argv[0] << ": '" << srcdir
404 << "' is already the latest chert format" << endl;
405 exit(1);
406 } catch (const Xapian::DatabaseVersionError &) {
407 // If we need to verify the version, e.get_msg() reports:
408 // <DBDIR>/iamchert: Chert version file is version 200903070 but I only understand 200912150
409 }
410 }
411
412 // If the destination database directory doesn't exist, create it.
413 if (mkdir(destdir, 0755) < 0) {
414 // Check why mkdir failed. It's ok if the directory already
415 // exists, but we also get EEXIST if there's an existing file with
416 // that name.
417 if (errno == EEXIST) {
418 struct stat sb;
419 if (stat(destdir, &sb) == 0 && S_ISDIR(sb.st_mode))
420 errno = 0;
421 else
422 errno = EEXIST; // stat might have changed it
423 }
424 if (errno) {
425 cerr << argv[0] << ": cannot create directory '"
426 << destdir << "': " << strerror(errno) << endl;
427 exit(1);
428 }
429 }
430
431 enum table_type {
432 POSTLIST, RECORD, TERMLIST, POSITION, SPELLING, SYNONYM
433 };
434 struct table_list {
435 // The "base name" of the table.
436 const char * name;
437 // The type.
438 table_type type;
439 // zlib compression strategy to use on tags.
440 int compress_strategy;
441 // Create tables after position lazily.
442 bool lazy;
443 };
444
445 static const table_list tables[] = {
446 // name type compress_strategy lazy
447 { "postlist", POSTLIST, DONT_COMPRESS, false },
448 { "record", RECORD, Z_DEFAULT_STRATEGY, false },
449 { "termlist", TERMLIST, Z_DEFAULT_STRATEGY, false },
450 { "position", POSITION, DONT_COMPRESS, true },
451 { "spelling", SPELLING, Z_DEFAULT_STRATEGY, true },
452 { "synonym", SYNONYM, Z_DEFAULT_STRATEGY, true }
453 };
454 const table_list * tables_end = tables +
455 (sizeof(tables) / sizeof(tables[0]));
456
457 for (const table_list * t = tables; t < tables_end; ++t) {
458 bool bad_stat = false;
459 off_t in_size = 0;
460 // The postlist requires an N-way merge, adjusting the headers of
461 // various blocks. The other tables have keys sorted in docid
462 // order, so we can merge them by simply copying all the keys from
463 // each source table in turn.
464 cout << t->name << " ..." << flush;
465
466 string s(srcdir);
467 s += '/';
468 s += t->name;
469 s += '.';
470 {
471 struct stat sb;
472 if (stat(s + "DB", &sb) == 0) {
473 in_size += sb.st_size / 1024;
474 } else if (errno != ENOENT) {
475 // We get ENOENT for an optional table.
476 bad_stat = true;
477 } else if (t->type == TERMLIST) {
478 cout << '\r' << t->name << ": doesn't exist" << endl;
479 continue;
480 }
481 }
482
483 FlintTable in(t->name, s, true, DONT_COMPRESS, t->lazy);
484
485 string dest = destdir;
486 dest += '/';
487 dest += t->name;
488 dest += '.';
489
490 ChertTable out(t->name, dest, false, t->compress_strategy, t->lazy);
491 if (!t->lazy) {
492 out.create_and_open(block_size);
493 } else {
494 out.erase();
495 out.set_block_size(block_size);
496 }
497
498 out.set_full_compaction(true);
499 // if (compaction == FULLER) out.set_max_item_size(1);
500
501 // Sometimes stat can fail for benign reasons (e.g. >= 2GB file
502 // on certain systems).
503
504 switch (t->type) {
505 case POSITION:
506 copy_position(in, &out);
507 break;
508 case POSTLIST:
509 copy_postlist(in, &out);
510 break;
511 case SPELLING: case SYNONYM:
512 copy_unchanged(in, &out);
513 break;
514 case TERMLIST:
515 copy_termlist(in, &out);
516 break;
517 default:
518 // Record
519 copy_docid_keyed(in, &out);
520 break;
521 }
522
523 // Commit as revision 1.
524 out.flush_db();
525 out.commit(1);
526
527 cout << '\r' << t->name << ": ";
528 off_t out_size = 0;
529 if (!bad_stat) {
530 struct stat sb;
531 if (stat(dest + "DB", &sb) == 0) {
532 out_size = sb.st_size / 1024;
533 } else {
534 bad_stat = (errno != ENOENT);
535 }
536 }
537 if (bad_stat) {
538 cout << "Done (couldn't stat all the DB files)";
539 } else {
540 if (out_size == in_size) {
541 cout << "Size unchanged (";
542 } else if (out_size < in_size) {
543 cout << "Reduced by "
544 << 100 * double(in_size - out_size) / in_size << "% "
545 << in_size - out_size << "K (" << in_size << "K -> ";
546 } else {
547 cout << "INCREASED by "
548 << 100 * double(out_size - in_size) / in_size << "% "
549 << out_size - in_size << "K (" << in_size << "K -> ";
550 }
551 cout << out_size << "K)";
552 }
553 cout << endl;
554 }
555
556 // Create the version file ("iamchert").
557 //
558 // This file contains a UUID, and we want the copy to have a fresh
559 // UUID since its revision counter is reset to 1. Currently the
560 // easiest way to do this is to create a dummy "donor" database and
561 // harvest its "iamchert" file.
562 string donor = destdir;
563 donor += "/donor.tmp";
564
565 (void)Xapian::Chert::open(donor, Xapian::DB_CREATE_OR_OVERWRITE);
566 string from = donor;
567 from += "/iamchert";
568 string to(destdir);
569 to += "/iamchert";
570 if (rename(from.c_str(), to.c_str()) == -1) {
571 cerr << argv[0] << ": cannot rename '" << from << "' to '"
572 << to << "': " << strerror(errno) << endl;
573 exit(1);
574 }
575
576 rm_rf(donor);
577 } catch (const Xapian::Error &error) {
578 cerr << argv[0] << ": " << error.get_description() << endl;
579 exit(1);
580 } catch (const char * msg) {
581 cerr << argv[0] << ": " << msg << endl;
582 exit(1);
583 }
584 }
585