1 /** @file
2  * @brief Allow inspection of the contents of a Xapian database
3  */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5  * Copyright 2002 Ananova Ltd
6  * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2012,2013,2014,2016,2017,2018 Olly Betts
7  *
8  * This program is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU General Public License as
10  * published by the Free Software Foundation; either version 2 of the
11  * License, or (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
21  * USA
22  */
23 
24 #include <config.h>
25 
26 #include <xapian.h>
27 
28 #include <algorithm>
29 #include <ios>
30 #include <iostream>
31 #include <vector>
32 
33 #include "gnu_getopt.h"
34 
35 #include <cerrno>
36 #include <cstring>
37 #include <cstdlib>
38 #include "unicode/description_append.h"
39 
40 #include "unicode/description_append.cc"
41 
42 using namespace Xapian;
43 using namespace std;
44 
// Character written before each list entry: a space by default, switched to
// a newline by the -1 option.
static char separator = ' ';

// Verbosity level, incremented once for each -v option given.
static int verbose = 0;
// Set when -V is given without a slot number: show all values of a document.
static bool showvalues = false;
// Set by -d: show the document data of each document referred to.
static bool showdocdata = false;
// Set by -z: show_db_stats() also counts the documents with length 0.
static bool count_zero_length_docs = false;

// How to decode document values.
static enum {
    // Escape the value using description_append() (-VE, the default).
    VALUE_ESCAPE,
    // Decode using Xapian::sortable_unserialise() (-VS).
    VALUE_SORTABLE_SERIALISE,
    // Decode the bytes as a packed integer (-VI).
    VALUE_PACKED_INT,
    // Output the value bytes unchanged (-VR).
    VALUE_RAW
} value_decode = VALUE_ESCAPE;
59 
60 #define PROG_NAME "delve"
61 #define PROG_DESC "Inspect the contents of a Xapian database"
62 
show_usage()63 static void show_usage() {
64     cout << "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
65 "Options:\n"
66 "  -a                    show all terms in the database\n"
67 "  -A <prefix>           show all terms in the database with given prefix\n"
68 "  -r <recno>            for term list(s)\n"
69 "  -t <term>             for posting list(s)\n"
70 "  -t <term> -r <recno>  for position list(s)\n"
71 "  -s, --stemmer=LANG    set the stemming language, the default is 'none'\n"
72 "  -1                    output one list entry per line\n"
73 "  -V[<type>]<valueno>   output value valueno for each document referred to\n"
74 "                        (or each document in the database if no -r options).\n"
75 "                        <type> can be:\n"
76 "                        E: escape in a C-like way (default)\n"
77 "                        I: decode as a packed integer\n"
78 "                        R: show the raw value (which may contain binary data,\n"
79 "                           newlines, invalid UTF-8, etc)\n"
80 "                        S: decode using Xapian::sortable_unserialise()\n"
81 "  -V[<type>]            output all values for each document referred to.\n"
82 "                        <type> is as above.\n"
83 "  -d                    output document data for each document referred to\n"
84 "  -z                    for db, count documents with length 0\n"
85 "  -v                    extra info (wdf and len for postlist;\n"
86 "                        wdf and termfreq for termlist; number of terms for db;\n"
87 "                        termfreq when showing all terms)\n"
88 "  -vv                   even more info (also show collection freq and wdf\n"
89 "                        upper bound for terms)\n"
90 "      --help            display this help and exit\n"
91 "      --version         output version information and exit" << endl;
92 }
93 
94 static void
show_db_stats(Database & db)95 show_db_stats(Database &db)
96 {
97     // Display a few database stats.
98     cout << "UUID = " << db.get_uuid() << endl;
99     cout << "number of documents = " << db.get_doccount() << endl;
100     cout << "average document length = " << db.get_avlength() << endl;
101     cout << "document length lower bound = " << db.get_doclength_lower_bound()
102 	 << endl;
103     cout << "document length upper bound = " << db.get_doclength_upper_bound()
104 	 << endl;
105     cout << "highest document id ever used = " << db.get_lastdocid() << endl;
106     cout << boolalpha;
107     cout << "has positional information = " << db.has_positions() << endl;
108     cout << "revision = ";
109     if (db.size() > 1) {
110 	cout << "N/A (sharded DB)\n";
111     } else {
112 	try {
113 	    cout << db.get_revision() << endl;
114 	} catch (const Xapian::InvalidOperationError& e) {
115 	    cout << e.get_description() << endl;
116 	} catch (const Xapian::UnimplementedError& e) {
117 	    cout << "N/A (" << e.get_msg() << ")\n";
118 	}
119     }
120     cout << "currently open for writing = ";
121     try {
122 	cout << db.locked() << endl;
123     } catch (const Xapian::Error& e) {
124 	cout << e.get_description() << endl;
125     }
126 
127     if (count_zero_length_docs) {
128 	Xapian::doccount empty_docs = 0;
129 	if (db.get_total_length() == 0) {
130 	    // All documents are empty.
131 	    empty_docs = db.get_doccount();
132 	} else {
133 	    Xapian::PostingIterator d = db.postlist_begin(string());
134 	    while (d != db.postlist_end(string())) {
135 		if (d.get_doclength() == 0)
136 		    ++empty_docs;
137 		++d;
138 	    }
139 	}
140 	cout << "number of zero-length documents = " << empty_docs << endl;
141     }
142 
143     if (verbose) {
144 	// To find the number of terms, we have to count them!
145 	// This will take a few seconds or minutes, so only do it if -v
146 	// was specified.
147 	termcount terms = 0;
148 	TermIterator t = db.allterms_begin();
149 	while (t != db.allterms_end()) {
150 	    ++terms;
151 	    ++t;
152 	}
153 	cout << "number of distinct terms = " << terms << endl;
154     }
155 }
156 
157 static void
decode_and_show_value(const string & value)158 decode_and_show_value(const string& value)
159 {
160     switch (value_decode) {
161 	case VALUE_ESCAPE: {
162 	    string esc;
163 	    description_append(esc, value);
164 	    cout << esc;
165 	    break;
166 	}
167 	case VALUE_SORTABLE_SERIALISE:
168 	    cout << Xapian::sortable_unserialise(value);
169 	    break;
170 	case VALUE_PACKED_INT: {
171 	    unsigned long long i = 0;
172 	    for (unsigned char ch : value) {
173 		i = (i << 8) | ch;
174 	    }
175 	    cout << i;
176 	    break;
177 	}
178 	default: // VALUE_RAW
179 	    cout << value;
180 	    break;
181     }
182 }
183 
184 static void
show_values(Database & db,docid docid,char sep)185 show_values(Database &db, docid docid, char sep)
186 {
187     Document doc = db.get_document(docid);
188     ValueIterator v = doc.values_begin();
189     while (v != doc.values_end()) {
190 	cout << sep << v.get_valueno() << ':';
191 	decode_and_show_value(*v);
192 	++v;
193     }
194 }
195 
196 static void
show_values(Database & db,vector<docid>::const_iterator i,vector<docid>::const_iterator end)197 show_values(Database &db,
198 	    vector<docid>::const_iterator i,
199 	    vector<docid>::const_iterator end)
200 {
201     while (i != end) {
202 	cout << "Values for record #" << *i << ':';
203 	show_values(db, *i, separator);
204 	cout << endl;
205 	++i;
206     }
207 }
208 
209 static void
show_value(Database & db,vector<docid>::const_iterator i,vector<docid>::const_iterator end,Xapian::valueno slot)210 show_value(Database &db,
211 	   vector<docid>::const_iterator i,
212 	   vector<docid>::const_iterator end,
213 	   Xapian::valueno slot)
214 {
215     while (i != end) {
216 	Xapian::docid did = *i;
217 	cout << "Value " << slot << " for record #" << did << ": ";
218 	decode_and_show_value(db.get_document(did).get_value(slot));
219 	cout << endl;
220 	++i;
221     }
222 }
223 
224 static void
show_docdata(Database & db,docid docid,char sep)225 show_docdata(Database &db, docid docid, char sep)
226 {
227     cout << sep << "[" << db.get_document(docid).get_data() << ']';
228 }
229 
230 static void
show_docdata(Database & db,vector<docid>::const_iterator i,vector<docid>::const_iterator end)231 show_docdata(Database &db,
232 	     vector<docid>::const_iterator i,
233 	     vector<docid>::const_iterator end)
234 {
235     while (i != end) {
236 	cout << "Data for record #" << *i << ':' << endl;
237 	cout << db.get_document(*i).get_data() << endl;
238 	++i;
239     }
240 }
241 
242 static void
show_termlist(const Database & db,Xapian::docid did,const char * all_pfx=NULL)243 show_termlist(const Database &db, Xapian::docid did,
244 	      const char * all_pfx = NULL)
245 {
246     TermIterator t, tend;
247     if (all_pfx) {
248 	t = db.allterms_begin(all_pfx);
249 	tend = db.allterms_end(all_pfx);
250 	cout << "All terms in database";
251 	if (all_pfx[0])
252 	    cout << " with prefix \"" << all_pfx << "\"";
253     } else {
254 	t = db.termlist_begin(did);
255 	tend = db.termlist_end(did);
256 	cout << "Term List for record #" << did;
257     }
258     if (verbose) {
259 	cout << " (";
260 	if (did != 0)
261 	    cout << "wdf, ";
262 	cout << "termfreq";
263 	if (verbose > 1)
264 	    cout << ", collection freq, wdf upper bound";
265 	cout << ')';
266     }
267     cout << ':';
268 
269     while (t != tend) {
270 	const string & term = *t;
271 	cout << separator << term;
272 	if (verbose) {
273 	    if (did != 0)
274 		cout << ' ' << t.get_wdf();
275 	    cout << ' ' << t.get_termfreq();
276 	    if (verbose > 1) {
277 		cout << ' ' << db.get_collection_freq(term)
278 		     << ' ' << db.get_wdf_upper_bound(term);
279 	    }
280 	}
281 	++t;
282     }
283     cout << endl;
284 }
285 
286 static void
show_termlists(Database & db,vector<docid>::const_iterator i,vector<docid>::const_iterator end)287 show_termlists(Database &db,
288 	       vector<docid>::const_iterator i,
289 	       vector<docid>::const_iterator end)
290 {
291     // Display termlists
292     while (i != end) {
293 	show_termlist(db, *i);
294 	++i;
295     }
296 }
297 
298 int
main(int argc,char ** argv)299 main(int argc, char **argv) try {
300     if (argc > 1 && argv[1][0] == '-') {
301 	if (strcmp(argv[1], "--help") == 0) {
302 	    cout << PROG_NAME " - " PROG_DESC "\n\n";
303 	    show_usage();
304 	    exit(0);
305 	}
306 	if (strcmp(argv[1], "--version") == 0) {
307 	    cout << PROG_NAME " - " PACKAGE_STRING << endl;
308 	    exit(0);
309 	}
310     }
311 
312     const char * all_terms = NULL;
313     vector<docid> recnos;
314     vector<string> terms;
315     vector<string> dbs;
316     Stem stemmer;
317 
318     valueno slot = 0; // Avoid "may be used uninitialised" warnings.
319     bool slot_set = false;
320 
321     int c;
322     while ((c = gnu_getopt(argc, argv, "aA:r:t:s:1vV::dz")) != -1) {
323 	switch (c) {
324 	    case 'a':
325 		all_terms = "";
326 		break;
327 	    case 'A':
328 		all_terms = optarg;
329 		break;
330 	    case 'r': {
331 		char * end;
332 		errno = 0;
333 		unsigned long n = strtoul(optarg, &end, 10);
334 		if (optarg == end || *end) {
335 		    cout << "Non-numeric document id: " << optarg << endl;
336 		    exit(1);
337 		}
338 		Xapian::docid did(n);
339 		if (errno == ERANGE || n == 0 || did != n) {
340 		    cout << "Document id out of range: " << optarg << endl;
341 		    exit(1);
342 		}
343 		recnos.push_back(did);
344 		break;
345 	    }
346 	    case 't':
347 		terms.push_back(optarg);
348 		break;
349 	    case 's':
350 		stemmer = Stem(optarg);
351 		break;
352 	    case '1':
353 		separator = '\n';
354 		break;
355 	    case 'V':
356 		if (optarg) {
357 		    switch (*optarg) {
358 			case 'R':
359 			    value_decode = VALUE_RAW;
360 			    ++optarg;
361 			    break;
362 			case 'I':
363 			    value_decode = VALUE_PACKED_INT;
364 			    ++optarg;
365 			    break;
366 			case 'S':
367 			    value_decode = VALUE_SORTABLE_SERIALISE;
368 			    ++optarg;
369 			    break;
370 			case 'E':
371 			    value_decode = VALUE_ESCAPE;
372 			    ++optarg;
373 			    break;
374 		    }
375 		    char * end;
376 		    errno = 0;
377 		    unsigned long n = strtoul(optarg, &end, 10);
378 		    if (optarg == end || *end) {
379 			cout << "Non-numeric value slot: " << optarg << endl;
380 			exit(1);
381 		    }
382 		    slot = Xapian::valueno(n);
383 		    if (errno == ERANGE || slot != n) {
384 			cout << "Value slot out of range: " << optarg << endl;
385 			exit(1);
386 		    }
387 		    slot_set = true;
388 		} else {
389 		    showvalues = true;
390 		}
391 		break;
392 	    case 'd':
393 		showdocdata = true;
394 		break;
395 	    case 'v':
396 		++verbose;
397 		break;
398 	    case 'z':
399 		count_zero_length_docs = true;
400 		break;
401 	    default:
402 		show_usage();
403 		exit(1);
404 	}
405     }
406 
407     while (argv[optind]) dbs.push_back(argv[optind++]);
408 
409     if (dbs.empty()) {
410 	show_usage();
411 	exit(1);
412     }
413 
414     std::sort(recnos.begin(), recnos.end());
415 
416     Database db;
417     {
418 	vector<string>::const_iterator i;
419 	for (i = dbs.begin(); i != dbs.end(); ++i) {
420 	    try {
421 		db.add_database(Database(*i));
422 	    } catch (const Error &e) {
423 		cerr << "Error opening database '" << *i << "': ";
424 		cerr << e.get_description() << endl;
425 		return 1;
426 	    }
427 	}
428     }
429 
430     if (!all_terms && terms.empty() && recnos.empty() && !slot_set) {
431 	// Show some statistics about the database.
432 	show_db_stats(db);
433 	return 0;
434     }
435 
436     if (all_terms) {
437 	show_termlist(db, 0, all_terms);
438     }
439 
440     if (!recnos.empty()) {
441 	if (showvalues) {
442 	    show_values(db, recnos.begin(), recnos.end());
443 	} else if (slot_set) {
444 	    show_value(db, recnos.begin(), recnos.end(), slot);
445 	}
446 
447 	if (showdocdata) {
448 	    show_docdata(db, recnos.begin(), recnos.end());
449 	}
450     } else {
451 	if (slot_set) {
452 	    cout << "Value " << slot << " for each document:";
453 	    ValueIterator it = db.valuestream_begin(slot);
454 	    while (it != db.valuestream_end(slot)) {
455 		cout << separator << it.get_docid() << ':';
456 		decode_and_show_value(*it);
457 		++it;
458 	    }
459 	    cout << endl;
460 	}
461     }
462 
463     if (terms.empty()) {
464 	show_termlists(db, recnos.begin(), recnos.end());
465 	return 0;
466     }
467 
468     vector<string>::const_iterator i;
469     for (i = terms.begin(); i != terms.end(); ++i) {
470 	string term = stemmer(*i);
471 	PostingIterator p = db.postlist_begin(term);
472 	PostingIterator pend = db.postlist_end(term);
473 	if (p == pend) {
474 	    cout << "term '" << term << "' not in database\n";
475 	    continue;
476 	}
477 	if (recnos.empty()) {
478 	    // Display posting list
479 	    cout << "Posting List for term '" << term << "' (termfreq "
480 		 << db.get_termfreq(term) << ", collfreq "
481 		 << db.get_collection_freq(term) << ", wdf_max "
482 		 << db.get_wdf_upper_bound(term) << "):";
483 	    while (p != pend) {
484 		cout << separator << *p;
485 		if (verbose) {
486 		    cout << ' ' << p.get_wdf() << ' ' << p.get_doclength();
487 		}
488 		if (showvalues) show_values(db, *p, ' ');
489 		if (showdocdata) show_docdata(db, *p, ' ');
490 		++p;
491 	    }
492 	    cout << endl;
493 	} else {
494 	    // Display position lists
495 	    vector<docid>::const_iterator j;
496 	    for (j = recnos.begin(); j != recnos.end(); ++j) {
497 		p.skip_to(*j);
498 		if (p == pend || *p != *j) {
499 		    cout << "term '" << term <<
500 			"' doesn't index document #" << *j << endl;
501 		} else {
502 		    cout << "Position List for term '" << term
503 			<< "', record #" << *j << ':';
504 		    try {
505 			PositionIterator pos = p.positionlist_begin();
506 			while (pos != p.positionlist_end()) {
507 			    cout << separator << *pos;
508 			    ++pos;
509 			}
510 			cout << endl;
511 		    } catch (const Error &e) {
512 			cerr << "Error: " << e.get_description() << endl;
513 		    }
514 		}
515 	    }
516 	}
517     }
518 } catch (const Error &e) {
519     cerr << "\nError: " << e.get_description() << endl;
520     return 1;
521 }
522