1 /** @file
2 * @brief Allow inspection of the contents of a Xapian database
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2002 Ananova Ltd
6 * Copyright 2002,2003,2004,2006,2007,2008,2009,2010,2011,2012,2013,2014,2016,2017,2018 Olly Betts
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License as
10 * published by the Free Software Foundation; either version 2 of the
11 * License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
21 * USA
22 */
23
24 #include <config.h>
25
26 #include <xapian.h>
27
28 #include <algorithm>
29 #include <ios>
30 #include <iostream>
31 #include <vector>
32
33 #include "gnu_getopt.h"
34
35 #include <cerrno>
36 #include <cstring>
37 #include <cstdlib>
38 #include "unicode/description_append.h"
39
40 #include "unicode/description_append.cc"
41
42 using namespace Xapian;
43 using namespace std;
44
// Character written ahead of every list entry.  The -1 option replaces the
// default space with a newline so that each entry lands on its own line.
static char separator = ' ';

// Verbosity level: bumped once for each -v on the command line.
static int verbose = 0;

// Set by -V when no slot number is given: show all values of a document.
static bool showvalues = false;

// Set by -d: show the document data.
static bool showdocdata = false;

// Set by -z: count zero-length documents when showing database statistics.
static bool count_zero_length_docs = false;

// The ways in which a document value can be rendered for output.
enum value_decode_mode {
    VALUE_ESCAPE,
    VALUE_SORTABLE_SERIALISE,
    VALUE_PACKED_INT,
    VALUE_RAW
};

// How to decode document values.
static value_decode_mode value_decode = VALUE_ESCAPE;
59
#define PROG_NAME "delve"
#define PROG_DESC "Inspect the contents of a Xapian database"

/** Write the command line usage summary to standard output. */
static void show_usage() {
    // The whole help text is one literal (adjacent string literals are
    // concatenated by the compiler); std::endl adds the final newline and
    // flushes the stream.
    static const char usage_text[] =
        "Usage: " PROG_NAME " [OPTIONS] DATABASE...\n\n"
        "Options:\n"
        " -a show all terms in the database\n"
        " -A <prefix> show all terms in the database with given prefix\n"
        " -r <recno> for term list(s)\n"
        " -t <term> for posting list(s)\n"
        " -t <term> -r <recno> for position list(s)\n"
        " -s, --stemmer=LANG set the stemming language, the default is 'none'\n"
        " -1 output one list entry per line\n"
        " -V[<type>]<valueno> output value valueno for each document referred to\n"
        " (or each document in the database if no -r options).\n"
        " <type> can be:\n"
        " E: escape in a C-like way (default)\n"
        " I: decode as a packed integer\n"
        " R: show the raw value (which may contain binary data,\n"
        " newlines, invalid UTF-8, etc)\n"
        " S: decode using Xapian::sortable_unserialise()\n"
        " -V[<type>] output all values for each document referred to.\n"
        " <type> is as above.\n"
        " -d output document data for each document referred to\n"
        " -z for db, count documents with length 0\n"
        " -v extra info (wdf and len for postlist;\n"
        " wdf and termfreq for termlist; number of terms for db;\n"
        " termfreq when showing all terms)\n"
        " -vv even more info (also show collection freq and wdf\n"
        " upper bound for terms)\n"
        " --help display this help and exit\n"
        " --version output version information and exit";
    std::cout << usage_text << std::endl;
}
93
94 static void
show_db_stats(Database & db)95 show_db_stats(Database &db)
96 {
97 // Display a few database stats.
98 cout << "UUID = " << db.get_uuid() << endl;
99 cout << "number of documents = " << db.get_doccount() << endl;
100 cout << "average document length = " << db.get_avlength() << endl;
101 cout << "document length lower bound = " << db.get_doclength_lower_bound()
102 << endl;
103 cout << "document length upper bound = " << db.get_doclength_upper_bound()
104 << endl;
105 cout << "highest document id ever used = " << db.get_lastdocid() << endl;
106 cout << boolalpha;
107 cout << "has positional information = " << db.has_positions() << endl;
108 cout << "revision = ";
109 if (db.size() > 1) {
110 cout << "N/A (sharded DB)\n";
111 } else {
112 try {
113 cout << db.get_revision() << endl;
114 } catch (const Xapian::InvalidOperationError& e) {
115 cout << e.get_description() << endl;
116 } catch (const Xapian::UnimplementedError& e) {
117 cout << "N/A (" << e.get_msg() << ")\n";
118 }
119 }
120 cout << "currently open for writing = ";
121 try {
122 cout << db.locked() << endl;
123 } catch (const Xapian::Error& e) {
124 cout << e.get_description() << endl;
125 }
126
127 if (count_zero_length_docs) {
128 Xapian::doccount empty_docs = 0;
129 if (db.get_total_length() == 0) {
130 // All documents are empty.
131 empty_docs = db.get_doccount();
132 } else {
133 Xapian::PostingIterator d = db.postlist_begin(string());
134 while (d != db.postlist_end(string())) {
135 if (d.get_doclength() == 0)
136 ++empty_docs;
137 ++d;
138 }
139 }
140 cout << "number of zero-length documents = " << empty_docs << endl;
141 }
142
143 if (verbose) {
144 // To find the number of terms, we have to count them!
145 // This will take a few seconds or minutes, so only do it if -v
146 // was specified.
147 termcount terms = 0;
148 TermIterator t = db.allterms_begin();
149 while (t != db.allterms_end()) {
150 ++terms;
151 ++t;
152 }
153 cout << "number of distinct terms = " << terms << endl;
154 }
155 }
156
157 static void
decode_and_show_value(const string & value)158 decode_and_show_value(const string& value)
159 {
160 switch (value_decode) {
161 case VALUE_ESCAPE: {
162 string esc;
163 description_append(esc, value);
164 cout << esc;
165 break;
166 }
167 case VALUE_SORTABLE_SERIALISE:
168 cout << Xapian::sortable_unserialise(value);
169 break;
170 case VALUE_PACKED_INT: {
171 unsigned long long i = 0;
172 for (unsigned char ch : value) {
173 i = (i << 8) | ch;
174 }
175 cout << i;
176 break;
177 }
178 default: // VALUE_RAW
179 cout << value;
180 break;
181 }
182 }
183
184 static void
show_values(Database & db,docid docid,char sep)185 show_values(Database &db, docid docid, char sep)
186 {
187 Document doc = db.get_document(docid);
188 ValueIterator v = doc.values_begin();
189 while (v != doc.values_end()) {
190 cout << sep << v.get_valueno() << ':';
191 decode_and_show_value(*v);
192 ++v;
193 }
194 }
195
196 static void
show_values(Database & db,vector<docid>::const_iterator i,vector<docid>::const_iterator end)197 show_values(Database &db,
198 vector<docid>::const_iterator i,
199 vector<docid>::const_iterator end)
200 {
201 while (i != end) {
202 cout << "Values for record #" << *i << ':';
203 show_values(db, *i, separator);
204 cout << endl;
205 ++i;
206 }
207 }
208
209 static void
show_value(Database & db,vector<docid>::const_iterator i,vector<docid>::const_iterator end,Xapian::valueno slot)210 show_value(Database &db,
211 vector<docid>::const_iterator i,
212 vector<docid>::const_iterator end,
213 Xapian::valueno slot)
214 {
215 while (i != end) {
216 Xapian::docid did = *i;
217 cout << "Value " << slot << " for record #" << did << ": ";
218 decode_and_show_value(db.get_document(did).get_value(slot));
219 cout << endl;
220 ++i;
221 }
222 }
223
224 static void
show_docdata(Database & db,docid docid,char sep)225 show_docdata(Database &db, docid docid, char sep)
226 {
227 cout << sep << "[" << db.get_document(docid).get_data() << ']';
228 }
229
230 static void
show_docdata(Database & db,vector<docid>::const_iterator i,vector<docid>::const_iterator end)231 show_docdata(Database &db,
232 vector<docid>::const_iterator i,
233 vector<docid>::const_iterator end)
234 {
235 while (i != end) {
236 cout << "Data for record #" << *i << ':' << endl;
237 cout << db.get_document(*i).get_data() << endl;
238 ++i;
239 }
240 }
241
242 static void
show_termlist(const Database & db,Xapian::docid did,const char * all_pfx=NULL)243 show_termlist(const Database &db, Xapian::docid did,
244 const char * all_pfx = NULL)
245 {
246 TermIterator t, tend;
247 if (all_pfx) {
248 t = db.allterms_begin(all_pfx);
249 tend = db.allterms_end(all_pfx);
250 cout << "All terms in database";
251 if (all_pfx[0])
252 cout << " with prefix \"" << all_pfx << "\"";
253 } else {
254 t = db.termlist_begin(did);
255 tend = db.termlist_end(did);
256 cout << "Term List for record #" << did;
257 }
258 if (verbose) {
259 cout << " (";
260 if (did != 0)
261 cout << "wdf, ";
262 cout << "termfreq";
263 if (verbose > 1)
264 cout << ", collection freq, wdf upper bound";
265 cout << ')';
266 }
267 cout << ':';
268
269 while (t != tend) {
270 const string & term = *t;
271 cout << separator << term;
272 if (verbose) {
273 if (did != 0)
274 cout << ' ' << t.get_wdf();
275 cout << ' ' << t.get_termfreq();
276 if (verbose > 1) {
277 cout << ' ' << db.get_collection_freq(term)
278 << ' ' << db.get_wdf_upper_bound(term);
279 }
280 }
281 ++t;
282 }
283 cout << endl;
284 }
285
286 static void
show_termlists(Database & db,vector<docid>::const_iterator i,vector<docid>::const_iterator end)287 show_termlists(Database &db,
288 vector<docid>::const_iterator i,
289 vector<docid>::const_iterator end)
290 {
291 // Display termlists
292 while (i != end) {
293 show_termlist(db, *i);
294 ++i;
295 }
296 }
297
298 int
main(int argc,char ** argv)299 main(int argc, char **argv) try {
300 if (argc > 1 && argv[1][0] == '-') {
301 if (strcmp(argv[1], "--help") == 0) {
302 cout << PROG_NAME " - " PROG_DESC "\n\n";
303 show_usage();
304 exit(0);
305 }
306 if (strcmp(argv[1], "--version") == 0) {
307 cout << PROG_NAME " - " PACKAGE_STRING << endl;
308 exit(0);
309 }
310 }
311
312 const char * all_terms = NULL;
313 vector<docid> recnos;
314 vector<string> terms;
315 vector<string> dbs;
316 Stem stemmer;
317
318 valueno slot = 0; // Avoid "may be used uninitialised" warnings.
319 bool slot_set = false;
320
321 int c;
322 while ((c = gnu_getopt(argc, argv, "aA:r:t:s:1vV::dz")) != -1) {
323 switch (c) {
324 case 'a':
325 all_terms = "";
326 break;
327 case 'A':
328 all_terms = optarg;
329 break;
330 case 'r': {
331 char * end;
332 errno = 0;
333 unsigned long n = strtoul(optarg, &end, 10);
334 if (optarg == end || *end) {
335 cout << "Non-numeric document id: " << optarg << endl;
336 exit(1);
337 }
338 Xapian::docid did(n);
339 if (errno == ERANGE || n == 0 || did != n) {
340 cout << "Document id out of range: " << optarg << endl;
341 exit(1);
342 }
343 recnos.push_back(did);
344 break;
345 }
346 case 't':
347 terms.push_back(optarg);
348 break;
349 case 's':
350 stemmer = Stem(optarg);
351 break;
352 case '1':
353 separator = '\n';
354 break;
355 case 'V':
356 if (optarg) {
357 switch (*optarg) {
358 case 'R':
359 value_decode = VALUE_RAW;
360 ++optarg;
361 break;
362 case 'I':
363 value_decode = VALUE_PACKED_INT;
364 ++optarg;
365 break;
366 case 'S':
367 value_decode = VALUE_SORTABLE_SERIALISE;
368 ++optarg;
369 break;
370 case 'E':
371 value_decode = VALUE_ESCAPE;
372 ++optarg;
373 break;
374 }
375 char * end;
376 errno = 0;
377 unsigned long n = strtoul(optarg, &end, 10);
378 if (optarg == end || *end) {
379 cout << "Non-numeric value slot: " << optarg << endl;
380 exit(1);
381 }
382 slot = Xapian::valueno(n);
383 if (errno == ERANGE || slot != n) {
384 cout << "Value slot out of range: " << optarg << endl;
385 exit(1);
386 }
387 slot_set = true;
388 } else {
389 showvalues = true;
390 }
391 break;
392 case 'd':
393 showdocdata = true;
394 break;
395 case 'v':
396 ++verbose;
397 break;
398 case 'z':
399 count_zero_length_docs = true;
400 break;
401 default:
402 show_usage();
403 exit(1);
404 }
405 }
406
407 while (argv[optind]) dbs.push_back(argv[optind++]);
408
409 if (dbs.empty()) {
410 show_usage();
411 exit(1);
412 }
413
414 std::sort(recnos.begin(), recnos.end());
415
416 Database db;
417 {
418 vector<string>::const_iterator i;
419 for (i = dbs.begin(); i != dbs.end(); ++i) {
420 try {
421 db.add_database(Database(*i));
422 } catch (const Error &e) {
423 cerr << "Error opening database '" << *i << "': ";
424 cerr << e.get_description() << endl;
425 return 1;
426 }
427 }
428 }
429
430 if (!all_terms && terms.empty() && recnos.empty() && !slot_set) {
431 // Show some statistics about the database.
432 show_db_stats(db);
433 return 0;
434 }
435
436 if (all_terms) {
437 show_termlist(db, 0, all_terms);
438 }
439
440 if (!recnos.empty()) {
441 if (showvalues) {
442 show_values(db, recnos.begin(), recnos.end());
443 } else if (slot_set) {
444 show_value(db, recnos.begin(), recnos.end(), slot);
445 }
446
447 if (showdocdata) {
448 show_docdata(db, recnos.begin(), recnos.end());
449 }
450 } else {
451 if (slot_set) {
452 cout << "Value " << slot << " for each document:";
453 ValueIterator it = db.valuestream_begin(slot);
454 while (it != db.valuestream_end(slot)) {
455 cout << separator << it.get_docid() << ':';
456 decode_and_show_value(*it);
457 ++it;
458 }
459 cout << endl;
460 }
461 }
462
463 if (terms.empty()) {
464 show_termlists(db, recnos.begin(), recnos.end());
465 return 0;
466 }
467
468 vector<string>::const_iterator i;
469 for (i = terms.begin(); i != terms.end(); ++i) {
470 string term = stemmer(*i);
471 PostingIterator p = db.postlist_begin(term);
472 PostingIterator pend = db.postlist_end(term);
473 if (p == pend) {
474 cout << "term '" << term << "' not in database\n";
475 continue;
476 }
477 if (recnos.empty()) {
478 // Display posting list
479 cout << "Posting List for term '" << term << "' (termfreq "
480 << db.get_termfreq(term) << ", collfreq "
481 << db.get_collection_freq(term) << ", wdf_max "
482 << db.get_wdf_upper_bound(term) << "):";
483 while (p != pend) {
484 cout << separator << *p;
485 if (verbose) {
486 cout << ' ' << p.get_wdf() << ' ' << p.get_doclength();
487 }
488 if (showvalues) show_values(db, *p, ' ');
489 if (showdocdata) show_docdata(db, *p, ' ');
490 ++p;
491 }
492 cout << endl;
493 } else {
494 // Display position lists
495 vector<docid>::const_iterator j;
496 for (j = recnos.begin(); j != recnos.end(); ++j) {
497 p.skip_to(*j);
498 if (p == pend || *p != *j) {
499 cout << "term '" << term <<
500 "' doesn't index document #" << *j << endl;
501 } else {
502 cout << "Position List for term '" << term
503 << "', record #" << *j << ':';
504 try {
505 PositionIterator pos = p.positionlist_begin();
506 while (pos != p.positionlist_end()) {
507 cout << separator << *pos;
508 ++pos;
509 }
510 cout << endl;
511 } catch (const Error &e) {
512 cerr << "Error: " << e.get_description() << endl;
513 }
514 }
515 }
516 }
517 }
518 } catch (const Error &e) {
519 cerr << "\nError: " << e.get_description() << endl;
520 return 1;
521 }
522