1 /** @file
2 * @brief index arbitrary data as described by an index script
3 */
4 /* Copyright 1999,2000,2001 BrightStation PLC
5 * Copyright 2001 Sam Liddicott
6 * Copyright 2001,2002 Ananova Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014,2015,2017,2018 Olly Betts
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License as
11 * published by the Free Software Foundation; either version 2 of the
12 * License, or (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
22 * USA
23 */
24
25 #include <config.h>
26
27 #include <xapian.h>
28
29 #include <algorithm>
30 #include <fstream>
31 #include <iostream>
32 #include <list>
33 #include <map>
34 #include <memory>
35 #include <string>
36 #include <unordered_set>
37 #include <vector>
38 #include <cstring>
39
40 #include <cerrno>
41 #include <cstdio>
42 #include <cstdlib>
43 #include <ctime>
44
45 #include "commonhelp.h"
46 #include "hashterm.h"
47 #include "loadfile.h"
48 #include "myhtmlparse.h"
49 #include "parseint.h"
50 #include "setenv.h"
51 #include "str.h"
52 #include "stringutils.h"
53 #include "timegm.h"
54 #include "utf8truncate.h"
55 #include "utils.h"
56 #include "values.h"
57
58 #ifndef HAVE_STRPTIME
59 #include "portability/strptime.h"
60 #endif
61
62 #include "gnu_getopt.h"
63
64 using namespace std;
65
66 #define PROG_NAME "scriptindex"
67 #define PROG_DESC "index arbitrary data as described by an index script"
68
69 static bool verbose;
70 static int addcount;
71 static int repcount;
72 static int delcount;
73
74 static inline bool
prefix_needs_colon(const string & prefix,unsigned ch)75 prefix_needs_colon(const string & prefix, unsigned ch)
76 {
77 if (!C_isupper(ch) && ch != ':') return false;
78 string::size_type len = prefix.length();
79 return (len > 1 && prefix[len - 1] != ':');
80 }
81
// Printable names of the index actions.  This table is indexed by
// Action::type values (see DUMP_ACTION and the "has no effect" warnings in
// parse_index_script()), so entries must stay in the same order as that enum.
const char * action_names[] = {
    "bad", "new",
    "boolean", "date", "field", "gap", "hash", "hextobin", "index",
    "indexnopos", "load", "lower", "parsedate", "spell", "split", "truncate",
    "unhtml", "unique", "value", "valuenumeric", "valuepacked", "weight"
};
88
// For debugging: write Action A to stdout as "name(string_arg,num_arg)"
// followed by a newline.
#define DUMP_ACTION(A) cout << action_names[(A).get_action()] << "(" << (A).get_string_arg() << "," << (A).get_num_arg() << ")" << endl
91
/** A single step from an index script rule.
 *
 *  Holds the kind of action, an optional string argument, an integer
 *  argument, and the offset of the action within its index script line
 *  (used when reporting diagnostics).
 */
class Action {
  public:
    /// The kind of action.
    enum type {
        BAD, NEW,
        BOOLEAN, DATE, FIELD, GAP, HASH, HEXTOBIN, INDEX, INDEXNOPOS, LOAD,
        LOWER, PARSEDATE, SPELL, SPLIT, TRUNCATE, UNHTML, UNIQUE, VALUE,
        VALUENUMERIC, VALUEPACKED, WEIGHT
    };

    /// Operations which can be stored as the integer argument of SPLIT.
    enum { SPLIT_NONE, SPLIT_DEDUP, SPLIT_SORT, SPLIT_PREFIXES };

    /// Action with no arguments (empty string argument, integer argument 0).
    Action(type action_, size_t pos_)
        : Action(action_, pos_, string(), 0) { }

    /// Action whose integer argument is the string argument fed to atoi().
    Action(type action_, size_t pos_, const string & arg)
        : Action(action_, pos_, arg, atoi(arg.c_str())) { }

    /// Action with explicitly specified string and integer arguments.
    Action(type action_, size_t pos_, const string & arg, int num)
        : action(action_), num_arg(num), string_arg(arg), pos(pos_) { }

    type get_action() const { return action; }

    int get_num_arg() const { return num_arg; }

    void set_num_arg(int num) { num_arg = num; }

    const string & get_string_arg() const { return string_arg; }

    size_t get_pos() const { return pos; }

  private:
    type action;
    int num_arg;
    string string_arg;
    // Offset into indexscript line.
    size_t pos;
};
122
123 // These allow searching for an Action with a particular Action::type using
124 // std::find().
125
126 inline bool
operator ==(const Action & a,Action::type t)127 operator==(const Action& a, Action::type t) { return a.get_action() == t; }
128
129 inline bool
operator ==(Action::type t,const Action & a)130 operator==(Action::type t, const Action& a) { return a.get_action() == t; }
131
132 inline bool
operator !=(const Action & a,Action::type t)133 operator!=(const Action& a, Action::type t) { return !(a == t); }
134
135 inline bool
operator !=(Action::type t,const Action & a)136 operator!=(Action::type t, const Action& a) { return !(t == a); }
137
/// Severity of a diagnostic message.
enum diag_type { DIAG_ERROR, DIAG_WARN, DIAG_NOTE };

/** Write the location prefix of a diagnostic to stderr.
 *
 *  The output has the form "FILENAME[:LINE[:COLUMN]]: SEVERITY: " and is not
 *  terminated by a newline - the caller is expected to append the message.
 *
 *  @param type      Severity to report.
 *  @param filename  Name of the file the diagnostic relates to.
 *  @param line      Line number; omitted from the output if 0.
 *  @param pos       Zero-based column; omitted from the output if
 *                   string::npos, otherwise reported one-based.
 */
static void
report_location(enum diag_type type,
                const string& filename,
                size_t line = 0,
                size_t pos = string::npos)
{
    const char* severity = "";
    switch (type) {
        case DIAG_ERROR:
            severity = ": error: ";
            break;
        case DIAG_WARN:
            severity = ": warning: ";
            break;
        case DIAG_NOTE:
            severity = ": note: ";
            break;
    }

    cerr << filename;
    if (line != 0)
        cerr << ':' << line;
    if (pos != string::npos) {
        // The first column is numbered 1.
        cerr << ':' << pos + 1;
    }
    cerr << severity;
}
166
167 static void
report_useless_action(const string & file,size_t line,size_t pos,const string & action)168 report_useless_action(const string &file, size_t line, size_t pos,
169 const string &action)
170 {
171 report_location(DIAG_WARN, file, line, pos);
172 cerr << "Index action '" << action << "' has no effect" << endl;
173
174 static bool given_left_to_right_warning = false;
175 if (!given_left_to_right_warning) {
176 given_left_to_right_warning = true;
177 report_location(DIAG_NOTE, file, line, pos);
178 cerr << "Actions are executed from left to right" << endl;
179 }
180 }
181
// The parsed index script: maps each field name to the list of actions to run
// for that field.  Populated by parse_index_script(); when a field appears in
// more than one rule, the rules' action lists are concatenated with an
// Action::NEW entry separating them.
static map<string, vector<Action>> index_spec;
183
184 static void
parse_index_script(const string & filename)185 parse_index_script(const string &filename)
186 {
187 ifstream script(filename.c_str());
188 if (!script.is_open()) {
189 report_location(DIAG_ERROR, filename);
190 cerr << strerror(errno) << endl;
191 exit(1);
192 }
193 string line;
194 size_t line_no = 0;
195 bool had_unique = false;
196 while (getline(script, line)) {
197 ++line_no;
198 vector<string> fields;
199 vector<Action> actions;
200 string::const_iterator i, j;
201 const string &s = line;
202 i = find_if(s.begin(), s.end(), [](char ch) { return !C_isspace(ch); });
203 if (i == s.end() || *i == '#') {
204 // Blank line or comment.
205 continue;
206 }
207 while (true) {
208 if (!C_isalnum(*i)) {
209 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
210 cerr << "field name must start with alphanumeric" << endl;
211 exit(1);
212 }
213 j = find_if(i, s.end(),
214 [](char ch) { return !C_isalnum(ch) && ch != '_'; });
215 fields.push_back(string(i, j));
216 i = find_if(j, s.end(), [](char ch) { return !C_isspace(ch); });
217 if (i == s.end()) break;
218 if (*i == ':') {
219 ++i;
220 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
221 break;
222 }
223 if (i == j) {
224 report_location(DIAG_ERROR, filename, line_no, i - s.begin());
225 cerr << "bad character '" << *i << "' in fieldname" << endl;
226 exit(1);
227 }
228 }
229 Xapian::termcount weight = 1;
230 size_t useless_weight_pos = string::npos;
231 map<string, Action::type> boolmap;
232 j = i;
233 while (j != s.end()) {
234 size_t action_pos = j - s.begin();
235 i = find_if(j, s.end(), [](char ch) { return !C_isalnum(ch); });
236 string action(s, j - s.begin(), i - j);
237 Action::type code = Action::BAD;
238 unsigned min_args = 0, max_args = 0;
239 bool takes_integer_argument = false;
240 if (!action.empty()) {
241 switch (action[0]) {
242 case 'b':
243 if (action == "boolean") {
244 code = Action::BOOLEAN;
245 max_args = 1;
246 }
247 break;
248 case 'd':
249 if (action == "date") {
250 code = Action::DATE;
251 min_args = max_args = 1;
252 }
253 break;
254 case 'f':
255 if (action == "field") {
256 code = Action::FIELD;
257 max_args = 1;
258 }
259 break;
260 case 'g':
261 if (action == "gap") {
262 code = Action::GAP;
263 max_args = 1;
264 takes_integer_argument = true;
265 }
266 break;
267 case 'h':
268 if (action == "hash") {
269 code = Action::HASH;
270 max_args = 1;
271 takes_integer_argument = true;
272 } else if (action == "hextobin") {
273 code = Action::HEXTOBIN;
274 }
275 break;
276 case 'i':
277 if (action == "index") {
278 code = Action::INDEX;
279 max_args = 1;
280 } else if (action == "indexnopos") {
281 code = Action::INDEXNOPOS;
282 max_args = 1;
283 }
284 break;
285 case 'l':
286 if (action == "lower") {
287 code = Action::LOWER;
288 } else if (action == "load") {
289 code = Action::LOAD;
290 }
291 break;
292 case 'p':
293 if (action == "parsedate") {
294 code = Action::PARSEDATE;
295 min_args = max_args = 1;
296 }
297 break;
298 case 's':
299 if (action == "spell") {
300 code = Action::SPELL;
301 } else if (action == "split") {
302 code = Action::SPLIT;
303 min_args = 1;
304 max_args = 2;
305 }
306 break;
307 case 't':
308 if (action == "truncate") {
309 code = Action::TRUNCATE;
310 min_args = max_args = 1;
311 takes_integer_argument = true;
312 }
313 break;
314 case 'u':
315 if (action == "unhtml") {
316 code = Action::UNHTML;
317 } else if (action == "unique") {
318 code = Action::UNIQUE;
319 min_args = max_args = 1;
320 }
321 break;
322 case 'v':
323 if (action == "value") {
324 code = Action::VALUE;
325 min_args = max_args = 1;
326 takes_integer_argument = true;
327 } else if (action == "valuenumeric") {
328 code = Action::VALUENUMERIC;
329 min_args = max_args = 1;
330 takes_integer_argument = true;
331 } else if (action == "valuepacked") {
332 code = Action::VALUEPACKED;
333 min_args = max_args = 1;
334 takes_integer_argument = true;
335 }
336 break;
337 case 'w':
338 if (action == "weight") {
339 code = Action::WEIGHT;
340 min_args = max_args = 1;
341 takes_integer_argument = true;
342 }
343 break;
344 }
345 }
346 if (code == Action::BAD) {
347 report_location(DIAG_ERROR, filename, line_no, action_pos);
348 cerr << "Unknown index action '" << action << "'" << endl;
349 exit(1);
350 }
351 auto i_after_action = i;
352 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
353
354 if (i != s.end() && *i == '=') {
355 if (i != i_after_action) {
356 report_location(DIAG_WARN, filename, line_no,
357 i_after_action - s.begin());
358 cerr << "putting spaces between the action and '=' is "
359 "deprecated." << endl;
360 }
361
362 if (max_args == 0) {
363 report_location(DIAG_ERROR, filename, line_no,
364 i - s.begin());
365 cerr << "Index action '" << action
366 << "' doesn't take an argument" << endl;
367 exit(1);
368 }
369
370 ++i;
371 j = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
372 if (i != j) {
373 report_location(DIAG_WARN, filename, line_no,
374 i - s.begin());
375 cerr << "putting spaces between '=' and the argument is "
376 "deprecated." << endl;
377 }
378
379 vector<string> vals;
380 while (true) {
381 if (j != s.end() && *j == '"') {
382 // Quoted argument.
383 ++j;
384 string arg;
385 while (true) {
386 i = find_if(j, s.end(),
387 [](char ch) {
388 return ch == '"' || ch == '\\';
389 });
390 if (i == s.end()) {
391 report_location(DIAG_ERROR, filename, line_no,
392 s.size());
393 cerr << "No closing quote" << endl;
394 exit(1);
395 }
396 arg.append(j, i);
397 if (*i++ == '"')
398 break;
399
400 // Escape sequence.
401 if (i == s.end()) {
402 bad_escaping:
403 report_location(DIAG_ERROR, filename, line_no,
404 i - s.begin());
405 cerr << "Bad escaping in quoted action argument"
406 << endl;
407 exit(1);
408 }
409
410 char ch = *i;
411 switch (ch) {
412 case '\\':
413 case '"':
414 break;
415 case '0':
416 ch = '\0';
417 break;
418 case 'n':
419 ch = '\n';
420 break;
421 case 'r':
422 ch = '\r';
423 break;
424 case 't':
425 ch = '\t';
426 break;
427 case 'x': {
428 if (++i == s.end())
429 goto bad_escaping;
430 char ch1 = *i;
431 if (++i == s.end())
432 goto bad_escaping;
433 char ch2 = *i;
434 if (!C_isxdigit(ch1) ||
435 !C_isxdigit(ch2))
436 goto bad_escaping;
437 ch = hex_digit(ch1) << 4 |
438 hex_digit(ch2);
439 break;
440 }
441 default:
442 goto bad_escaping;
443 }
444 arg += ch;
445 j = i + 1;
446 }
447 vals.emplace_back(std::move(arg));
448 if (i == s.end() || C_isspace(*i)) break;
449 if (*i != ',') {
450 report_location(DIAG_ERROR, filename, line_no,
451 i - s.begin());
452 cerr << "Unexpected character '" << *i
453 << "' after closing quote" << endl;
454 exit(1);
455 }
456 ++i;
457 } else if (max_args > 1) {
458 // Unquoted argument, split on comma.
459 i = find_if(j, s.end(),
460 [](char ch) {
461 return C_isspace(ch) || ch == ',';
462 });
463 vals.emplace_back(j, i);
464 if (*i != ',') break;
465 ++i;
466 } else {
467 // Unquoted argument, including any commas.
468 i = find_if(j, s.end(),
469 [](char ch) { return C_isspace(ch); });
470 vals.emplace_back(j, i);
471 break;
472 }
473 j = i;
474
475 if (vals.size() == max_args) {
476 report_location(DIAG_ERROR, filename, line_no,
477 i - s.begin());
478 cerr << "Index action '" << action
479 << "' takes at most " << max_args << " arguments"
480 << endl;
481 exit(1);
482 }
483 }
484
485 if (vals.size() < min_args) {
486 report_location(DIAG_ERROR, filename, line_no,
487 i - s.begin());
488 if (min_args == max_args) {
489 cerr << "Index action '" << action
490 << "' requires " << min_args << " arguments"
491 << endl;
492 exit(1);
493 }
494 cerr << "Index action '" << action
495 << "' requires at least " << min_args << " arguments"
496 << endl;
497 exit(1);
498 }
499
500 string val;
501 if (!vals.empty()) {
502 val = vals.front();
503 }
504
505 if (takes_integer_argument) {
506 auto dot = val.find('.');
507 if (dot != string::npos) {
508 report_location(DIAG_WARN, filename, line_no,
509 j - s.begin() + dot);
510 cerr << "Index action '" << action
511 << "' takes an integer argument" << endl;
512 }
513 }
514 switch (code) {
515 case Action::DATE:
516 if (val != "unix" &&
517 val != "unixutc" &&
518 val != "yyyymmdd") {
519 report_location(DIAG_ERROR, filename, line_no);
520 cerr << "Invalid parameter '" << val << "' for "
521 "action 'date'" << endl;
522 exit(1);
523 }
524 actions.emplace_back(code, action_pos, val);
525 break;
526 case Action::INDEX:
527 case Action::INDEXNOPOS:
528 actions.emplace_back(code, action_pos, val, weight);
529 useless_weight_pos = string::npos;
530 break;
531 case Action::WEIGHT:
532 // We don't push an Action for WEIGHT - instead we
533 // store it ready to use in the INDEX and INDEXNOPOS
534 // Actions.
535 weight = atoi(val.c_str());
536 if (useless_weight_pos != string::npos) {
537 report_useless_action(filename, line_no,
538 useless_weight_pos, action);
539 }
540 useless_weight_pos = action_pos;
541 break;
542 case Action::PARSEDATE: {
543 if (val.find("%Z") != val.npos) {
544 report_location(DIAG_ERROR, filename, line_no);
545 cerr << "Parsing timezone names with %Z is not supported" << endl;
546 exit(1);
547 }
548 #ifndef HAVE_STRUCT_TM_TM_GMTOFF
549 if (val.find("%z") != val.npos) {
550 report_location(DIAG_ERROR, filename, line_no);
551 cerr << "Parsing timezone offsets with %z is not supported on "
552 "this platform" << endl;
553 exit(1);
554 }
555 #endif
556 actions.emplace_back(code, action_pos, val);
557 break;
558 }
559 case Action::SPLIT: {
560 if (val.empty()) {
561 report_location(DIAG_ERROR, filename, line_no);
562 cerr << "Split delimiter can't be empty" << endl;
563 exit(1);
564 }
565 int operation = Action::SPLIT_NONE;
566 if (vals.size() >= 2) {
567 if (vals[1] == "dedup") {
568 operation = Action::SPLIT_DEDUP;
569 } else if (vals[1] == "sort") {
570 operation = Action::SPLIT_SORT;
571 } else if (vals[1] == "none") {
572 operation = Action::SPLIT_NONE;
573 } else if (vals[1] == "prefixes") {
574 operation = Action::SPLIT_PREFIXES;
575 } else {
576 report_location(DIAG_ERROR, filename, line_no);
577 cerr << "Bad split operation '" << vals[1]
578 << "'" << endl;
579 exit(1);
580 }
581 }
582 actions.emplace_back(code, action_pos, val, operation);
583 break;
584 }
585 case Action::TRUNCATE:
586 if (!actions.empty() &&
587 actions.back().get_action() == Action::LOAD) {
588 /* Turn "load truncate=n" into "load" with
589 * num_arg n, so that we don't needlessly
590 * allocate memory and read data we're just
591 * going to ignore.
592 */
593 actions.pop_back();
594 code = Action::LOAD;
595 }
596 actions.emplace_back(code, action_pos, val);
597 break;
598 case Action::UNIQUE:
599 if (had_unique) {
600 report_location(DIAG_ERROR, filename, line_no,
601 action_pos);
602 cerr << "Index action 'unique' used more than once"
603 << endl;
604 exit(1);
605 }
606 had_unique = true;
607 if (boolmap.find(val) == boolmap.end())
608 boolmap[val] = Action::UNIQUE;
609 actions.emplace_back(code, action_pos, val);
610 break;
611 case Action::GAP: {
612 actions.emplace_back(code, action_pos, val);
613 auto& obj = actions.back();
614 auto gap_size = obj.get_num_arg();
615 if (gap_size <= 0) {
616 report_location(DIAG_ERROR, filename, line_no,
617 obj.get_pos() + 3 + 1);
618 cerr << "Index action 'gap' takes a strictly "
619 "positive integer argument" << endl;
620 exit(1);
621 }
622 break;
623 }
624 case Action::HASH: {
625 actions.emplace_back(code, action_pos, val);
626 auto& obj = actions.back();
627 auto max_length = obj.get_num_arg();
628 if (max_length < 6) {
629 report_location(DIAG_ERROR, filename, line_no,
630 obj.get_pos() + 4 + 1);
631 cerr << "Index action 'hash' takes an integer "
632 "argument which must be at least 6" << endl;
633 exit(1);
634 }
635 break;
636 }
637 case Action::BOOLEAN:
638 boolmap[val] = Action::BOOLEAN;
639 /* FALLTHRU */
640 default:
641 actions.emplace_back(code, action_pos, val);
642 }
643 i = find_if(i, s.end(), [](char ch) { return !C_isspace(ch); });
644 } else {
645 if (min_args > 0) {
646 report_location(DIAG_ERROR, filename, line_no,
647 i_after_action - s.begin());
648 if (min_args == max_args) {
649 cerr << "Index action '" << action << "' requires "
650 << min_args << " arguments" << endl;
651 exit(1);
652 }
653 cerr << "Index action '" << action << "' requires at least "
654 << min_args << " arguments" << endl;
655 exit(1);
656 }
657 if (code == Action::INDEX || code == Action::INDEXNOPOS) {
658 useless_weight_pos = string::npos;
659 actions.emplace_back(code, action_pos, "", weight);
660 } else if (code == Action::GAP) {
661 actions.emplace_back(code, action_pos, "", 100);
662 } else if (code == Action::HASH) {
663 actions.emplace_back(code, action_pos, "",
664 MAX_SAFE_TERM_LENGTH - 1);
665 } else {
666 actions.emplace_back(code, action_pos);
667 }
668 }
669 j = i;
670 }
671
672 if (useless_weight_pos != string::npos) {
673 report_useless_action(filename, line_no, useless_weight_pos,
674 "weight");
675 }
676
677 while (!actions.empty()) {
678 bool done = true;
679 Action::type action = actions.back().get_action();
680 switch (action) {
681 case Action::HASH:
682 case Action::HEXTOBIN:
683 case Action::LOWER:
684 case Action::PARSEDATE:
685 case Action::SPELL:
686 case Action::TRUNCATE:
687 case Action::UNHTML:
688 done = false;
689 report_useless_action(filename, line_no,
690 actions.back().get_pos(),
691 action_names[action]);
692 actions.pop_back();
693 break;
694 default:
695 break;
696 }
697 if (done) break;
698 }
699
700 map<string, Action::type>::const_iterator boolpfx;
701 for (boolpfx = boolmap.begin(); boolpfx != boolmap.end(); ++boolpfx) {
702 if (boolpfx->second == Action::UNIQUE) {
703 report_location(DIAG_WARN, filename, line_no);
704 cerr << "Index action 'unique=" << boolpfx->first
705 << "' without 'boolean=" << boolpfx->first << "'" << endl;
706 static bool given_doesnt_imply_boolean_warning = false;
707 if (!given_doesnt_imply_boolean_warning) {
708 given_doesnt_imply_boolean_warning = true;
709 report_location(DIAG_NOTE, filename, line_no);
710 cerr << "'unique' doesn't implicitly add a boolean term"
711 << endl;
712 }
713 }
714 }
715
716 vector<string>::const_iterator field;
717 for (field = fields.begin(); field != fields.end(); ++field) {
718 vector<Action> &v = index_spec[*field];
719 if (v.empty()) {
720 if (fields.size() == 1) {
721 // Optimise common case where there's only one fieldname
722 // for a list of actions.
723 v = std::move(actions);
724 } else {
725 v = actions;
726 }
727 } else {
728 v.emplace_back(Action::NEW, string::npos);
729 v.insert(v.end(), actions.begin(), actions.end());
730 }
731 }
732 }
733
734 if (index_spec.empty()) {
735 report_location(DIAG_ERROR, filename, line_no);
736 cerr << "No rules found in index script" << endl;
737 exit(1);
738 }
739 }
740
741 static bool
run_actions(vector<Action>::const_iterator action_it,vector<Action>::const_iterator action_end,Xapian::WritableDatabase & database,Xapian::TermGenerator & indexer,const string & old_value,bool & this_field_is_content,Xapian::Document & doc,map<string,list<string>> & fields,string & field,const char * fname,size_t line_no,Xapian::docid & docid)742 run_actions(vector<Action>::const_iterator action_it,
743 vector<Action>::const_iterator action_end,
744 Xapian::WritableDatabase& database,
745 Xapian::TermGenerator& indexer,
746 const string& old_value,
747 bool& this_field_is_content, Xapian::Document& doc,
748 map<string, list<string>>& fields,
749 string& field, const char* fname,
750 size_t line_no, Xapian::docid& docid)
751 {
752 string value = old_value;
753 while (action_it != action_end) {
754 auto& action = *action_it++;
755 switch (action.get_action()) {
756 case Action::BAD:
757 abort();
758 case Action::NEW:
759 value = old_value;
760 // We're processing the same field again - give it a reprieve.
761 this_field_is_content = true;
762 break;
763 case Action::FIELD:
764 if (!value.empty()) {
765 string f = action.get_string_arg();
766 if (f.empty()) f = field;
767 // replace newlines with spaces
768 string s = value;
769 string::size_type j = 0;
770 while ((j = s.find('\n', j)) != string::npos)
771 s[j] = ' ';
772 fields[f].push_back(s);
773 }
774 break;
775 case Action::INDEX:
776 indexer.index_text(value,
777 action.get_num_arg(),
778 action.get_string_arg());
779 break;
780 case Action::INDEXNOPOS:
781 // No positional information so phrase searching won't work.
782 // However, the database will use much less diskspace.
783 indexer.index_text_without_positions(value,
784 action.get_num_arg(),
785 action.get_string_arg());
786 break;
787 case Action::BOOLEAN: {
788 // Do nothing if there's no text.
789 if (value.empty()) break;
790
791 string term = action.get_string_arg();
792 if (prefix_needs_colon(term, value[0])) term += ':';
793 term += value;
794
795 doc.add_boolean_term(term);
796 break;
797 }
798 case Action::GAP:
799 indexer.increase_termpos(action.get_num_arg());
800 break;
801 case Action::HASH: {
802 unsigned int max_length = action.get_num_arg();
803 if (value.length() > max_length)
804 value = hash_long_term(value, max_length);
805 break;
806 }
807 case Action::HEXTOBIN: {
808 size_t len = value.length();
809 if (len & 1) {
810 report_location(DIAG_ERROR, fname, line_no);
811 cerr << "hextobin: input must have even length"
812 << endl;
813 } else {
814 string output;
815 output.reserve(len / 2);
816 for (size_t j = 0; j < len; j += 2) {
817 char a = value[j];
818 char b = value[j + 1];
819 if (!C_isxdigit(a) || !C_isxdigit(b)) {
820 report_location(DIAG_ERROR, fname, line_no);
821 cerr << "hextobin: input must be all hex "
822 "digits" << endl;
823 goto badhex;
824 }
825 char r = (hex_digit(a) << 4) | hex_digit(b);
826 output.push_back(r);
827 }
828 value = std::move(output);
829 }
830 badhex:
831 break;
832 }
833 case Action::LOWER:
834 value = Xapian::Unicode::tolower(value);
835 break;
836 case Action::LOAD: {
837 // If there's no input, just issue a warning.
838 if (value.empty()) {
839 report_location(DIAG_WARN, fname, line_no);
840 cerr << "Empty filename in LOAD action" << endl;
841 break;
842 }
843 bool truncated = false;
844 string filename = std::move(value);
845 // FIXME: Use NOATIME if we own the file or are root.
846 if (!load_file(filename, action.get_num_arg(), NOCACHE,
847 value, truncated)) {
848 report_location(DIAG_ERROR, fname, line_no);
849 cerr << "Couldn't load file '" << filename << "': "
850 << strerror(errno) << endl;
851 value.resize(0);
852 break;
853 }
854 if (!truncated) break;
855 }
856 /* FALLTHRU */
857 case Action::TRUNCATE:
858 utf8_truncate(value, action.get_num_arg());
859 break;
860 case Action::SPELL:
861 indexer.set_flags(indexer.FLAG_SPELLING);
862 break;
863 case Action::SPLIT: {
864 // Find the end of the actions which split should execute.
865 auto split_end = find(action_it, action_end, Action::NEW);
866
867 int split_type = action.get_num_arg();
868 if (value.empty()) {
869 // Nothing to do.
870 } else if (split_type != Action::SPLIT_SORT) {
871 // Generate split as we consume it.
872 const string& delimiter = action.get_string_arg();
873
874 unique_ptr<unordered_set<string>> seen;
875 if (split_type == Action::SPLIT_DEDUP) {
876 seen.reset(new unordered_set<string>);
877 }
878
879 if (delimiter.size() == 1) {
880 // Special case for common single character delimiter.
881 char ch = delimiter[0];
882 string::size_type i = 0;
883 while (true) {
884 string::size_type j = value.find(ch, i);
885 if (split_type == Action::SPLIT_PREFIXES) {
886 if (j > 0) {
887 string val(value, 0, j);
888 run_actions(action_it, split_end,
889 database, indexer,
890 val,
891 this_field_is_content, doc,
892 fields,
893 field, fname, line_no,
894 docid);
895 }
896 } else if (i != j) {
897 string val(value, i, j - i);
898 if (!seen.get() || seen->insert(val).second) {
899 run_actions(action_it, split_end,
900 database, indexer,
901 val,
902 this_field_is_content, doc,
903 fields,
904 field, fname, line_no,
905 docid);
906 }
907 }
908 if (j == string::npos) break;
909 i = j + 1;
910 }
911 } else {
912 string::size_type i = 0;
913 while (true) {
914 string::size_type j = value.find(delimiter, i);
915 if (split_type == Action::SPLIT_PREFIXES) {
916 if (j > 0) {
917 string val(value, 0, j);
918 run_actions(action_it, split_end,
919 database, indexer,
920 val,
921 this_field_is_content, doc,
922 fields,
923 field, fname, line_no,
924 docid);
925 }
926 } else if (i != j) {
927 string val(value, i, j - i);
928 if (!seen.get() || seen->insert(val).second) {
929 run_actions(action_it, split_end,
930 database, indexer,
931 val,
932 this_field_is_content, doc,
933 fields,
934 field, fname, line_no,
935 docid);
936 }
937 }
938 if (j == string::npos) break;
939 i = j + delimiter.size();
940 }
941 }
942 } else {
943 vector<string> split_values;
944 const string& delimiter = action.get_string_arg();
945 if (delimiter.size() == 1) {
946 // Special case for common single character delimiter.
947 char ch = delimiter[0];
948 string::size_type i = 0;
949 while (true) {
950 string::size_type j = value.find(ch, i);
951 if (i != j) {
952 split_values.emplace_back(value, i, j - i);
953 }
954 if (j == string::npos) break;
955 i = j + 1;
956 }
957 } else {
958 string::size_type i = 0;
959 while (true) {
960 string::size_type j = value.find(delimiter, i);
961 if (i != j) {
962 split_values.emplace_back(value, i, j - i);
963 }
964 if (j == string::npos) break;
965 i = j + delimiter.size();
966 }
967 }
968
969 sort(split_values.begin(), split_values.end());
970
971 for (auto&& val : split_values) {
972 run_actions(action_it, split_end,
973 database, indexer, val,
974 this_field_is_content, doc, fields,
975 field, fname, line_no,
976 docid);
977 }
978 }
979
980 action_it = split_end;
981 break;
982 }
983 case Action::UNHTML: {
984 MyHtmlParser p;
985 try {
986 // Default HTML character set is latin 1, though
987 // not specifying one is deprecated these days.
988 p.parse_html(value, "iso-8859-1", false);
989 } catch (const string & newcharset) {
990 p.reset();
991 p.parse_html(value, newcharset, true);
992 }
993 if (p.indexing_allowed)
994 value = p.dump;
995 else
996 value = "";
997 break;
998 }
999 case Action::UNIQUE: {
1000 // If there's no text, just issue a warning.
1001 if (value.empty()) {
1002 report_location(DIAG_WARN, fname, line_no);
1003 cerr << "Ignoring UNIQUE action on empty text"
1004 << endl;
1005 break;
1006 }
1007
1008 // Ensure that the value of this field is unique.
1009 // If a record already exists with the same value,
1010 // it will be replaced with the new record.
1011
1012 // Unique fields aren't considered content - if
1013 // there are no other fields in the document, the
1014 // document is to be deleted.
1015 this_field_is_content = false;
1016
1017 // Argument is the prefix to add to the field value
1018 // to get the unique term.
1019 string t = action.get_string_arg();
1020 if (prefix_needs_colon(t, value[0])) t += ':';
1021 t += value;
1022 Xapian::PostingIterator p = database.postlist_begin(t);
1023 if (p != database.postlist_end(t)) {
1024 docid = *p;
1025 }
1026 break;
1027 }
1028 case Action::VALUE:
1029 if (!value.empty())
1030 doc.add_value(action.get_num_arg(), value);
1031 break;
1032 case Action::VALUENUMERIC: {
1033 if (value.empty()) break;
1034 char * end;
1035 double dbl = strtod(value.c_str(), &end);
1036 if (*end) {
1037 report_location(DIAG_WARN, fname, line_no);
1038 cerr << "Trailing characters in VALUENUMERIC: '"
1039 << value << "'" << endl;
1040 }
1041 doc.add_value(action.get_num_arg(),
1042 Xapian::sortable_serialise(dbl));
1043 break;
1044 }
1045 case Action::VALUEPACKED: {
1046 uint32_t word = 0;
1047 if (value.empty() || !C_isdigit(value[0])) {
1048 // strtoul() accepts leading whitespace and negated
1049 // values, neither of which we want to allow.
1050 errno = EINVAL;
1051 } else {
1052 errno = 0;
1053 char* q;
1054 word = strtoul(value.c_str(), &q, 10);
1055 if (!errno && *q != '\0') {
1056 // Trailing characters after converted value.
1057 errno = EINVAL;
1058 }
1059 }
1060 if (errno) {
1061 report_location(DIAG_WARN, fname, line_no);
1062 cerr << "valuepacked \"" << value << "\" ";
1063 if (errno == ERANGE) {
1064 cerr << "out of range";
1065 } else {
1066 cerr << "not an unsigned integer";
1067 }
1068 cerr << endl;
1069 }
1070 int valueslot = action.get_num_arg();
1071 doc.add_value(valueslot, int_to_binary_string(word));
1072 break;
1073 }
1074 case Action::DATE: {
1075 // Do nothing for empty input.
1076 if (value.empty()) break;
1077
1078 const string & type = action.get_string_arg();
1079 string yyyymmdd;
1080 if (type == "unix") {
1081 time_t t;
1082 if (!parse_signed(value.c_str(), t)) {
1083 report_location(DIAG_WARN, fname, line_no);
1084 cerr << "Date value (in secs) for action DATE "
1085 "must be an integer - ignoring" << endl;
1086 break;
1087 }
1088 struct tm *tm = localtime(&t);
1089 int y = tm->tm_year + 1900;
1090 int m = tm->tm_mon + 1;
1091 yyyymmdd = date_to_string(y, m, tm->tm_mday);
1092 } else if (type == "unixutc") {
1093 time_t t;
1094 if (!parse_signed(value.c_str(), t)) {
1095 report_location(DIAG_WARN, fname, line_no);
1096 cerr << "Date value (in secs) for action DATE "
1097 "must be an integer - ignoring" << endl;
1098 break;
1099 }
1100 struct tm *tm = gmtime(&t);
1101 int y = tm->tm_year + 1900;
1102 int m = tm->tm_mon + 1;
1103 yyyymmdd = date_to_string(y, m, tm->tm_mday);
1104 } else if (type == "yyyymmdd") {
1105 if (value.length() != 8) {
1106 report_location(DIAG_WARN, fname, line_no);
1107 cerr << "date=yyyymmdd expects an 8 character value "
1108 "- ignoring" << endl;
1109 break;
1110 }
1111 yyyymmdd = value;
1112 }
1113
1114 // Date (YYYYMMDD)
1115 doc.add_boolean_term("D" + yyyymmdd);
1116 yyyymmdd.resize(6);
1117 // Month (YYYYMM)
1118 doc.add_boolean_term("M" + yyyymmdd);
1119 yyyymmdd.resize(4);
1120 // Year (YYYY)
1121 doc.add_boolean_term("Y" + yyyymmdd);
1122 break;
1123 }
1124 case Action::PARSEDATE: {
1125 string dateformat = action.get_string_arg();
1126 struct tm tm;
1127 memset(&tm, 0, sizeof(tm));
1128 auto ret = strptime(value.c_str(), dateformat.c_str(), &tm);
1129 if (ret == NULL) {
1130 report_location(DIAG_WARN, fname, line_no);
1131 cerr << "\"" << value << "\" doesn't match format "
1132 "\"" << dateformat << '\"' << endl;
1133 break;
1134 }
1135
1136 if (*ret != '\0') {
1137 report_location(DIAG_WARN, fname, line_no);
1138 cerr << "\"" << value << "\" not fully matched by "
1139 "format \"" << dateformat << "\" "
1140 "(\"" << ret << "\" left over) but "
1141 "indexing anyway" << endl;
1142 }
1143 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1144 auto gmtoff = tm.tm_gmtoff;
1145 #endif
1146 auto secs_since_epoch = timegm(&tm);
1147 #ifdef HAVE_STRUCT_TM_TM_GMTOFF
1148 secs_since_epoch -= gmtoff;
1149 #endif
1150 value = str(secs_since_epoch);
1151 break;
1152 }
1153 default:
1154 /* Empty default case to avoid "unhandled enum value"
1155 * warnings. */
1156 break;
1157 }
1158 }
1159 return true;
1160 }
1161
1162 static void
index_file(const char * fname,istream & stream,Xapian::WritableDatabase & database,Xapian::TermGenerator & indexer)1163 index_file(const char *fname, istream &stream,
1164 Xapian::WritableDatabase &database, Xapian::TermGenerator &indexer)
1165 {
1166 string line;
1167 size_t line_no = 0;
1168 while (!stream.eof() && getline(stream, line)) {
1169 ++line_no;
1170 Xapian::Document doc;
1171 indexer.set_document(doc);
1172 Xapian::docid docid = 0;
1173 map<string, list<string>> fields;
1174 bool seen_content = false;
1175 while (!line.empty()) {
1176 // Cope with files from MS Windows (\r\n end of lines).
1177 // Trim multiple \r characters, since that seems the best way
1178 // to handle that case.
1179 string::size_type last = line.find_last_not_of('\r');
1180 if (last == string::npos) break;
1181 line.resize(last + 1);
1182
1183 string::size_type eq = line.find('=');
1184 if (eq == string::npos && !line.empty()) {
1185 report_location(DIAG_ERROR, fname, line_no, line.size());
1186 cerr << "expected = somewhere in this line" << endl;
1187 // FIXME: die or what?
1188 }
1189 string field(line, 0, eq);
1190 string value(line, eq + 1, string::npos);
1191 while (getline(stream, line)) {
1192 ++line_no;
1193 if (line.empty() || line[0] != '=') break;
1194 // Cope with files from MS Windows (\r\n end of lines).
1195 // Trim multiple \r characters, since that seems the best way
1196 // to handle that case.
1197 last = line.find_last_not_of('\r');
1198 // line[0] == '=', so last != string::npos.
1199 // Replace the '=' with a '\n' so we don't have to use substr.
1200 line[0] = '\n';
1201 line.resize(last + 1);
1202 value += line;
1203 }
1204
1205 // Default to not indexing spellings.
1206 indexer.set_flags(Xapian::TermGenerator::flags(0));
1207
1208 bool this_field_is_content = true;
1209 const vector<Action>& v = index_spec[field];
1210 run_actions(v.begin(), v.end(),
1211 database, indexer, value,
1212 this_field_is_content, doc, fields,
1213 field, fname, line_no,
1214 docid);
1215 if (this_field_is_content) seen_content = true;
1216 if (stream.eof()) break;
1217 }
1218
1219 // If we haven't seen any fields (other than unique identifiers)
1220 // the document is to be deleted.
1221 if (!seen_content) {
1222 if (docid) {
1223 database.delete_document(docid);
1224 if (verbose) cout << "Del: " << docid << endl;
1225 ++delcount;
1226 }
1227 } else {
1228 string data;
1229 for (auto&& i : fields) {
1230 for (auto&& field_val : i.second) {
1231 data += i.first;
1232 data += '=';
1233 data += field_val;
1234 data += '\n';
1235 }
1236 }
1237
1238 // Put the data in the document
1239 doc.set_data(data);
1240
1241 // Add the document to the database
1242 if (docid) {
1243 database.replace_document(docid, doc);
1244 if (verbose) cout << "Replace: " << docid << endl;
1245 ++repcount;
1246 } else {
1247 docid = database.add_document(doc);
1248 if (verbose) cout << "Add: " << docid << endl;
1249 ++addcount;
1250 }
1251 }
1252 }
1253
1254 // Commit after each file to make sure all changes from that file make it
1255 // in.
1256 if (verbose) cout << "Committing: " << endl;
1257 database.commit();
1258 }
1259
1260 static void
show_help(int exit_code)1261 show_help(int exit_code)
1262 {
1263 cout << PROG_NAME " - " PROG_DESC "\n"
1264 "Usage: " PROG_NAME " [OPTIONS] DATABASE INDEXER_SCRIPT [INPUT_FILE]...\n"
1265 "\n"
1266 "Creates or updates a Xapian database with the data from the input files listed\n"
1267 "on the command line. If no files are specified, data is read from stdin.\n"
1268 "\n"
1269 "See https://xapian.org/docs/omega/scriptindex.html for documentation of the\n"
1270 "format for INDEXER_SCRIPT.\n"
1271 "\n"
1272 "Options:\n"
1273 " -v, --verbose display additional messages to aid debugging\n"
1274 " --overwrite create the database anew (the default is to update if\n"
1275 " the database already exists)\n";
1276 print_stemmer_help("");
1277 print_help_and_version_help("");
1278 exit(exit_code);
1279 }
1280
1281 int
main(int argc,char ** argv)1282 main(int argc, char **argv)
1283 try {
1284 // If the database already exists, default to updating not overwriting.
1285 int database_mode = Xapian::DB_CREATE_OR_OPEN;
1286 verbose = false;
1287 Xapian::Stem stemmer("english");
1288
1289 // Without this, strptime() seems to treat formats without a timezone as
1290 // being local time, including %s.
1291 setenv("TZ", "UTC", 1);
1292
1293 constexpr auto NO_ARG = no_argument;
1294 constexpr auto REQ_ARG = required_argument;
1295 static const struct option longopts[] = {
1296 { "help", NO_ARG, NULL, 'h' },
1297 { "version", NO_ARG, NULL, 'V' },
1298 { "stemmer", REQ_ARG, NULL, 's' },
1299 { "overwrite", NO_ARG, NULL, 'o' },
1300 { "verbose", NO_ARG, NULL, 'v' },
1301 { 0, 0, NULL, 0 }
1302 };
1303
1304 int getopt_ret;
1305 while ((getopt_ret = gnu_getopt_long(argc, argv, "vs:hV",
1306 longopts, NULL)) != -1) {
1307 switch (getopt_ret) {
1308 default:
1309 show_help(1);
1310 break;
1311 case 'h': // --help
1312 show_help(0);
1313 break;
1314 case 'V': // --version
1315 print_package_info(PROG_NAME);
1316 return 0;
1317 case 'o': // --overwrite
1318 database_mode = Xapian::DB_CREATE_OR_OVERWRITE;
1319 break;
1320 case 'v':
1321 verbose = true;
1322 break;
1323 case 's':
1324 try {
1325 stemmer = Xapian::Stem(optarg);
1326 } catch (const Xapian::InvalidArgumentError &) {
1327 cerr << "Unknown stemming language '" << optarg << "'.\n";
1328 cerr << "Available language names are: "
1329 << Xapian::Stem::get_available_languages() << endl;
1330 return 1;
1331 }
1332 break;
1333 }
1334 }
1335
1336 argv += optind;
1337 argc -= optind;
1338 if (argc < 2) {
1339 show_help(1);
1340 }
1341
1342 parse_index_script(argv[1]);
1343
1344 // Open the database. If another process is currently updating the
1345 // database, wait for the lock to become available.
1346 auto flags = database_mode | Xapian::DB_RETRY_LOCK;
1347 Xapian::WritableDatabase database(argv[0], flags);
1348
1349 Xapian::TermGenerator indexer;
1350 indexer.set_stemmer(stemmer);
1351 // Set the database for spellings to be added to by the "spell" action.
1352 indexer.set_database(database);
1353
1354 addcount = 0;
1355 repcount = 0;
1356 delcount = 0;
1357
1358 if (argc == 2) {
1359 // Read from stdin.
1360 index_file("<stdin>", cin, database, indexer);
1361 } else {
1362 // Read file(s) listed on the command line.
1363 for (int i = 2; i < argc; ++i) {
1364 ifstream stream(argv[i]);
1365 if (stream) {
1366 index_file(argv[i], stream, database, indexer);
1367 } else {
1368 cerr << "Can't open file " << argv[i] << endl;
1369 }
1370 }
1371 }
1372
1373 cout << "records (added, replaced, deleted) = (" << addcount << ", "
1374 << repcount << ", " << delcount << ")" << endl;
1375 } catch (const Xapian::Error &error) {
1376 cerr << "Exception: " << error.get_description() << endl;
1377 exit(1);
1378 } catch (const std::bad_alloc &) {
1379 cerr << "Exception: std::bad_alloc" << endl;
1380 exit(1);
1381 } catch (...) {
1382 cerr << "Unknown Exception" << endl;
1383 exit(1);
1384 }
1385