1 /* query.cc: query executor for omega
2 *
3 * Copyright 1999,2000,2001 BrightStation PLC
4 * Copyright 2001 James Aylett
5 * Copyright 2001,2002 Ananova Ltd
6 * Copyright 2002 Intercede 1749 Ltd
7 * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014 Olly Betts
8 * Copyright 2008 Thomas Viehmann
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License as
12 * published by the Free Software Foundation; either version 2 of the
13 * License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
23 * USA
24 */
25
26 #include <config.h>
27
28 #include <algorithm>
29 #include <iostream>
30 #include <map>
31 #include <set>
32 #include <vector>
33
34 #include <cassert>
35 #include <cctype>
36 #include "safeerrno.h"
37 #include <stdio.h>
38 #include <cstdlib>
39 #include <cstring>
40 #include "strcasecmp.h"
41 #include <ctime>
42
43 #include "safeunistd.h"
44 #include <sys/types.h>
45 #include "safesysstat.h"
46 #include "safefcntl.h"
47
48 #include "realtime.h"
49
50 #include <cdb.h>
51
52 #include "date.h"
53 #include "datematchdecider.h"
54 #include "utils.h"
55 #include "omega.h"
56 #include "query.h"
57 #include "cgiparam.h"
58 #include "loadfile.h"
59 #include "str.h"
60 #include "stringutils.h"
61 #include "transform.h"
62 #include "urldecode.h"
63 #include "urlencode.h"
64 #include "unixperm.h"
65 #include "values.h"
66 #include "weight.h"
67 #include "expand.h"
68
69 #include <xapian.h>
70
// Fallback definition used when the Xapian headers don't supply
// XAPIAN_AT_LEAST themselves: evaluates to true when the version of Xapian
// being compiled against (major.minor.revision) is at least A.B.C.
#ifndef XAPIAN_AT_LEAST
#define XAPIAN_AT_LEAST(A,B,C) \
    (XAPIAN_MAJOR_VERSION > (A) || \
     (XAPIAN_MAJOR_VERSION == (A) && \
      (XAPIAN_MINOR_VERSION > (B) || \
       (XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
#endif
78
79 using namespace std;
80
81 using Xapian::Utf8Iterator;
82
83 using Xapian::Unicode::is_wordchar;
84
85 #ifndef SNPRINTF
86 #include <cstdarg>
87
// Stand-in for snprintf(), used when SNPRINTF isn't defined.
//
// Formats into str (a buffer of size bytes) using vsprintf() and returns the
// number of characters written, excluding the terminating NUL.  The output
// cannot be bounded up front, so overflow is detected after the event and the
// process is aborted.  Callers must pass size > 0.
static int my_snprintf(char *str, size_t size, const char *format, ...)
{
    // Plant a NUL sentinel in the last byte of the buffer; if formatting
    // reaches that far the sentinel gets overwritten.
    str[size - 1] = '\0';

    va_list args;
    va_start(args, format);
    const int written = vsprintf(str, format, args);

    const bool sentinel_clobbered = (str[size - 1] != '\0');
    if (sentinel_clobbered || written < 0 || size_t(written) >= size)
        abort(); /* Overflowed! */

    va_end(args);
    return written;
}
100 #else
101 #define my_snprintf SNPRINTF
102 #endif
103
// File-scope state shared by the query parsing, matching and formatting code
// in this file.

// NOTE(review): presumably these record whether ensure_query_parsed() and
// ensure_match() have already done their work - confirm against their
// definitions.
static bool query_parsed = false;
static bool done_query = false;
// NOTE(review): not referenced in the code visible here - presumably backs
// the $last command; confirm.
static Xapian::docid last = 0;

// Results of the match; assigned by run_query().
static Xapian::MSet mset;

// NOTE(review): keyed by docid; usage isn't visible here - confirm.
static map<Xapian::docid, bool> ticked;

static void ensure_query_parsed();
static void ensure_match();

// The query to run: parsed by set_probabilistic(), then combined with any
// boolean and date filters by run_query().
static Xapian::Query query;
//static string url_query_string;
Xapian::Query::op default_op = Xapian::Query::OP_OR; // default matching mode

// Query parser, configured by set_probabilistic() each time it is called.
static Xapian::QueryParser qp;
// Handles "size:" ranges; allocated on first use by set_probabilistic().
static Xapian::NumberValueRangeProcessor * size_vrp = NULL;
// Stemmer used for highlighting; allocated on first use by html_highlight().
static Xapian::Stem *stemmer = NULL;

static string eval_file(const string &fmtfile);

// The distinct terms of the parsed query (filled in by set_probabilistic()).
static set<string> termset;

// Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
static map<string, string> termprefix_to_userprefix;

// Tab-separated list of the distinct query terms, in the order
// set_probabilistic() first encountered them.
static string queryterms;

// Message from a query parse error; run_query() does nothing beyond building
// the query while this is non-empty.
static string error_msg;

// NOTE(review): presumably the elapsed match time reported by $time, with -1
// meaning "not measured" - confirm.
static double secs = -1;

// OmegaScript template for a log line.
// NOTE(review): presumably the default format used by $log when none is
// given - confirm.
static const char DEFAULT_LOG_ENTRY[] =
    "$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
    "[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
    "$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
    "$dbname\t"
    "$query\t"
    "$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
143
144 class MyStopper : public Xapian::Stopper {
145 public:
operator ()(const string & t) const146 bool operator()(const string &t) const {
147 switch (t[0]) {
148 case 'a':
149 return (t == "a" || t == "about" || t == "an" || t == "and" ||
150 t == "are" || t == "as" || t == "at");
151 case 'b':
152 return (t == "be" || t == "by");
153 case 'e':
154 return (t == "en");
155 case 'f':
156 return (t == "for" || t == "from");
157 case 'h':
158 return (t == "how");
159 case 'i':
160 return (t == "i" || t == "in" || t == "is" || t == "it");
161 case 'o':
162 return (t == "of" || t == "on" || t == "or");
163 case 't':
164 return (t == "that" || t == "the" || t == "this" || t == "to");
165 case 'w':
166 return (t == "was" || t == "what" || t == "when" ||
167 t == "where" || t == "which" || t == "who" ||
168 t == "why" || t == "will" || t == "with");
169 case 'y':
170 return (t == "you" || t == "your");
171 default:
172 return false;
173 }
174 }
175 };
176
177 static size_t
prefix_from_term(string & prefix,const string & term)178 prefix_from_term(string &prefix, const string &term)
179 {
180 if (term.empty()) {
181 prefix.resize(0);
182 return 0;
183 }
184 if (term[0] == 'X') {
185 const string::const_iterator begin = term.begin();
186 string::const_iterator i = begin + 1;
187 while (i != term.end() && C_isupper(*i)) ++i;
188 prefix.assign(begin, i);
189 if (i != term.end() && *i == ':') ++i;
190 return i - begin;
191 }
192
193 prefix = term[0];
194 return 1;
195 }
196
// Don't allow ".." in format names, log file names, etc as this would allow
// people to open a format "../../etc/passwd" or similar.
//
// Returns true if filename is acceptable, i.e. it contains no two
// consecutive '.' characters anywhere.
// FIXME: make this check more exact ("foo..bar" is safe)
// FIXME: log when this check fails
static bool
vet_filename(const string &filename)
{
    for (string::size_type k = 1; k < filename.size(); ++k) {
        if (filename[k - 1] == '.' && filename[k] == '.') return false;
    }
    return true;
}
207
// Classification of the current query relative to the previous one, as
// returned by set_probabilistic().
//
// Heuristics:
// * If any terms have been removed, it's a "fresh query" so we discard any
//   relevance judgements
// * If all previous terms are there but more have been added then we keep
//   the relevance judgements, but return the first page of hits
//
// NEW_QUERY      entirely new query
// SAME_QUERY     unchanged query
// EXTENDED_QUERY new query, but based on the old one
// BAD_QUERY      parse error (message in error_msg)
typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
219
220 static querytype
set_probabilistic(const string & oldp)221 set_probabilistic(const string &oldp)
222 {
223 // Parse the query string.
224 qp.set_stemmer(Xapian::Stem(option["stemmer"]));
225 qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
226 qp.set_stopper(new MyStopper());
227 qp.set_default_op(default_op);
228 qp.set_database(db);
229 // FIXME: provide a custom VRP which handles size:10..20K, etc.
230 if (!size_vrp)
231 size_vrp = new Xapian::NumberValueRangeProcessor(VALUE_SIZE, "size:",
232 true);
233 qp.add_valuerangeprocessor(size_vrp);
234 // std::map::insert() won't overwrite an existing entry, so we'll prefer
235 // the first user_prefix for which a particular term prefix is specified.
236 map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
237 for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
238 string user_prefix = pfx->first.substr(7);
239 qp.add_prefix(user_prefix, pfx->second);
240 termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
241 }
242 pfx = option.lower_bound("boolprefix,");
243 for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
244 string user_prefix = pfx->first.substr(11);
245 qp.add_boolean_prefix(user_prefix, pfx->second);
246 termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
247 }
248
249 try {
250 unsigned f = 0;
251 map<string, string>::const_iterator i = option.lower_bound("flag_");
252 for (; i != option.end() && startswith(i->first, "flag_"); ++i) {
253 if (i->second.empty()) continue;
254 const string & s = i->first;
255 switch (s[5]) {
256 case 'a':
257 if (s == "flag_auto_multiword_synonyms") {
258 f |= Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
259 break;
260 }
261 if (s == "flag_auto_synonyms") {
262 f |= Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
263 break;
264 }
265 break;
266 case 'b':
267 if (s == "flag_boolean") {
268 f |= Xapian::QueryParser::FLAG_BOOLEAN;
269 break;
270 }
271 if (s == "flag_boolean_any_case") {
272 f |= Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
273 break;
274 }
275 break;
276 #if XAPIAN_AT_LEAST(1,2,22)
277 case 'c':
278 if (s == "flag_cjk_ngram") {
279 f |= Xapian::QueryParser::FLAG_CJK_NGRAM;
280 break;
281 }
282 break;
283 #endif
284 case 'd':
285 if (s == "flag_default") {
286 f |= Xapian::QueryParser::FLAG_DEFAULT;
287 break;
288 }
289 break;
290 case 'l':
291 if (s == "flag_lovehate") {
292 f |= Xapian::QueryParser::FLAG_LOVEHATE;
293 break;
294 }
295 break;
296 case 'p':
297 if (s == "flag_partial") {
298 f |= Xapian::QueryParser::FLAG_PARTIAL;
299 break;
300 }
301 if (s == "flag_phrase") {
302 f |= Xapian::QueryParser::FLAG_PHRASE;
303 break;
304 }
305 if (s == "flag_pure_not") {
306 f |= Xapian::QueryParser::FLAG_PURE_NOT;
307 break;
308 }
309 break;
310 case 's':
311 if (s == "flag_spelling_correction") {
312 f |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
313 break;
314 }
315 if (s == "flag_synonym") {
316 f |= Xapian::QueryParser::FLAG_SYNONYM;
317 break;
318 }
319 break;
320 case 'w':
321 if (s == "flag_wildcard") {
322 f |= Xapian::QueryParser::FLAG_WILDCARD;
323 break;
324 }
325 break;
326 }
327 }
328 if (option["spelling"] == "true")
329 f |= qp.FLAG_SPELLING_CORRECTION;
330 query = qp.parse_query(query_string, f);
331 } catch (Xapian::QueryParserError &e) {
332 error_msg = e.get_msg();
333 return BAD_QUERY;
334 }
335
336 Xapian::termcount n_new_terms = 0;
337 for (Xapian::TermIterator i = query.get_terms_begin();
338 i != query.get_terms_end(); ++i) {
339 if (termset.find(*i) == termset.end()) {
340 termset.insert(*i);
341 if (!queryterms.empty()) queryterms += '\t';
342 queryterms += *i;
343 }
344 n_new_terms++;
345 }
346
347 // Check new query against the previous one
348 if (oldp.empty()) return query_string.empty() ? SAME_QUERY : NEW_QUERY;
349
350 // Long, long ago we used "word1#word2#" (with trailing #) but some broken
351 // old browsers (versions of MSIE) don't quote # in form GET submissions
352 // and everything after the # gets interpreted as an anchor. We now allow
353 // terms like `c#' so we want to avoid '#' anyway.
354 //
355 // So we switched to using "word1.word2." but that doesn't work if
356 // the terms contain "." themselves (e.g. Tapplication/vnd.ms-excel)
357 // so now we use "word1\tword2" instead (with no trailing separator).
358 //
359 // However for compatibility with templates which haven't been updated and
360 // bookmarked queries from Omega 0.9.6 and earlier we still support ".".
361 char separator = '\t';
362 unsigned int n_old_terms = count(oldp.begin(), oldp.end(), '\t') + 1;
363 if (n_old_terms == 1 && oldp[oldp.size() - 1] == '.') {
364 separator = '.';
365 n_old_terms = count(oldp.begin(), oldp.end(), '.');
366 }
367
368 // short-cut: if the new query has fewer terms, it must be a new one
369 if (n_new_terms < n_old_terms) return NEW_QUERY;
370
371 const char *term = oldp.c_str();
372 const char *pend;
373 while ((pend = strchr(term, separator)) != NULL) {
374 if (termset.find(string(term, pend - term)) == termset.end())
375 return NEW_QUERY;
376 term = pend + 1;
377 }
378 if (*term) {
379 if (termset.find(string(term)) == termset.end())
380 return NEW_QUERY;
381 }
382
383 // Use termset.size() rather than n_new_terms so we correctly handle
384 // the case when the query has repeated terms.
385 // This works wrongly in the case when the user extends the query
386 // by adding a term already in it, but that's unlikely and the behaviour
387 // isn't too bad (we just don't reset page 1). We also mishandle a few
388 // other obscure cases e.g. adding quotes to turn a query into a phrase.
389 if (termset.size() > n_old_terms) return EXTENDED_QUERY;
390 return SAME_QUERY;
391 }
392
393 static multimap<string, string> filter_map;
394
395 typedef multimap<string, string>::const_iterator FMCI;
396
add_bterm(const string & term)397 void add_bterm(const string &term) {
398 string prefix;
399 if (prefix_from_term(prefix, term) > 0)
400 filter_map.insert(multimap<string, string>::value_type(prefix, term));
401 }
402
403 static void
run_query()404 run_query()
405 {
406 bool force_boolean = false;
407 if (!filter_map.empty()) {
408 // OR together filters with the same prefix, then AND together
409 vector<Xapian::Query> filter_vec;
410 vector<string> or_vec;
411 string current;
412 for (FMCI i = filter_map.begin(); ; i++) {
413 bool over = (i == filter_map.end());
414 if (over || i->first != current) {
415 switch (or_vec.size()) {
416 case 0:
417 break;
418 case 1:
419 filter_vec.push_back(Xapian::Query(or_vec[0]));
420 break;
421 default:
422 filter_vec.push_back(Xapian::Query(Xapian::Query::OP_OR,
423 or_vec.begin(),
424 or_vec.end()));
425 break;
426 }
427 or_vec.clear();
428 if (over) break;
429 current = i->first;
430 }
431 or_vec.push_back(i->second);
432 }
433
434 Xapian::Query filter(Xapian::Query::OP_AND,
435 filter_vec.begin(), filter_vec.end());
436
437 if (query.empty()) {
438 // If no probabilistic query is provided then promote the filters
439 // to be THE query - filtering an empty query will give no
440 // matches.
441 std::swap(query, filter);
442 force_boolean = true;
443 } else {
444 query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
445 }
446 }
447
448 Xapian::MatchDecider * mdecider = NULL;
449 if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
450 MCI i = cgi_params.find("DATEVALUE");
451 if (i != cgi_params.end()) {
452 Xapian::valueno datevalue = string_to_int(i->second);
453 mdecider = new DateMatchDecider(datevalue, date_start, date_end, date_span);
454 } else {
455 Xapian::Query date_filter(Xapian::Query::OP_OR,
456 date_range_filter(date_start, date_end,
457 date_span),
458 Xapian::Query("Dlatest"));
459
460 // If no probabilistic query is provided then promote the daterange
461 // filter to be THE query instead of filtering an empty query.
462 if (query.empty()) {
463 query = date_filter;
464 } else {
465 query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
466 }
467 }
468 }
469
470 if (!enquire || !error_msg.empty()) return;
471
472 set_weighting_scheme(*enquire, option, force_boolean);
473
474 enquire->set_cutoff(threshold);
475
476 if (sort_key != Xapian::BAD_VALUENO) {
477 if (sort_after) {
478 enquire->set_sort_by_relevance_then_value(sort_key, sort_ascending);
479 } else {
480 enquire->set_sort_by_value_then_relevance(sort_key, sort_ascending);
481 }
482 }
483
484 enquire->set_docid_order(docid_order);
485
486 if (collapse) {
487 enquire->set_collapse_key(collapse_key);
488 }
489
490 if (!query.empty()) {
491 #if 0
492 // FIXME: If we start doing permissions checks based on $REMOTE_USER
493 // we're going to break some existing setups if users upgrade. We
494 // probably want a way to set this from OmegaScript.
495 const char * remote_user = getenv("REMOTE_USER");
496 if (remote_user)
497 apply_unix_permissions(query, remote_user);
498 #endif
499
500 enquire->set_query(query);
501 // We could use the value of topdoc as first parameter, but we
502 // need to know the first few items in the mset to fake a
503 // relevance set for topterms.
504 //
505 // If min_hits isn't set, check at least one extra result so we
506 // know if we've reached the end of the matches or not - then we
507 // can avoid offering a "next" button which leads to an empty page.
508 mset = enquire->get_mset(0, topdoc + hits_per_page,
509 topdoc + max(hits_per_page + 1, min_hits),
510 &rset, mdecider);
511 }
512 }
513
514 string
html_escape(const string & str)515 html_escape(const string &str)
516 {
517 string res;
518 string::size_type p = 0;
519 while (p < str.size()) {
520 char ch = str[p++];
521 switch (ch) {
522 case '<':
523 res += "<";
524 continue;
525 case '>':
526 res += ">";
527 continue;
528 case '&':
529 res += "&";
530 continue;
531 case '"':
532 res += """;
533 continue;
534 default:
535 res += ch;
536 }
537 }
538 return res;
539 }
540
541 static string
html_strip(const string & str)542 html_strip(const string &str)
543 {
544 string res;
545 string::size_type p = 0;
546 bool skip = false;
547 while (p < str.size()) {
548 char ch = str[p++];
549 switch (ch) {
550 case '<':
551 skip = true;
552 continue;
553 case '>':
554 skip = false;
555 continue;
556 default:
557 if (! skip) res += ch;
558 }
559 }
560 return res;
561 }
562
// Look word up in the tab-separated list.
//
// Returns the 0-based index of the first entry exactly equal to word, or -1
// if there isn't one.  An empty list counts as holding one empty entry.
// FIXME split list into hash or map and use that rather than linear lookup?
static int word_in_list(const string& word, const string& list)
{
    int index = 0;
    string::size_type start = 0;
    for (;;) {
        const string::size_type tab = list.find('\t', start);
        const string::size_type stop = (tab == string::npos) ? list.size() : tab;
        if (list.compare(start, stop - start, word) == 0)
            return index;
        if (tab == string::npos)
            return -1;
        start = tab + 1;
        ++index;
    }
}
582
583 // Not a character in an identifier
584 inline static bool
p_notid(unsigned int c)585 p_notid(unsigned int c)
586 {
587 return !C_isalnum(c) && c != '_';
588 }
589
590 // Not a character in an HTML tag name
591 inline static bool
p_nottag(unsigned int c)592 p_nottag(unsigned int c)
593 {
594 return !C_isalnum(c) && c != '.' && c != '-';
595 }
596
// HTML-escape s, wrapping each word which matches an entry of list in
// highlighting markup.
//
// s    - UTF-8 text to process.
// list - tab-separated list of terms to highlight.  A word matches if its
//        lower-cased form is in the list, or failing that if "Z" followed by
//        its stem is.
// bra  - markup emitted before a matching word.  If empty, a <b> tag with a
//        colour chosen from the word's index in list is used instead.
// ket  - markup emitted after a matching word.  Only used if bra is
//        non-empty; otherwise "</b>" is emitted.
//
// Returns the escaped and highlighted text.
//
// FIXME: shares algorithm with indextext.cc!
static string
html_highlight(const string &s, const string &list,
               const string &bra, const string &ket)
{
    // Create the stemmer on first use.
    if (!stemmer) {
        stemmer = new Xapian::Stem(option["stemmer"]);
    }

    string res;

    Utf8Iterator j(s);
    const Utf8Iterator s_end;
    while (true) {
        // Skip to the start of the next word; l remembers where the
        // preceding run of non-word text began.
        Utf8Iterator first = j;
        while (first != s_end && !is_wordchar(*first)) ++first;
        if (first == s_end) break;
        Utf8Iterator term_end;
        string term;
        string word;
        const char *l = j.raw();
        if (*first < 128 && C_isupper(*first)) {
            // Try to read a run of single ASCII capitals separated by '.'
            // (e.g. "U.S.A"), collecting just the letters in term.
            j = first;
            Xapian::Unicode::append_utf8(term, *j);
            while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
            }
            // Reject it if only one letter was read or a word character
            // follows directly; the text is then re-read as a plain word.
            if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
                term.resize(0);
            }
            term_end = j;
        }
        if (term.empty()) {
            // Plain word: a run of word characters, in which '&' and '\''
            // are allowed when directly followed by another word character.
            j = first;
            while (is_wordchar(*j)) {
                Xapian::Unicode::append_utf8(term, *j);
                ++j;
                if (j == s_end) break;
                if (*j == '&' || *j == '\'') {
                    Utf8Iterator next = j;
                    ++next;
                    if (next == s_end || !is_wordchar(*next)) break;
                    term += *j;
                    j = next;
                }
            }
            term_end = j;
            // Allow a short suffix of '+'/'-' characters or of '#' (e.g.
            // "c++", "c#").
            if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
                string::size_type len = term.length();
                if (*j == '#') {
                    // A run of '#' adds a single '#' to the term.
                    term += '#';
                    do { ++j; } while (j != s_end && *j == '#');
                } else {
                    while (j != s_end && (*j == '+' || *j == '-')) {
                        Xapian::Unicode::append_utf8(term, *j);
                        ++j;
                    }
                }
                // Drop the suffix if it's longer than 3 bytes or a word
                // character follows it.
                if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
                    term.resize(len);
                } else {
                    term_end = j;
                }
            }
        }
        j = term_end;
        term = Xapian::Unicode::tolower(term);
        int match = word_in_list(term, list);
        if (match == -1) {
            // Not listed as-is, so try the stemmed form ("Z" prefix).
            string stem = "Z";
            stem += (*stemmer)(term);
            match = word_in_list(stem, list);
        }
        if (match >= 0) {
            // Emit the text before the word, then the highlighted word.
            res += html_escape(string(l, first.raw() - l));
            if (!bra.empty()) {
                res += bra;
            } else {
                // Default markup: pick a background colour by the term's
                // position in the list, cycling through the palette.
                static const char * colours[] = {
                    "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
                    "990000", "009900", "996600", "006699", "990099"
                };
                size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
                const char * bg = colours[idx];
                // Colours containing an 'f' get black text, the rest white.
                if (strchr(bg, 'f')) {
                    res += "<b style=\"color:black;background-color:#";
                } else {
                    res += "<b style=\"color:white;background-color:#";
                }
                res += bg;
                res += "\">";
            }
            word.assign(first.raw(), j.raw() - first.raw());
            res += html_escape(word);
            // NB: the choice of closing markup depends on bra, not ket.
            if (!bra.empty()) {
                res += ket;
            } else {
                res += "</b>";
            }
        } else {
            // No match: emit the preceding text and the word unhighlighted.
            res += html_escape(string(l, j.raw() - l));
        }
    }
    // Emit whatever follows the last word.
    if (j != s_end) res += html_escape(string(j.raw(), j.left()));
    return res;
}
703
#if 0
// Disabled code: it relies on url_query_string, whose declaration is
// commented out near the top of this file.
//
// Writes url_query_string to cout.  If after starts with "&B=", parameters
// of the form "&B=<c>..." (where <c> is the character following "&B=" in
// after) are left out of the output.
static void
print_query_string(const char *after)
{
    if (after && strncmp(after, "&B=", 3) == 0) {
        char prefix = after[3];
        string::size_type start = 0, amp = 0;
        while (true) {
            amp = url_query_string.find('&', amp);
            if (amp == string::npos) {
                // No more parameters: output the remainder.
                cout << url_query_string.substr(start);
                return;
            }
            amp++;
            // Skip each consecutive "B=<prefix>..." parameter, first
            // outputting the text which came before it.
            while (url_query_string[amp] == 'B' &&
                   url_query_string[amp + 1] == '=' &&
                   url_query_string[amp + 2] == prefix) {
                cout << url_query_string.substr(start, amp - start - 1);
                start = url_query_string.find('&', amp + 3);
                if (start == string::npos) return;
                amp = start + 1;
            }
        }
    }
    cout << url_query_string;
}
#endif
731
732 class Fields {
733 mutable Xapian::docid did_cached;
734 mutable map<string, string> fields;
735
736 void read_fields(Xapian::docid did) const;
737
738 public:
Fields()739 Fields() : did_cached(0) { }
740
get_field(Xapian::docid did,const string & field) const741 const string & get_field(Xapian::docid did, const string & field) const {
742 if (did != did_cached) read_fields(did);
743 return fields[field];
744 }
745 };
746
747 void
read_fields(Xapian::docid did) const748 Fields::read_fields(Xapian::docid did) const
749 {
750 fields.clear();
751 did_cached = did;
752 const string & data = db.get_document(did).get_data();
753
754 // Parse document data.
755 string::size_type i = 0;
756 const string & names = option["fieldnames"];
757 if (!names.empty()) {
758 // Each line is a field, with fieldnames taken from corresponding
759 // entries in the tab-separated list specified by $opt{fieldnames}.
760 string::size_type n = 0;
761 do {
762 string::size_type n0 = n;
763 n = names.find('\t', n);
764 string::size_type i0 = i;
765 i = data.find('\n', i);
766 fields.insert(make_pair(names.substr(n0, n - n0),
767 data.substr(i0, i - i0)));
768 } while (++n && ++i);
769 } else {
770 // Each line is a field, in the format NAME=VALUE. We assume the field
771 // name doesn't contain an "=". Lines without an "=" are currently
772 // just ignored.
773 do {
774 string::size_type i0 = i;
775 i = data.find('\n', i);
776 string line = data.substr(i0, i - i0);
777 string::size_type j = line.find('=');
778 if (j != string::npos) {
779 string & value = fields[line.substr(0, j)];
780 if (!value.empty()) value += '\t';
781 value.append(line, j + 1, string::npos);
782 }
783 } while (++i);
784 }
785 }
786
787 static Fields fields;
788 static Xapian::docid q0;
789 static Xapian::doccount hit_no;
790 static int percent;
791 static Xapian::weight weight;
792 static Xapian::doccount collapsed;
793
794 static string print_caption(const string &fmt, const vector<string> ¶m);
795
// Tags identifying the OmegaScript commands.  For each command "foo" listed
// in func_tab there is a tag CMD_foo (the T() macro forms the tag name by
// token pasting), so the two need to be kept in step.
enum tagval {
    CMD_, // the empty command name
    CMD_add,
    CMD_addfilter,
    CMD_allterms,
    CMD_and,
    CMD_cgi,
    CMD_cgilist,
    CMD_collapsed,
    CMD_date,
    CMD_dbname,
    CMD_dbsize,
    CMD_def,
    CMD_defaultop,
    CMD_div,
    CMD_eq,
    CMD_emptydocs,
    CMD_env,
    CMD_error,
    CMD_field,
    CMD_filesize,
    CMD_filters,
    CMD_filterterms,
    CMD_find,
    CMD_fmt,
    CMD_freq,
    CMD_ge,
    CMD_gt,
    CMD_highlight,
    CMD_hit,
    CMD_hitlist,
    CMD_hitsperpage,
    CMD_hostname,
    CMD_html,
    CMD_htmlstrip,
    CMD_httpheader,
    CMD_id,
    CMD_if,
    CMD_include,
    CMD_last,
    CMD_lastpage,
    CMD_le,
    CMD_length,
    CMD_list,
    CMD_log,
    CMD_lookup,
    CMD_lower,
    CMD_lt,
    CMD_map,
    CMD_max,
    CMD_min,
    CMD_mod,
    CMD_msize,
    CMD_msizeexact,
    CMD_mul,
    CMD_muldiv,
    CMD_ne,
    CMD_nice,
    CMD_not,
    CMD_now,
    CMD_opt,
    CMD_or,
    CMD_pack,
    CMD_percentage,
    CMD_prettyterm,
    CMD_prettyurl,
    CMD_query,
    CMD_querydescription,
    CMD_queryterms,
    CMD_range,
    CMD_record,
    CMD_relevant,
    CMD_relevants,
    CMD_score,
    CMD_set,
    CMD_setmap,
    CMD_setrelevant,
    CMD_slice,
    CMD_split,
    CMD_stoplist,
    CMD_sub,
    CMD_substr,
    CMD_suggestion,
    CMD_terms,
    CMD_thispage,
    CMD_time,
    CMD_topdoc,
    CMD_topterms,
    CMD_transform,
    CMD_uniq,
    CMD_unpack,
    CMD_unstem,
    CMD_upper,
    CMD_url,
    CMD_value,
    CMD_version,
    CMD_weight,
    CMD_MACRO // special tag for macro evaluation
};
895
// Attributes of an OmegaScript command (one per entry of func_tab).
struct func_attrib {
    // The command's CMD_* tag.
    int tag;
    // Argument counts, in the column order documented at func_tab.
    // NOTE(review): presumably -1 means "no limit" for maxargs and "all" for
    // evalargs - confirm against the code in eval() which uses these.
    int minargs, maxargs, evalargs;
    // 0, 'M' or 'Q'.
    // NOTE(review): presumably 'M' requests ensure_match() and 'Q' requests
    // ensure_query_parsed() before the command is run - confirm in eval().
    char ensure;
};

// Build a func_tab entry for command F: its name as a string plus its
// attributes, with the tag formed as CMD_<F>.
#define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
// A command's name together with its attributes.
struct func_desc {
    const char *name;
    struct func_attrib a;
};
907
// Shorthand used in the table below: N is -1, while M and Q are the values
// for the "ensure" column.
#define N -1
#define M 'M'
#define Q 'Q'
// Table of the OmegaScript commands, terminated by an entry with a NULL
// name.  eval() loads it into a map keyed by command name on first use.
//
// NB when adding a new command which ensures M or Q, update the list in
// docs/omegascript.rst
static struct func_desc func_tab[] = {
    //name minargs maxargs evalargs ensure
    {"",{CMD_, N, N, 0, 0}},// commented out code
    T(add, 0, N, N, 0), // add a list of numbers
    T(addfilter, 1, 1, N, 0), // add filter term
    T(allterms, 0, 1, N, 0), // list of all terms matching document
    T(and, 1, N, 0, 0), // logical shortcutting and of a list of values
    T(cgi, 1, 1, N, 0), // return cgi parameter value
    T(cgilist, 1, 1, N, 0), // return list of values for cgi parameter
    T(collapsed, 0, 0, N, 0), // return number of hits collapsed into this
    T(date, 1, 2, N, 0), // convert time_t to strftime format
    // (default: YYYY-MM-DD)
    T(dbname, 0, 0, N, 0), // database name
    T(dbsize, 0, 0, N, 0), // database size (# of documents)
    T(def, 2, 2, 1, 0), // define a macro
    T(defaultop, 0, 0, N, 0), // default operator: "and" or "or"
    T(div, 2, 2, N, 0), // integer divide
    T(emptydocs, 0, 1, N, 0), // list of empty documents
    T(env, 1, 1, N, 0), // environment variable
    T(error, 0, 0, N, 0), // error message
    T(eq, 2, 2, N, 0), // test equality
    T(field, 1, 2, N, 0), // lookup field in record
    T(filesize, 1, 1, N, 0), // pretty printed filesize
    T(filters, 0, 0, N, 0), // serialisation of current filters
    T(filterterms, 1, 1, N, 0), // list of terms with a given prefix
    T(find, 2, 2, N, 0), // find entry in list
    T(fmt, 0, 0, N, 0), // name of current format
    T(freq, 1, 1, N, 0), // frequency of a term
    T(ge, 2, 2, N, 0), // test >=
    T(gt, 2, 2, N, 0), // test >
    T(highlight, 2, 4, N, 0), // html escape and highlight words from list
    T(hit, 0, 0, N, 0), // hit number of current mset entry (0-based)
    T(hitlist, 1, 1, 0, M), // display hitlist using format in argument
    T(hitsperpage, 0, 0, N, 0), // hits per page
    T(hostname, 1, 1, N, 0), // extract hostname from URL
    T(html, 1, 1, N, 0), // html escape string (<>&")
    T(htmlstrip, 1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
    T(httpheader, 2, 2, N, 0), // arbitrary HTTP header
    T(id, 0, 0, N, 0), // docid of current doc
    T(if, 2, 3, 1, 0), // conditional
    T(include, 1, 1, 1, 0), // include another file
    T(last, 0, 0, N, M), // hit number one beyond end of current page
    T(lastpage, 0, 0, N, M), // number of last hit page
    T(le, 2, 2, N, 0), // test <=
    T(length, 1, 1, N, 0), // length of list
    T(list, 2, 5, N, 0), // pretty print list
    T(log, 1, 2, 1, 0), // create a log entry
    T(lookup, 2, 2, N, 0), // lookup in named cdb file
    T(lower, 1, 1, N, 0), // convert string to lower case
    T(lt, 2, 2, N, 0), // test <
    T(map, 1, 2, 1, 0), // map a list into another list
    T(max, 1, N, N, 0), // maximum of a list of values
    T(min, 1, N, N, 0), // minimum of a list of values
    T(mod, 2, 2, N, 0), // integer modulus
    T(msize, 0, 0, N, M), // number of matches
    T(msizeexact, 0, 0, N, M), // is $msize exact?
    T(mul, 2, N, N, 0), // multiply a list of numbers
    T(muldiv, 3, 3, N, 0), // calculate A*B/C
    T(ne, 2, 2, N, 0), // test not equal
    T(nice, 1, 1, N, 0), // pretty print integer (with thousands sep)
    T(not, 1, 1, N, 0), // logical not
    T(now, 0, 0, N, 0), // current date/time as a time_t
    T(opt, 1, 2, N, 0), // lookup an option value
    T(or, 1, N, 0, 0), // logical shortcutting or of a list of values
    T(pack, 1, 1, N, 0), // convert a number to a 4 byte big endian binary string
    T(percentage, 0, 0, N, 0), // percentage score of current hit
    T(prettyterm, 1, 1, N, Q), // pretty print term name
    T(prettyurl, 1, 1, N, 0), // pretty version of URL
    T(query, 0, 0, N, Q), // query
    T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
    T(queryterms, 0, 0, N, Q), // list of query terms
    T(range, 2, 2, N, 0), // return list of values between start and end
    T(record, 0, 1, N, 0), // record contents of document
    T(relevant, 0, 1, N, Q), // is document relevant?
    T(relevants, 0, 0, N, Q), // return list of relevant documents
    T(score, 0, 0, N, 0), // score (0-10) of current hit
    T(set, 2, 2, N, 0), // set option value
    T(setmap, 1, N, N, 0), // set map of option values
    T(setrelevant, 0, 1, N, Q), // set rset
    T(slice, 2, 2, N, 0), // slice a list using a second list
    T(split, 1, 2, N, 0), // split a string to give a list
    T(stoplist, 0, 0, N, Q), // return list of stopped terms
    T(sub, 2, 2, N, 0), // subtract
    T(substr, 2, 3, N, 0), // substring
    T(suggestion, 0, 0, N, Q), // misspelled word correction suggestion
    T(terms, 0, 0, N, M), // list of matching terms
    T(thispage, 0, 0, N, M), // page number of current page
    T(time, 0, 0, N, M), // how long the match took (in seconds)
    T(topdoc, 0, 0, N, M), // first document on current page of hit list
    // (counting from 0)
    T(topterms, 0, 1, N, M), // list of up to N top relevance feedback terms
    // (default 16)
    T(transform, 3, 3, N, 0), // transform with a regexp
    T(uniq, 1, 1, N, 0), // remove duplicates from a sorted list
    T(unpack, 1, 1, N, 0), // convert 4 byte big endian binary string to a number
    T(unstem, 1, 1, N, Q), // return list of probabilistic terms from
    // the query which stemmed to this term
    T(upper, 1, 1, N, 0), // convert string to upper case
    T(url, 1, 1, N, 0), // url encode argument
    T(value, 1, 2, N, 0), // return document value
    T(version, 0, 0, N, 0), // omega version string
    T(weight, 0, 0, N, 0), // weight of the current hit
    { NULL,{0, 0, 0, 0, 0}}
};

#undef T // Leaving T defined screws up Sun's C++ compiler!

// NOTE(review): presumably holds the bodies of macros defined with $def,
// for use when evaluating CMD_MACRO - confirm against eval().
static vector<string> macros;
1021
1022 // Call write() repeatedly until all data is written or we get a
1023 // non-recoverable error.
1024 static ssize_t
write_all(int fd,const char * buf,size_t count)1025 write_all(int fd, const char * buf, size_t count)
1026 {
1027 while (count) {
1028 ssize_t r = write(fd, buf, count);
1029 if (rare(r < 0)) {
1030 if (errno == EINTR) continue;
1031 return r;
1032 }
1033 buf += r;
1034 count -= r;
1035 }
1036 return 0;
1037 }
1038
1039 static string
eval(const string & fmt,const vector<string> & param)1040 eval(const string &fmt, const vector<string> ¶m)
1041 {
1042 static map<string, const struct func_attrib *> func_map;
1043 if (func_map.empty()) {
1044 struct func_desc *p;
1045 for (p = func_tab; p->name != NULL; p++) {
1046 func_map[string(p->name)] = &(p->a);
1047 }
1048 }
1049 string res;
1050 string::size_type p = 0, q;
1051 while ((q = fmt.find('$', p)) != string::npos) try {
1052 res.append(fmt, p, q - p);
1053 string::size_type code_start = q; // note down for error reporting
1054 q++;
1055 if (q >= fmt.size()) break;
1056 unsigned char ch = fmt[q];
1057 switch (ch) {
1058 // Magic sequences:
1059 // `$$' -> `$', `$(' -> `{', `$)' -> `}', `$.' -> `,'
1060 case '$':
1061 res += '$';
1062 p = q + 1;
1063 continue;
1064 case '(':
1065 res += '{';
1066 p = q + 1;
1067 continue;
1068 case ')':
1069 res += '}';
1070 p = q + 1;
1071 continue;
1072 case '.':
1073 res += ',';
1074 p = q + 1;
1075 continue;
1076 case '_':
1077 ch = '0';
1078 // FALL THRU
1079 case '1': case '2': case '3': case '4': case '5':
1080 case '6': case '7': case '8': case '9':
1081 ch -= '0';
1082 if (ch < param.size()) res += param[ch];
1083 p = q + 1;
1084 continue;
1085 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1086 case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1087 case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1088 case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1089 case 'y': case 'z':
1090 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1091 case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1092 case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1093 case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1094 case 'Y': case 'Z':
1095 case '{':
1096 break;
1097 default:
1098 string msg = "Unknown $ code in: $" + fmt.substr(q);
1099 throw msg;
1100 }
1101 p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1102 string var = fmt.substr(q, p - q);
1103 map<string, const struct func_attrib *>::const_iterator func;
1104 func = func_map.find(var);
1105 if (func == func_map.end()) {
1106 throw "Unknown function `" + var + "'";
1107 }
1108 vector<string> args;
1109 if (fmt[p] == '{') {
1110 q = p + 1;
1111 int nest = 1;
1112 while (true) {
1113 p = fmt.find_first_of(",{}", p + 1);
1114 if (p == string::npos)
1115 throw "missing } in " + fmt.substr(code_start);
1116 if (fmt[p] == '{') {
1117 ++nest;
1118 } else {
1119 if (nest == 1) {
1120 // should we split the args
1121 if (func->second->minargs != N) {
1122 args.push_back(fmt.substr(q, p - q));
1123 q = p + 1;
1124 }
1125 }
1126 if (fmt[p] == '}' && --nest == 0) break;
1127 }
1128 }
1129 if (func->second->minargs == N)
1130 args.push_back(fmt.substr(q, p - q));
1131 p++;
1132 }
1133
1134 if (func->second->minargs != N) {
1135 if ((int)args.size() < func->second->minargs)
1136 throw "too few arguments to $" + var;
1137 if (func->second->maxargs != N &&
1138 (int)args.size() > func->second->maxargs)
1139 throw "too many arguments to $" + var;
1140
1141 vector<string>::size_type n;
1142 if (func->second->evalargs != N)
1143 n = func->second->evalargs;
1144 else
1145 n = args.size();
1146
1147 for (vector<string>::size_type j = 0; j < n; j++)
1148 args[j] = eval(args[j], param);
1149 }
1150 if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1151 ensure_query_parsed();
1152 if (func->second->ensure == 'M') ensure_match();
1153 string value;
1154 switch (func->second->tag) {
1155 case CMD_:
1156 break;
1157 case CMD_add: {
1158 int total = 0;
1159 vector<string>::const_iterator i;
1160 for (i = args.begin(); i != args.end(); i++)
1161 total += string_to_int(*i);
1162 value = str(total);
1163 break;
1164 }
1165 case CMD_addfilter:
1166 add_bterm(args[0]);
1167 break;
1168 case CMD_allterms: {
1169 // list of all terms indexing document
1170 int id = q0;
1171 if (!args.empty()) id = string_to_int(args[0]);
1172 Xapian::TermIterator term = db.termlist_begin(id);
1173 for ( ; term != db.termlist_end(id); term++) {
1174 value += *term;
1175 value += '\t';
1176 }
1177
1178 if (!value.empty()) value.erase(value.size() - 1);
1179 break;
1180 }
1181 case CMD_and: {
1182 value = "true";
1183 for (vector<string>::const_iterator i = args.begin();
1184 i != args.end(); i++) {
1185 if (eval(*i, param).empty()) {
1186 value.resize(0);
1187 break;
1188 }
1189 }
1190 break;
1191 }
1192 case CMD_cgi: {
1193 MCI i = cgi_params.find(args[0]);
1194 if (i != cgi_params.end()) value = i->second;
1195 break;
1196 }
1197 case CMD_cgilist: {
1198 pair<MCI, MCI> g;
1199 g = cgi_params.equal_range(args[0]);
1200 for (MCI i = g.first; i != g.second; i++) {
1201 value += i->second;
1202 value += '\t';
1203 }
1204 if (!value.empty()) value.erase(value.size() - 1);
1205 break;
1206 }
1207 case CMD_collapsed: {
1208 value = str(collapsed);
1209 break;
1210 }
1211 case CMD_date:
1212 value = args[0];
1213 if (!value.empty()) {
1214 char buf[64] = "";
1215 time_t date = string_to_int(value);
1216 if (date != (time_t)-1) {
1217 struct tm *then;
1218 then = gmtime(&date);
1219 string date_fmt = "%Y-%m-%d";
1220 if (args.size() > 1) date_fmt = eval(args[1], param);
1221 strftime(buf, sizeof buf, date_fmt.c_str(), then);
1222 }
1223 value = buf;
1224 }
1225 break;
1226 case CMD_dbname:
1227 value = dbname;
1228 break;
1229 case CMD_dbsize: {
1230 static Xapian::doccount dbsize;
1231 if (!dbsize) dbsize = db.get_doccount();
1232 value = str(dbsize);
1233 break;
1234 }
1235 case CMD_def: {
1236 func_attrib *fa = new func_attrib;
1237 fa->tag = CMD_MACRO + macros.size();
1238 fa->minargs = 0;
1239 fa->maxargs = 9;
1240 fa->evalargs = N; // FIXME: or 0?
1241 fa->ensure = 0;
1242
1243 macros.push_back(args[1]);
1244 func_map[args[0]] = fa;
1245 break;
1246 }
1247 case CMD_defaultop:
1248 if (default_op == Xapian::Query::OP_AND) {
1249 value = "and";
1250 } else {
1251 value = "or";
1252 }
1253 break;
1254 case CMD_div: {
1255 int denom = string_to_int(args[1]);
1256 if (denom == 0) {
1257 value = "divide by 0";
1258 } else {
1259 value = str(string_to_int(args[0]) /
1260 string_to_int(args[1]));
1261 }
1262 break;
1263 }
1264 case CMD_eq:
1265 if (args[0] == args[1]) value = "true";
1266 break;
1267 case CMD_emptydocs: {
1268 string t;
1269 if (!args.empty())
1270 t = args[0];
1271 Xapian::PostingIterator i;
1272 for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1273 if (i.get_doclength() != 0) continue;
1274 if (!value.empty()) value += '\t';
1275 value += str(*i);
1276 }
1277 break;
1278 }
1279 case CMD_env: {
1280 char *env = getenv(args[0].c_str());
1281 if (env != NULL) value = env;
1282 break;
1283 }
1284 case CMD_error:
1285 if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1286 error_msg = "Database `" + dbname + "' couldn't be opened";
1287 }
1288 value = error_msg;
1289 break;
1290 case CMD_field: {
1291 Xapian::docid did = q0;
1292 if (args.size() > 1) did = string_to_int(args[1]);
1293 value = fields.get_field(did, args[0]);
1294 break;
1295 }
1296 case CMD_filesize: {
1297 // FIXME: rounding? i18n?
1298 int size = string_to_int(args[0]);
1299 int intpart = size;
1300 int fraction = -1;
1301 const char * format = 0;
1302 if (size < 0) {
1303 // Negative size -> empty result.
1304 } else if (size == 1) {
1305 format = "%d byte";
1306 } else if (size < 1024) {
1307 format = "%d bytes";
1308 } else {
1309 if (size < 1024*1024) {
1310 format = "%d.%cK";
1311 } else {
1312 size /= 1024;
1313 if (size < 1024*1024) {
1314 format = "%d.%cM";
1315 } else {
1316 size /= 1024;
1317 format = "%d.%cG";
1318 }
1319 }
1320 intpart = unsigned(size) / 1024;
1321 fraction = unsigned(size) % 1024;
1322 }
1323 if (format) {
1324 char buf[200];
1325 int len;
1326 if (fraction == -1) {
1327 len = my_snprintf(buf, sizeof(buf), format, intpart);
1328 } else {
1329 fraction = (fraction * 10 / 1024) + '0';
1330 len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1331 }
1332 if (len < 0 || (unsigned)len > sizeof(buf)) len = sizeof(buf);
1333 value.assign(buf, len);
1334 }
1335 break;
1336 }
1337 case CMD_filters:
1338 value = filters;
1339 break;
1340 case CMD_filterterms: {
1341 Xapian::TermIterator term = db.allterms_begin();
1342 term.skip_to(args[0]);
1343 while (term != db.allterms_end()) {
1344 string t = *term;
1345 if (!startswith(t, args[0])) break;
1346 value += t;
1347 value += '\t';
1348 ++term;
1349 }
1350
1351 if (!value.empty()) value.erase(value.size() - 1);
1352 break;
1353 }
1354 case CMD_find: {
1355 string l = args[0], s = args[1];
1356 string::size_type i = 0, j = 0;
1357 size_t count = 0;
1358 while (j != l.size()) {
1359 j = l.find('\t', i);
1360 if (j == string::npos) j = l.size();
1361 if (j - i == s.length()) {
1362 if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1363 value = str(count);
1364 break;
1365 }
1366 }
1367 ++count;
1368 i = j + 1;
1369 }
1370 break;
1371 }
1372 case CMD_fmt:
1373 value = fmtname;
1374 break;
1375 case CMD_freq:
1376 try {
1377 value = str(mset.get_termfreq(args[0]));
1378 } catch (const Xapian::InvalidOperationError&) {
1379 // An MSet will raise this error if it's empty and not
1380 // associated with a search.
1381 value = str(db.get_termfreq(args[0]));
1382 }
1383 break;
1384 case CMD_ge:
1385 if (string_to_int(args[0]) >= string_to_int(args[1]))
1386 value = "true";
1387 break;
1388 case CMD_gt:
1389 if (string_to_int(args[0]) > string_to_int(args[1]))
1390 value = "true";
1391 break;
1392 case CMD_highlight: {
1393 string bra, ket;
1394 if (args.size() > 2) {
1395 bra = args[2];
1396 if (args.size() > 3) {
1397 ket = args[3];
1398 } else {
1399 string::const_iterator i;
1400 i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1401 ket = "</";
1402 ket.append(bra, 1, i - bra.begin() - 1);
1403 ket += '>';
1404 }
1405 }
1406
1407 value = html_highlight(args[0], args[1], bra, ket);
1408 break;
1409 }
1410 case CMD_hit:
1411 // 0-based mset index
1412 value = str(hit_no);
1413 break;
1414 case CMD_hitlist:
1415 #if 0
1416 const char *q;
1417 int ch;
1418
1419 url_query_string = "?DB=";
1420 url_query_string += dbname;
1421 url_query_string += "&P=";
1422 q = query_string.c_str();
1423 while ((ch = *q++) != '\0') {
1424 switch (ch) {
1425 case '+':
1426 url_query_string += "%2b";
1427 break;
1428 case '"':
1429 url_query_string += "%22";
1430 break;
1431 case ' ':
1432 ch = '+';
1433 /* fall through */
1434 default:
1435 url_query_string += ch;
1436 }
1437 }
1438 // add any boolean terms
1439 for (FMCI i = filter_map.begin(); i != filter_map.end(); i++) {
1440 url_query_string += "&B=";
1441 url_query_string += i->second;
1442 }
1443 #endif
1444 for (hit_no = topdoc; hit_no < last; hit_no++)
1445 value += print_caption(args[0], param);
1446 hit_no = 0;
1447 break;
1448 case CMD_hitsperpage:
1449 value = str(hits_per_page);
1450 break;
1451 case CMD_hostname: {
1452 value = args[0];
1453 // remove URL scheme and/or path
1454 string::size_type i = value.find("://");
1455 if (i == string::npos) i = 0; else i += 3;
1456 value = value.substr(i, value.find('/', i) - i);
1457 // remove user@ or user:password@
1458 i = value.find('@');
1459 if (i != string::npos) value.erase(0, i + 1);
1460 // remove :port
1461 i = value.find(':');
1462 if (i != string::npos) value.resize(i);
1463 break;
1464 }
1465 case CMD_html:
1466 value = html_escape(args[0]);
1467 break;
1468 case CMD_htmlstrip:
1469 value = html_strip(args[0]);
1470 break;
1471 case CMD_httpheader:
1472 if (!suppress_http_headers) {
1473 cout << args[0] << ": " << args[1] << endl;
1474 if (!set_content_type && args[0].length() == 12 &&
1475 strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1476 set_content_type = true;
1477 }
1478 }
1479 break;
1480 case CMD_id:
1481 // document id
1482 value = str(q0);
1483 break;
1484 case CMD_if:
1485 if (!args[0].empty())
1486 value = eval(args[1], param);
1487 else if (args.size() > 2)
1488 value = eval(args[2], param);
1489 break;
1490 case CMD_include:
1491 value = eval_file(args[0]);
1492 break;
1493 case CMD_last:
1494 value = str(last);
1495 break;
1496 case CMD_lastpage: {
1497 int l = mset.get_matches_estimated();
1498 if (l > 0) l = (l - 1) / hits_per_page + 1;
1499 value = str(l);
1500 break;
1501 }
1502 case CMD_le:
1503 if (string_to_int(args[0]) <= string_to_int(args[1]))
1504 value = "true";
1505 break;
1506 case CMD_length:
1507 if (args[0].empty()) {
1508 value = "0";
1509 } else {
1510 size_t length = count(args[0].begin(), args[0].end(), '\t');
1511 value = str(length + 1);
1512 }
1513 break;
1514 case CMD_list: {
1515 if (!args[0].empty()) {
1516 string pre, inter, interlast, post;
1517 switch (args.size()) {
1518 case 2:
1519 inter = interlast = args[1];
1520 break;
1521 case 3:
1522 inter = args[1];
1523 interlast = args[2];
1524 break;
1525 case 4:
1526 pre = args[1];
1527 inter = interlast = args[2];
1528 post = args[3];
1529 break;
1530 case 5:
1531 pre = args[1];
1532 inter = args[2];
1533 interlast = args[3];
1534 post = args[4];
1535 break;
1536 }
1537 value += pre;
1538 string list = args[0];
1539 string::size_type split = 0, split2;
1540 while ((split2 = list.find('\t', split)) != string::npos) {
1541 if (split) value += inter;
1542 value.append(list, split, split2 - split);
1543 split = split2 + 1;
1544 }
1545 if (split) value += interlast;
1546 value.append(list, split, string::npos);
1547 value += post;
1548 }
1549 break;
1550 }
1551 case CMD_log: {
1552 if (!vet_filename(args[0])) break;
1553 string logfile = log_dir + args[0];
1554 int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1555 if (fd == -1) break;
1556 vector<string> noargs;
1557 noargs.resize(1);
1558 string line;
1559 if (args.size() > 1) {
1560 line = args[1];
1561 } else {
1562 line = DEFAULT_LOG_ENTRY;
1563 }
1564 line = eval(line, noargs);
1565 line += '\n';
1566 (void)write_all(fd, line.data(), line.length());
1567 close(fd);
1568 break;
1569 }
1570 case CMD_lookup: {
1571 if (!vet_filename(args[0])) break;
1572 string cdbfile = cdb_dir + args[0];
1573 int fd = open(cdbfile.c_str(), O_RDONLY);
1574 if (fd == -1) break;
1575
1576 struct cdb cdb;
1577 cdb_init(&cdb, fd);
1578
1579 if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1580 size_t datalen = cdb_datalen(&cdb);
1581 const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1582 if (q) {
1583 value.assign(static_cast<const char *>(dat), datalen);
1584 }
1585 }
1586
1587 cdb_free(&cdb);
1588 close(fd); // FIXME: cache fds?
1589 break;
1590 }
1591 case CMD_lower:
1592 value = Xapian::Unicode::tolower(args[0]);
1593 break;
1594 case CMD_lt:
1595 if (string_to_int(args[0]) < string_to_int(args[1]))
1596 value = "true";
1597 break;
1598 case CMD_map:
1599 if (!args[0].empty()) {
1600 string l = args[0], pat = args[1];
1601 vector<string> new_args(param);
1602 string::size_type i = 0, j;
1603 while (true) {
1604 j = l.find('\t', i);
1605 new_args[0] = l.substr(i, j - i);
1606 value += eval(pat, new_args);
1607 if (j == string::npos) break;
1608 value += '\t';
1609 i = j + 1;
1610 }
1611 }
1612 break;
1613 case CMD_max: {
1614 vector<string>::const_iterator i = args.begin();
1615 int val = string_to_int(*i++);
1616 for (; i != args.end(); i++) {
1617 int x = string_to_int(*i);
1618 if (x > val) val = x;
1619 }
1620 value = str(val);
1621 break;
1622 }
1623 case CMD_min: {
1624 vector<string>::const_iterator i = args.begin();
1625 int val = string_to_int(*i++);
1626 for (; i != args.end(); i++) {
1627 int x = string_to_int(*i);
1628 if (x < val) val = x;
1629 }
1630 value = str(val);
1631 break;
1632 }
1633 case CMD_msize:
1634 // number of matches
1635 value = str(mset.get_matches_estimated());
1636 break;
1637 case CMD_msizeexact:
1638 // is msize exact?
1639 if (mset.get_matches_lower_bound()
1640 == mset.get_matches_upper_bound())
1641 value = "true";
1642 break;
1643 case CMD_mod: {
1644 int denom = string_to_int(args[1]);
1645 if (denom == 0) {
1646 value = "divide by 0";
1647 } else {
1648 value = str(string_to_int(args[0]) %
1649 string_to_int(args[1]));
1650 }
1651 break;
1652 }
1653 case CMD_mul: {
1654 vector<string>::const_iterator i = args.begin();
1655 int total = string_to_int(*i++);
1656 while (i != args.end())
1657 total *= string_to_int(*i++);
1658 value = str(total);
1659 break;
1660 }
1661 case CMD_muldiv: {
1662 int denom = string_to_int(args[2]);
1663 if (denom == 0) {
1664 value = "divide by 0";
1665 } else {
1666 int num = string_to_int(args[0]) * string_to_int(args[1]);
1667 value = str(num / denom);
1668 }
1669 break;
1670 }
1671 case CMD_ne:
1672 if (args[0] != args[1]) value = "true";
1673 break;
1674 case CMD_nice: {
1675 string::const_iterator i = args[0].begin();
1676 int len = args[0].length();
1677 while (len) {
1678 value += *i++;
1679 if (--len && len % 3 == 0) value += option["thousand"];
1680 }
1681 break;
1682 }
1683 case CMD_not:
1684 if (args[0].empty()) value = "true";
1685 break;
1686 case CMD_now: {
1687 char buf[64];
1688 my_snprintf(buf, sizeof(buf), "%lu", (unsigned long)time(NULL));
1689 // MSVC's snprintf omits the zero byte if the string if
1690 // sizeof(buf) long.
1691 buf[sizeof(buf) - 1] = '\0';
1692 value = buf;
1693 break;
1694 }
1695 case CMD_opt:
1696 if (args.size() == 2) {
1697 value = option[args[0] + "," + args[1]];
1698 } else {
1699 value = option[args[0]];
1700 }
1701 break;
1702 case CMD_or: {
1703 for (vector<string>::const_iterator i = args.begin();
1704 i != args.end(); i++) {
1705 value = eval(*i, param);
1706 if (!value.empty()) break;
1707 }
1708 break;
1709 }
1710 case CMD_pack:
1711 value = int_to_binary_string(string_to_int(args[0]));
1712 break;
1713 case CMD_percentage:
1714 // percentage score
1715 value = str(percent);
1716 break;
1717 case CMD_prettyterm:
1718 value = pretty_term(args[0]);
1719 break;
1720 case CMD_prettyurl:
1721 value = args[0];
1722 url_prettify(value);
1723 break;
1724 case CMD_query:
1725 value = query_string;
1726 break;
1727 case CMD_querydescription:
1728 value = query.get_description();
1729 break;
1730 case CMD_queryterms:
1731 value = queryterms;
1732 break;
1733 case CMD_range: {
1734 int start = string_to_int(args[0]);
1735 int end = string_to_int(args[1]);
1736 while (start <= end) {
1737 value += str(start);
1738 if (start < end) value += '\t';
1739 start++;
1740 }
1741 break;
1742 }
1743 case CMD_record: {
1744 int id = q0;
1745 if (!args.empty()) id = string_to_int(args[0]);
1746 value = db.get_document(id).get_data();
1747 break;
1748 }
1749 case CMD_relevant: {
1750 // document id if relevant; empty otherwise
1751 int id = q0;
1752 if (!args.empty()) id = string_to_int(args[0]);
1753 map<Xapian::docid, bool>::iterator i = ticked.find(id);
1754 if (i != ticked.end()) {
1755 i->second = false; // icky side-effect
1756 value = str(id);
1757 }
1758 break;
1759 }
1760 case CMD_relevants: {
1761 for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
1762 i != ticked.end(); i++) {
1763 if (i->second) {
1764 value += str(i->first);
1765 value += '\t';
1766 }
1767 }
1768 if (!value.empty()) value.erase(value.size() - 1);
1769 break;
1770 }
1771 case CMD_score:
1772 // Score (0 to 10)
1773 value = str(percent / 10);
1774 break;
1775 case CMD_set:
1776 option[args[0]] = args[1];
1777 break;
1778 case CMD_setmap: {
1779 string base = args[0] + ',';
1780 if (args.size() % 2 != 1)
1781 throw string("$setmap requires an odd number of arguments");
1782 for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
1783 option[base + args[i]] = args[i + 1];
1784 }
1785 break;
1786 }
1787 case CMD_setrelevant: {
1788 string::size_type i = 0, j;
1789 while (true) {
1790 j = args[0].find_first_not_of("0123456789", i);
1791 Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
1792 if (id) {
1793 rset.add_document(id);
1794 ticked[id] = true;
1795 }
1796 if (j == string::npos) break;
1797 i = j + 1;
1798 }
1799 break;
1800 }
1801 case CMD_slice: {
1802 string list = args[0], pos = args[1];
1803 vector<string> items;
1804 string::size_type i = 0, j;
1805 while (true) {
1806 j = list.find('\t', i);
1807 items.push_back(list.substr(i, j - i));
1808 if (j == string::npos) break;
1809 i = j + 1;
1810 }
1811 i = 0;
1812 bool have_added = false;
1813 while (true) {
1814 j = pos.find('\t', i);
1815 int item = string_to_int(pos.substr(i, j - i));
1816 if (item >= 0 && size_t(item) < items.size()) {
1817 if (have_added) value += '\t';
1818 value += items[item];
1819 have_added = true;
1820 }
1821 if (j == string::npos) break;
1822 i = j + 1;
1823 }
1824 break;
1825 }
1826 case CMD_split: {
1827 string split;
1828 if (args.size() == 1) {
1829 split = " ";
1830 value = args[0];
1831 } else {
1832 split = args[0];
1833 value = args[1];
1834 }
1835 string::size_type i = 0;
1836 while (true) {
1837 if (split.empty()) {
1838 ++i;
1839 if (i >= value.size()) break;
1840 } else {
1841 i = value.find(split, i);
1842 if (i == string::npos) break;
1843 }
1844 value.replace(i, split.size(), 1, '\t');
1845 ++i;
1846 }
1847 break;
1848 }
1849 case CMD_stoplist: {
1850 Xapian::TermIterator i = qp.stoplist_begin();
1851 Xapian::TermIterator end = qp.stoplist_end();
1852 while (i != end) {
1853 if (!value.empty()) value += '\t';
1854 value += *i;
1855 ++i;
1856 }
1857 break;
1858 }
1859 case CMD_sub:
1860 value = str(string_to_int(args[0]) - string_to_int(args[1]));
1861 break;
1862 case CMD_substr: {
1863 int start = string_to_int(args[1]);
1864 if (start < 0) {
1865 if (static_cast<size_t>(-start) >= args[0].size()) {
1866 start = 0;
1867 } else {
1868 start = static_cast<int>(args[0].size()) + start;
1869 }
1870 } else {
1871 if (static_cast<size_t>(start) >= args[0].size()) break;
1872 }
1873 size_t len = string::npos;
1874 if (args.size() > 2) {
1875 int int_len = string_to_int(args[2]);
1876 if (int_len >= 0) {
1877 len = size_t(int_len);
1878 } else {
1879 len = args[0].size() - start;
1880 if (static_cast<size_t>(-int_len) >= len) {
1881 len = 0;
1882 } else {
1883 len -= static_cast<size_t>(-int_len);
1884 }
1885 }
1886 }
1887 value = args[0].substr(start, len);
1888 break;
1889 }
1890 case CMD_suggestion:
1891 value = qp.get_corrected_query_string();
1892 break;
1893 case CMD_terms:
1894 if (enquire) {
1895 // list of matching terms
1896 Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
1897 while (term != enquire->get_matching_terms_end(q0)) {
1898 // check term was in the typed query so we ignore
1899 // boolean filter terms
1900 if (termset.find(*term) != termset.end()) {
1901 value += *term;
1902 value += '\t';
1903 }
1904 ++term;
1905 }
1906
1907 if (!value.empty()) value.erase(value.size() - 1);
1908 }
1909 break;
1910 case CMD_thispage:
1911 value = str(topdoc / hits_per_page + 1);
1912 break;
1913 case CMD_time:
1914 if (secs >= 0) {
1915 char buf[64];
1916 my_snprintf(buf, sizeof(buf), "%.6f", secs);
1917 // MSVC's snprintf omits the zero byte if the string if
1918 // sizeof(buf) long.
1919 buf[sizeof(buf) - 1] = '\0';
1920 value = buf;
1921 }
1922 break;
1923 case CMD_topdoc:
1924 // first document on current page of hit list (counting from 0)
1925 value = str(topdoc);
1926 break;
1927 case CMD_topterms:
1928 if (enquire) {
1929 int howmany = 16;
1930 if (!args.empty()) howmany = string_to_int(args[0]);
1931 if (howmany < 0) howmany = 0;
1932
1933 // List of expand terms
1934 Xapian::ESet eset;
1935 OmegaExpandDecider decider(db, &termset);
1936
1937 if (!rset.empty()) {
1938 set_expansion_scheme(*enquire, option);
1939 eset = enquire->get_eset(howmany * 2, rset, 0,
1940 expand_param_k, &decider);
1941 } else if (mset.size()) {
1942 // invent an rset
1943 Xapian::RSet tmp;
1944
1945 int c = 5;
1946 // FIXME: what if mset does not start at first match?
1947 Xapian::MSetIterator m = mset.begin();
1948 for ( ; m != mset.end(); ++m) {
1949 tmp.add_document(*m);
1950 if (--c == 0) break;
1951 }
1952
1953 set_expansion_scheme(*enquire, option);
1954 eset = enquire->get_eset(howmany * 2, tmp, 0,
1955 expand_param_k, &decider);
1956 }
1957
1958 // Don't show more than one word with the same stem.
1959 set<string> stems;
1960 Xapian::ESetIterator i;
1961 for (i = eset.begin(); i != eset.end(); ++i) {
1962 string term(*i);
1963 string stem = (*stemmer)(term);
1964 if (stems.find(stem) != stems.end()) continue;
1965 stems.insert(stem);
1966 value += term;
1967 value += '\t';
1968 if (--howmany == 0) break;
1969 }
1970 if (!value.empty()) value.erase(value.size() - 1);
1971 }
1972 break;
1973 case CMD_transform:
1974 omegascript_transform(value, args);
1975 break;
1976 case CMD_uniq: {
1977 const string &list = args[0];
1978 if (list.empty()) break;
1979 string::size_type split = 0, split2;
1980 string prev;
1981 do {
1982 split2 = list.find('\t', split);
1983 string item = list.substr(split, split2 - split);
1984 if (split == 0) {
1985 value = item;
1986 } else if (item != prev) {
1987 value += '\t';
1988 value += item;
1989 }
1990 prev = item;
1991 split = split2 + 1;
1992 } while (split2 != string::npos);
1993 break;
1994 }
1995 case CMD_unpack:
1996 value = str(binary_string_to_int(args[0]));
1997 break;
1998 case CMD_unstem: {
1999 const string &term = args[0];
2000 Xapian::TermIterator i = qp.unstem_begin(term);
2001 Xapian::TermIterator end = qp.unstem_end(term);
2002 while (i != end) {
2003 if (!value.empty()) value += '\t';
2004 value += *i;
2005 ++i;
2006 }
2007 break;
2008 }
2009 case CMD_upper:
2010 value = Xapian::Unicode::toupper(args[0]);
2011 break;
2012 case CMD_url:
2013 url_encode(value, args[0]);
2014 break;
2015 case CMD_value: {
2016 Xapian::docid id = q0;
2017 Xapian::valueno value_no = string_to_int(args[0]);
2018 if (args.size() > 1) id = string_to_int(args[1]);
2019 value = db.get_document(id).get_value(value_no);
2020 break;
2021 }
2022 case CMD_version:
2023 value = PACKAGE_STRING;
2024 break;
2025 case CMD_weight:
2026 value = double_to_string(weight);
2027 break;
2028 default: {
2029 args.insert(args.begin(), param[0]);
2030 int macro_no = func->second->tag - CMD_MACRO;
2031 assert(macro_no >= 0 && (unsigned int)macro_no < macros.size());
2032 // throw "Unknown function `" + var + "'";
2033 value = eval(macros[macro_no], args);
2034 break;
2035 }
2036 }
2037 res += value;
2038 } catch (const Xapian::Error & e) {
2039 // FIXME: this means we only see the most recent error in $error
2040 // - is that the best approach?
2041 error_msg = e.get_msg();
2042 }
2043
2044 res.append(fmt, p, string::npos);
2045 return res;
2046 }
2047
2048 static string
eval_file(const string & fmtfile)2049 eval_file(const string &fmtfile)
2050 {
2051 string err;
2052 if (vet_filename(fmtfile)) {
2053 string file = template_dir + fmtfile;
2054 string fmt;
2055 if (load_file(file, fmt)) {
2056 vector<string> noargs;
2057 noargs.resize(1);
2058 return eval(fmt, noargs);
2059 }
2060 err = strerror(errno);
2061 } else {
2062 err = "name contains `..'";
2063 }
2064
2065 // FIXME: report why!
2066 string msg = string("Couldn't read format template `") + fmtfile + '\'';
2067 if (!err.empty()) msg += " (" + err + ')';
2068 throw msg;
2069 }
2070
2071 extern string
pretty_term(string term)2072 pretty_term(string term)
2073 {
2074 // Just leave empty strings and single characters alone.
2075 if (term.length() <= 1) return term;
2076
2077 // Assume unprefixed terms are unstemmed.
2078 if (!C_isupper(term[0])) return term;
2079
2080 // FIXME: keep this for now in case people are still generating 'R' terms?
2081 // But if we assumed unprefixed terms are unstemmed, what use is this?
2082 if (term[0] == 'R') {
2083 term.erase(0, 1);
2084 term[0] = C_toupper(term[0]);
2085 return term;
2086 }
2087
2088 // Handle stemmed terms.
2089 bool stemmed = (term[0] == 'Z');
2090 if (stemmed) {
2091 // First of all, check if a term in the query stemmed to this one.
2092 Xapian::TermIterator u = qp.unstem_begin(term);
2093 // There might be multiple words with the same stem, but we only want
2094 // one so just take the first.
2095 if (u != qp.unstem_end(term)) return *u;
2096
2097 // Remove the 'Z'.
2098 term.erase(0, 1);
2099 }
2100
2101 bool add_quotes = false;
2102
2103 // Check if the term has a prefix.
2104 if (C_isupper(term[0])) {
2105 // See if we have this prefix in the termprefix_to_userprefix map. If
2106 // so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2107 string prefix;
2108 size_t prefix_len = prefix_from_term(prefix, term);
2109
2110 map<string, string>::const_iterator i;
2111 i = termprefix_to_userprefix.find(prefix);
2112 if (i != termprefix_to_userprefix.end()) {
2113 string user_prefix = i->second;
2114 user_prefix += ':';
2115 term.replace(0, prefix_len, user_prefix);
2116 } else {
2117 // We don't have a prefix mapping for this, so just set a flag to
2118 // add quotes around the term.
2119 add_quotes = true;
2120 }
2121 }
2122
2123 if (stemmed) term += '.';
2124
2125 if (add_quotes) {
2126 term.insert(0, "\"");
2127 term.append("\"");
2128 }
2129
2130 return term;
2131 }
2132
2133 static string
print_caption(const string & fmt,const vector<string> & param)2134 print_caption(const string &fmt, const vector<string> ¶m)
2135 {
2136 q0 = *(mset[hit_no]);
2137
2138 weight = mset[hit_no].get_weight();
2139 percent = mset.convert_to_percent(mset[hit_no]);
2140 collapsed = mset[hit_no].get_collapse_count();
2141
2142 return eval(fmt, param);
2143 }
2144
2145 void
parse_omegascript()2146 parse_omegascript()
2147 {
2148 try {
2149 const char * p = getenv("SERVER_PROTOCOL");
2150 if (p && strcmp(p, "INCLUDED") == 0) {
2151 // We're being included in another page, so suppress headers.
2152 suppress_http_headers = true;
2153 }
2154
2155 string output = eval_file(fmtname);
2156 if (!set_content_type && !suppress_http_headers) {
2157 cout << "Content-Type: text/html" << endl;
2158 set_content_type = true;
2159 }
2160 if (!suppress_http_headers) cout << endl;
2161 cout << output;
2162 } catch (...) {
2163 // Ensure the headers have been output so that any exception gets
2164 // reported rather than giving a server error.
2165 if (!set_content_type && !suppress_http_headers) {
2166 cout << "Content-Type: text/html" << endl;
2167 set_content_type = true;
2168 }
2169 if (!suppress_http_headers) cout << endl;
2170 throw;
2171 }
2172 }
2173
2174 static void
ensure_query_parsed()2175 ensure_query_parsed()
2176 {
2177 if (query_parsed) return;
2178 query_parsed = true;
2179
2180 MCI val;
2181 pair<MCI, MCI> g;
2182
2183 // Should we discard the existing R-set recorded in R CGI parameters?
2184 bool discard_rset = true;
2185
2186 // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2187 // CGI parameters)?
2188 bool force_first_page = true;
2189
2190 string v;
2191 // get list of terms from previous iteration of query
2192 val = cgi_params.find("xP");
2193 if (val == cgi_params.end()) val = cgi_params.find("OLDP");
2194 if (val != cgi_params.end()) {
2195 v = val->second;
2196 } else {
2197 // if xP not given, default to keeping the rset and don't force page 1
2198 discard_rset = false;
2199 force_first_page = false;
2200 }
2201 querytype result = set_probabilistic(v);
2202 switch (result) {
2203 case BAD_QUERY:
2204 break;
2205 case NEW_QUERY:
2206 break;
2207 case SAME_QUERY:
2208 case EXTENDED_QUERY:
2209 // If we've changed database, force the first page of hits
2210 // and discard the R-set (since the docids will have changed)
2211 val = cgi_params.find("xDB");
2212 if (val != cgi_params.end() && val->second != dbname) break;
2213 if (result == SAME_QUERY && force_first_page) {
2214 val = cgi_params.find("xFILTERS");
2215 if (val != cgi_params.end() && val->second != filters) {
2216 // Filters have changed since last query.
2217 } else {
2218 force_first_page = false;
2219 }
2220 }
2221 discard_rset = false;
2222 break;
2223 }
2224
2225 if (!force_first_page) {
2226 // Work out which mset element is the first hit we want
2227 // to display
2228 val = cgi_params.find("TOPDOC");
2229 if (val != cgi_params.end()) {
2230 topdoc = atol(val->second.c_str());
2231 }
2232
2233 // Handle next, previous, and page links
2234 if (cgi_params.find(">") != cgi_params.end()) {
2235 topdoc += hits_per_page;
2236 } else if (cgi_params.find("<") != cgi_params.end()) {
2237 if (topdoc >= hits_per_page)
2238 topdoc -= hits_per_page;
2239 else
2240 topdoc = 0;
2241 } else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2242 (val = cgi_params.find("#")) != cgi_params.end()) {
2243 long page = atol(val->second.c_str());
2244 // Do something sensible for page 0 (we count pages from 1).
2245 if (page == 0) page = 1;
2246 topdoc = (page - 1) * hits_per_page;
2247 }
2248
2249 // raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2250 // Normally we snap TOPDOC like this so that things work nicely if
2251 // HITSPERPAGE is in a <select> or on radio buttons. If we're
2252 // postprocessing the output of omega and want variable sized pages,
2253 // this is unhelpful.
2254 bool raw_search = false;
2255 val = cgi_params.find("RAWSEARCH");
2256 if (val != cgi_params.end()) {
2257 raw_search = bool(atol(val->second.c_str()));
2258 }
2259
2260 if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2261 }
2262
2263 if (!discard_rset) {
2264 // put documents marked as relevant into the rset
2265 g = cgi_params.equal_range("R");
2266 for (MCI i = g.first; i != g.second; i++) {
2267 const string & value = i->second;
2268 for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2269 while (value[j] == '.') ++j;
2270 Xapian::docid d = atoi(value.c_str() + j);
2271 if (d) {
2272 rset.add_document(d);
2273 ticked[d] = true;
2274 }
2275 }
2276 }
2277 }
2278 }
2279
2280 // run query if we haven't already
2281 static void
ensure_match()2282 ensure_match()
2283 {
2284 if (done_query) return;
2285
2286 secs = RealTime::now();
2287 run_query();
2288 if (secs != -1)
2289 secs = RealTime::now() - secs;
2290
2291 done_query = true;
2292 last = mset.get_matches_lower_bound();
2293 if (last == 0) {
2294 // Otherwise topdoc ends up being -6 if it's non-zero!
2295 topdoc = 0;
2296 } else {
2297 if (topdoc >= last)
2298 topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2299 // last is the count of documents up to the end of the current page
2300 // (as returned by $last)
2301 if (topdoc + hits_per_page < last)
2302 last = topdoc + hits_per_page;
2303 }
2304 }
2305
2306 // OmegaExpandDecider methods.
2307
OmegaExpandDecider(const Xapian::Database & db_,set<string> * querytermset)2308 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2309 set<string> * querytermset)
2310 : db(db_)
2311 {
2312 // We'll want the stemmer for testing matches anyway.
2313 if (!stemmer)
2314 stemmer = new Xapian::Stem(option["stemmer"]);
2315 if (querytermset) {
2316 set<string>::const_iterator i;
2317 for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2318 string term(*i);
2319 if (term.empty()) continue;
2320
2321 unsigned char ch = term[0];
2322 bool stemmed = (ch == 'Z');
2323 if (stemmed) {
2324 term.erase(0, 1);
2325 if (term.empty()) continue;
2326 ch = term[0];
2327 }
2328
2329 if (C_isupper(ch)) {
2330 string prefix;
2331 size_t prefix_len = prefix_from_term(prefix, term);
2332 term.erase(0, prefix_len);
2333 }
2334
2335 if (!stemmed) term = (*stemmer)(term);
2336
2337 exclude_stems.insert(term);
2338 }
2339 }
2340 }
2341
2342 bool
operator ()(const string & term) const2343 OmegaExpandDecider::operator()(const string & term) const
2344 {
2345 unsigned char ch = term[0];
2346
2347 // Reject terms with a prefix.
2348 if (C_isupper(ch)) return false;
2349
2350 {
2351 MyStopper stopper;
2352 // Don't suggest stopwords.
2353 if (stopper(term)) return false;
2354 }
2355
2356 // Reject small numbers.
2357 if (term.size() < 4 && C_isdigit(ch)) return false;
2358
2359 // Reject terms containing a space.
2360 if (term.find(' ') != string::npos) return false;
2361
2362 // Skip terms with stems in the exclude_stems set, to avoid suggesting
2363 // terms which are already in the query in some form.
2364 string stem = (*stemmer)(term);
2365 if (exclude_stems.find(stem) != exclude_stems.end())
2366 return false;
2367
2368 // Ignore terms that only occur once (hapaxes) since they aren't
2369 // useful for finding related documents - they only occur in a
2370 // document that's already been marked as relevant.
2371 // FIXME: add an expand option to ignore terms where
2372 // termfreq == rtermfreq.
2373 if (db.get_termfreq(term) <= 1) return false;
2374
2375 return true;
2376 }
2377