1 /* query.cc: query executor for omega
2  *
3  * Copyright 1999,2000,2001 BrightStation PLC
4  * Copyright 2001 James Aylett
5  * Copyright 2001,2002 Ananova Ltd
6  * Copyright 2002 Intercede 1749 Ltd
7  * Copyright 2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2014 Olly Betts
8  * Copyright 2008 Thomas Viehmann
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public License as
12  * published by the Free Software Foundation; either version 2 of the
13  * License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18  * GNU General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public License
21  * along with this program; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
23  * USA
24  */
25 
26 #include <config.h>
27 
28 #include <algorithm>
29 #include <iostream>
30 #include <map>
31 #include <set>
32 #include <vector>
33 
34 #include <cassert>
35 #include <cctype>
36 #include "safeerrno.h"
37 #include <stdio.h>
38 #include <cstdlib>
39 #include <cstring>
40 #include "strcasecmp.h"
41 #include <ctime>
42 
43 #include "safeunistd.h"
44 #include <sys/types.h>
45 #include "safesysstat.h"
46 #include "safefcntl.h"
47 
48 #include "realtime.h"
49 
50 #include <cdb.h>
51 
52 #include "date.h"
53 #include "datematchdecider.h"
54 #include "utils.h"
55 #include "omega.h"
56 #include "query.h"
57 #include "cgiparam.h"
58 #include "loadfile.h"
59 #include "str.h"
60 #include "stringutils.h"
61 #include "transform.h"
62 #include "urldecode.h"
63 #include "urlencode.h"
64 #include "unixperm.h"
65 #include "values.h"
66 #include "weight.h"
67 #include "expand.h"
68 
69 #include <xapian.h>
70 
71 #ifndef XAPIAN_AT_LEAST
72 #define XAPIAN_AT_LEAST(A,B,C) \
73     (XAPIAN_MAJOR_VERSION > (A) || \
74      (XAPIAN_MAJOR_VERSION == (A) && \
75       (XAPIAN_MINOR_VERSION > (B) || \
76        (XAPIAN_MINOR_VERSION == (B) && XAPIAN_REVISION >= (C)))))
77 #endif
78 
79 using namespace std;
80 
81 using Xapian::Utf8Iterator;
82 
83 using Xapian::Unicode::is_wordchar;
84 
85 #ifndef SNPRINTF
86 #include <cstdarg>
87 
/** Fallback used when no snprintf() implementation is available (SNPRINTF
 *  is not defined).
 *
 *  The output is produced with vsprintf(), so an overflow cannot be
 *  prevented, only detected afterwards; in that case memory beyond the
 *  buffer has already been overwritten and the process is aborted.
 *
 *  @param str	    Destination buffer.
 *  @param size	    Size of @a str in bytes (must be at least 1).
 *  @param format   printf-style format string, followed by its arguments.
 *
 *  @return The number of characters written, not counting the trailing nul.
 */
static int my_snprintf(char *str, size_t size, const char *format, ...)
{
    // A zero-sized buffer can't even hold the terminator, and str[size - 1]
    // below would wrap round to a wild address.
    if (size == 0)
	abort();
    int res;
    va_list ap;
    va_start(ap, format);
    // Sentinel: if the last byte is non-zero after formatting, the output
    // ran up to (or past) the end of the buffer.
    str[size - 1] = '\0';
    res = vsprintf(str, format, ap);
    va_end(ap);
    if (str[size - 1] || res < 0 || size_t(res) >= size)
	abort(); /* Overflowed! */
    return res;
}
100 #else
101 #define my_snprintf SNPRINTF
102 #endif
103 
// File-level state shared by the query code in this file.

// NOTE(review): these two flags and `last` aren't used in the part of the
// file visible here; presumably they're maintained by ensure_query_parsed()
// and ensure_match() - confirm against those functions.
static bool query_parsed = false;
static bool done_query = false;
static Xapian::docid last = 0;

// The match results; assigned by run_query().
static Xapian::MSet mset;

static map<Xapian::docid, bool> ticked;

static void ensure_query_parsed();
static void ensure_match();

// The current query: set by set_probabilistic() and then combined with any
// filters by run_query().
static Xapian::Query query;
//static string url_query_string;
Xapian::Query::op default_op = Xapian::Query::OP_OR; // default matching mode

// Query parser, configured by set_probabilistic().
static Xapian::QueryParser qp;
// Range processor for "size:" ranges; allocated on first use by
// set_probabilistic().
static Xapian::NumberValueRangeProcessor * size_vrp = NULL;
// Stemmer used by html_highlight(); allocated on first use there.
static Xapian::Stem *stemmer = NULL;

static string eval_file(const string &fmtfile);

// The distinct terms from the parsed query (filled in by
// set_probabilistic()).
static set<string> termset;

// Holds mapping from term prefix to user prefix (e.g. 'S' -> 'subject:').
static map<string, string> termprefix_to_userprefix;

// The terms in termset as a tab-separated list, in the order they were first
// seen.
static string queryterms;

// Message from the query parser if parsing failed, otherwise empty.
static string error_msg;

// NOTE(review): presumably how long the match took, with -1 meaning "not
// measured" - confirm where this is set.
static double secs = -1;

// OmegaScript template for a log entry (tab-separated fields).
static const char DEFAULT_LOG_ENTRY[] =
	"$or{$env{REMOTE_HOST},$env{REMOTE_ADDR},-}\t"
	"[$date{$now,%d/%b/%Y:%H:%M:%S} +0000]\t"
	"$if{$cgi{X},add,$if{$cgi{MORELIKE},morelike,query}}\t"
	"$dbname\t"
	"$query\t"
	"$msize$if{$env{HTTP_REFERER},\t$env{HTTP_REFERER}}";
143 
144 class MyStopper : public Xapian::Stopper {
145   public:
operator ()(const string & t) const146     bool operator()(const string &t) const {
147 	switch (t[0]) {
148 	    case 'a':
149 		return (t == "a" || t == "about" || t == "an" || t == "and" ||
150 			t == "are" || t == "as" || t == "at");
151 	    case 'b':
152 		return (t == "be" || t == "by");
153 	    case 'e':
154 		return (t == "en");
155 	    case 'f':
156 		return (t == "for" || t == "from");
157 	    case 'h':
158 		return (t == "how");
159 	    case 'i':
160 		return (t == "i" || t == "in" || t == "is" || t == "it");
161 	    case 'o':
162 		return (t == "of" || t == "on" || t == "or");
163 	    case 't':
164 		return (t == "that" || t == "the" || t == "this" || t == "to");
165 	    case 'w':
166 		return (t == "was" || t == "what" || t == "when" ||
167 			t == "where" || t == "which" || t == "who" ||
168 			t == "why" || t == "will" || t == "with");
169 	    case 'y':
170 		return (t == "you" || t == "your");
171 	    default:
172 		return false;
173 	}
174     }
175 };
176 
177 static size_t
prefix_from_term(string & prefix,const string & term)178 prefix_from_term(string &prefix, const string &term)
179 {
180     if (term.empty()) {
181 	prefix.resize(0);
182 	return 0;
183     }
184     if (term[0] == 'X') {
185 	const string::const_iterator begin = term.begin();
186 	string::const_iterator i = begin + 1;
187 	while (i != term.end() && C_isupper(*i)) ++i;
188 	prefix.assign(begin, i);
189 	if (i != term.end() && *i == ':') ++i;
190 	return i - begin;
191     }
192 
193     prefix = term[0];
194     return 1;
195 }
196 
// Reject any filename containing ".." - format names, log file names, etc
// are vetted with this so that people can't open a format such as
// "../../etc/passwd".
//
// Returns true if the filename is acceptable, false if it contains "..".
//
// FIXME: make this check more exact ("foo..bar" is safe)
// FIXME: log when this check fails
static bool
vet_filename(const std::string &filename)
{
    for (std::string::size_type k = 1; k < filename.size(); ++k) {
	if (filename[k] == '.' && filename[k - 1] == '.') return false;
    }
    return true;
}
207 
208 // Heuristics:
209 // * If any terms have been removed, it's a "fresh query" so we discard any
210 //   relevance judgements
211 // * If all previous terms are there but more have been added then we keep
212 //   the relevance judgements, but return the first page of hits
213 //
214 // NEW_QUERY entirely new query
215 // SAME_QUERY unchanged query
216 // EXTENDED_QUERY new query, but based on the old one
217 // BAD_QUERY parse error (message in error_msg)
218 typedef enum { NEW_QUERY, SAME_QUERY, EXTENDED_QUERY, BAD_QUERY } querytype;
219 
220 static querytype
set_probabilistic(const string & oldp)221 set_probabilistic(const string &oldp)
222 {
223     // Parse the query string.
224     qp.set_stemmer(Xapian::Stem(option["stemmer"]));
225     qp.set_stemming_strategy(option["stem_all"] == "true" ? Xapian::QueryParser::STEM_ALL : Xapian::QueryParser::STEM_SOME);
226     qp.set_stopper(new MyStopper());
227     qp.set_default_op(default_op);
228     qp.set_database(db);
229     // FIXME: provide a custom VRP which handles size:10..20K, etc.
230     if (!size_vrp)
231 	size_vrp = new Xapian::NumberValueRangeProcessor(VALUE_SIZE, "size:",
232 							 true);
233     qp.add_valuerangeprocessor(size_vrp);
234     // std::map::insert() won't overwrite an existing entry, so we'll prefer
235     // the first user_prefix for which a particular term prefix is specified.
236     map<string, string>::const_iterator pfx = option.lower_bound("prefix,");
237     for (; pfx != option.end() && startswith(pfx->first, "prefix,"); ++pfx) {
238 	string user_prefix = pfx->first.substr(7);
239 	qp.add_prefix(user_prefix, pfx->second);
240 	termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
241     }
242     pfx = option.lower_bound("boolprefix,");
243     for (; pfx != option.end() && startswith(pfx->first, "boolprefix,"); ++pfx) {
244 	string user_prefix = pfx->first.substr(11);
245 	qp.add_boolean_prefix(user_prefix, pfx->second);
246 	termprefix_to_userprefix.insert(make_pair(pfx->second, user_prefix));
247     }
248 
249     try {
250 	unsigned f = 0;
251 	map<string, string>::const_iterator i = option.lower_bound("flag_");
252 	for (; i != option.end() && startswith(i->first, "flag_"); ++i) {
253 	    if (i->second.empty()) continue;
254 	    const string & s = i->first;
255 	    switch (s[5]) {
256 		case 'a':
257 		    if (s == "flag_auto_multiword_synonyms") {
258 			f |= Xapian::QueryParser::FLAG_AUTO_MULTIWORD_SYNONYMS;
259 			break;
260 		    }
261 		    if (s == "flag_auto_synonyms") {
262 			f |= Xapian::QueryParser::FLAG_AUTO_SYNONYMS;
263 			break;
264 		    }
265 		    break;
266 		case 'b':
267 		    if (s == "flag_boolean") {
268 			f |= Xapian::QueryParser::FLAG_BOOLEAN;
269 			break;
270 		    }
271 		    if (s == "flag_boolean_any_case") {
272 			f |= Xapian::QueryParser::FLAG_BOOLEAN_ANY_CASE;
273 			break;
274 		    }
275 		    break;
276 #if XAPIAN_AT_LEAST(1,2,22)
277 		case 'c':
278 		    if (s == "flag_cjk_ngram") {
279 			f |= Xapian::QueryParser::FLAG_CJK_NGRAM;
280 			break;
281 		    }
282 		    break;
283 #endif
284 		case 'd':
285 		    if (s == "flag_default") {
286 			f |= Xapian::QueryParser::FLAG_DEFAULT;
287 			break;
288 		    }
289 		    break;
290 		case 'l':
291 		    if (s == "flag_lovehate") {
292 			f |= Xapian::QueryParser::FLAG_LOVEHATE;
293 			break;
294 		    }
295 		    break;
296 		case 'p':
297 		    if (s == "flag_partial") {
298 			f |= Xapian::QueryParser::FLAG_PARTIAL;
299 			break;
300 		    }
301 		    if (s == "flag_phrase") {
302 			f |= Xapian::QueryParser::FLAG_PHRASE;
303 			break;
304 		    }
305 		    if (s == "flag_pure_not") {
306 			f |= Xapian::QueryParser::FLAG_PURE_NOT;
307 			break;
308 		    }
309 		    break;
310 		case 's':
311 		    if (s == "flag_spelling_correction") {
312 			f |= Xapian::QueryParser::FLAG_SPELLING_CORRECTION;
313 			break;
314 		    }
315 		    if (s == "flag_synonym") {
316 			f |= Xapian::QueryParser::FLAG_SYNONYM;
317 			break;
318 		    }
319 		    break;
320 		case 'w':
321 		    if (s == "flag_wildcard") {
322 			f |= Xapian::QueryParser::FLAG_WILDCARD;
323 			break;
324 		    }
325 		    break;
326 	    }
327 	}
328 	if (option["spelling"] == "true")
329 	    f |= qp.FLAG_SPELLING_CORRECTION;
330 	query = qp.parse_query(query_string, f);
331     } catch (Xapian::QueryParserError &e) {
332 	error_msg = e.get_msg();
333 	return BAD_QUERY;
334     }
335 
336     Xapian::termcount n_new_terms = 0;
337     for (Xapian::TermIterator i = query.get_terms_begin();
338 	 i != query.get_terms_end(); ++i) {
339 	if (termset.find(*i) == termset.end()) {
340 	    termset.insert(*i);
341 	    if (!queryterms.empty()) queryterms += '\t';
342 	    queryterms += *i;
343 	}
344 	n_new_terms++;
345     }
346 
347     // Check new query against the previous one
348     if (oldp.empty()) return query_string.empty() ? SAME_QUERY : NEW_QUERY;
349 
350     // Long, long ago we used "word1#word2#" (with trailing #) but some broken
351     // old browsers (versions of MSIE) don't quote # in form GET submissions
352     // and everything after the # gets interpreted as an anchor.  We now allow
353     // terms like `c#' so we want to avoid '#' anyway.
354     //
355     // So we switched to using "word1.word2." but that doesn't work if
356     // the terms contain "." themselves (e.g. Tapplication/vnd.ms-excel)
357     // so now we use "word1\tword2" instead (with no trailing separator).
358     //
359     // However for compatibility with templates which haven't been updated and
360     // bookmarked queries from Omega 0.9.6 and earlier we still support ".".
361     char separator = '\t';
362     unsigned int n_old_terms = count(oldp.begin(), oldp.end(), '\t') + 1;
363     if (n_old_terms == 1 && oldp[oldp.size() - 1] == '.') {
364 	separator = '.';
365 	n_old_terms = count(oldp.begin(), oldp.end(), '.');
366     }
367 
368     // short-cut: if the new query has fewer terms, it must be a new one
369     if (n_new_terms < n_old_terms) return NEW_QUERY;
370 
371     const char *term = oldp.c_str();
372     const char *pend;
373     while ((pend = strchr(term, separator)) != NULL) {
374 	if (termset.find(string(term, pend - term)) == termset.end())
375 	    return NEW_QUERY;
376 	term = pend + 1;
377     }
378     if (*term) {
379 	if (termset.find(string(term)) == termset.end())
380 	    return NEW_QUERY;
381     }
382 
383     // Use termset.size() rather than n_new_terms so we correctly handle
384     // the case when the query has repeated terms.
385     // This works wrongly in the case when the user extends the query
386     // by adding a term already in it, but that's unlikely and the behaviour
387     // isn't too bad (we just don't reset page 1).  We also mishandle a few
388     // other obscure cases e.g. adding quotes to turn a query into a phrase.
389     if (termset.size() > n_old_terms) return EXTENDED_QUERY;
390     return SAME_QUERY;
391 }
392 
393 static multimap<string, string> filter_map;
394 
395 typedef multimap<string, string>::const_iterator FMCI;
396 
add_bterm(const string & term)397 void add_bterm(const string &term) {
398     string prefix;
399     if (prefix_from_term(prefix, term) > 0)
400 	filter_map.insert(multimap<string, string>::value_type(prefix, term));
401 }
402 
403 static void
run_query()404 run_query()
405 {
406     bool force_boolean = false;
407     if (!filter_map.empty()) {
408 	// OR together filters with the same prefix, then AND together
409 	vector<Xapian::Query> filter_vec;
410 	vector<string> or_vec;
411 	string current;
412 	for (FMCI i = filter_map.begin(); ; i++) {
413 	    bool over = (i == filter_map.end());
414 	    if (over || i->first != current) {
415 		switch (or_vec.size()) {
416 		    case 0:
417 		        break;
418 		    case 1:
419 			filter_vec.push_back(Xapian::Query(or_vec[0]));
420 		        break;
421 		    default:
422 			filter_vec.push_back(Xapian::Query(Xapian::Query::OP_OR,
423 						     or_vec.begin(),
424 						     or_vec.end()));
425 		        break;
426 		}
427 		or_vec.clear();
428 		if (over) break;
429 		current = i->first;
430 	    }
431 	    or_vec.push_back(i->second);
432 	}
433 
434 	Xapian::Query filter(Xapian::Query::OP_AND,
435 			     filter_vec.begin(), filter_vec.end());
436 
437 	if (query.empty()) {
438 	    // If no probabilistic query is provided then promote the filters
439 	    // to be THE query - filtering an empty query will give no
440 	    // matches.
441 	    std::swap(query, filter);
442 	    force_boolean = true;
443 	} else {
444 	    query = Xapian::Query(Xapian::Query::OP_FILTER, query, filter);
445 	}
446     }
447 
448     Xapian::MatchDecider * mdecider = NULL;
449     if (!date_start.empty() || !date_end.empty() || !date_span.empty()) {
450 	MCI i = cgi_params.find("DATEVALUE");
451 	if (i != cgi_params.end()) {
452 	    Xapian::valueno datevalue = string_to_int(i->second);
453 	    mdecider = new DateMatchDecider(datevalue, date_start, date_end, date_span);
454 	} else {
455 	    Xapian::Query date_filter(Xapian::Query::OP_OR,
456 				      date_range_filter(date_start, date_end,
457 							date_span),
458 				      Xapian::Query("Dlatest"));
459 
460 	    // If no probabilistic query is provided then promote the daterange
461 	    // filter to be THE query instead of filtering an empty query.
462 	    if (query.empty()) {
463 		query = date_filter;
464 	    } else {
465 		query = Xapian::Query(Xapian::Query::OP_FILTER, query, date_filter);
466 	    }
467 	}
468     }
469 
470     if (!enquire || !error_msg.empty()) return;
471 
472     set_weighting_scheme(*enquire, option, force_boolean);
473 
474     enquire->set_cutoff(threshold);
475 
476     if (sort_key != Xapian::BAD_VALUENO) {
477 	if (sort_after) {
478 	    enquire->set_sort_by_relevance_then_value(sort_key, sort_ascending);
479 	} else {
480 	    enquire->set_sort_by_value_then_relevance(sort_key, sort_ascending);
481 	}
482     }
483 
484     enquire->set_docid_order(docid_order);
485 
486     if (collapse) {
487 	enquire->set_collapse_key(collapse_key);
488     }
489 
490     if (!query.empty()) {
491 #if 0
492 	// FIXME: If we start doing permissions checks based on $REMOTE_USER
493 	// we're going to break some existing setups if users upgrade.  We
494 	// probably want a way to set this from OmegaScript.
495 	const char * remote_user = getenv("REMOTE_USER");
496 	if (remote_user)
497 	    apply_unix_permissions(query, remote_user);
498 #endif
499 
500 	enquire->set_query(query);
501 	// We could use the value of topdoc as first parameter, but we
502 	// need to know the first few items in the mset to fake a
503 	// relevance set for topterms.
504 	//
505 	// If min_hits isn't set, check at least one extra result so we
506 	// know if we've reached the end of the matches or not - then we
507 	// can avoid offering a "next" button which leads to an empty page.
508 	mset = enquire->get_mset(0, topdoc + hits_per_page,
509 				 topdoc + max(hits_per_page + 1, min_hits),
510 				 &rset, mdecider);
511     }
512 }
513 
/** Escape a string for inclusion in HTML.
 *
 *  The characters <, >, & and " are replaced by &lt;, &gt;, &amp; and
 *  &quot; respectively; everything else is copied through unchanged.
 */
std::string
html_escape(const std::string &str)
{
    std::string escaped;
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
	const char c = *it;
	if (c == '<') {
	    escaped += "&lt;";
	} else if (c == '>') {
	    escaped += "&gt;";
	} else if (c == '&') {
	    escaped += "&amp;";
	} else if (c == '"') {
	    escaped += "&quot;";
	} else {
	    escaped += c;
	}
    }
    return escaped;
}
540 
/** Remove HTML tags from a string.
 *
 *  Everything from a '<' up to and including the next '>' is dropped (or up
 *  to the end of the string if there's no '>').  A '>' which isn't closing
 *  a tag is dropped too.
 */
static std::string
html_strip(const std::string &str)
{
    std::string stripped;
    bool in_tag = false;
    for (std::string::const_iterator it = str.begin(); it != str.end(); ++it) {
	const char c = *it;
	if (c == '<') {
	    in_tag = true;
	} else if (c == '>') {
	    in_tag = false;
	} else if (!in_tag) {
	    stripped += c;
	}
    }
    return stripped;
}
562 
// Look up a word in a tab-separated list.
//
// Returns the zero-based index of the first entry of list which is exactly
// equal to word, or -1 if there is no such entry.  An empty list counts as
// holding a single empty entry.
//
// FIXME split list into hash or map and use that rather than linear lookup?
static int word_in_list(const std::string& word, const std::string& list)
{
    int index = 0;
    std::string::size_type start = 0;
    for (;;) {
	const std::string::size_type tab = list.find('\t', start);
	// Length of the current entry (npos means "to the end of list").
	const std::string::size_type len =
	    (tab == std::string::npos) ? std::string::npos : tab - start;
	if (list.compare(start, len, word) == 0) return index;
	if (tab == std::string::npos) return -1;
	start = tab + 1;
	++index;
    }
}
582 
583 // Not a character in an identifier
584 inline static bool
p_notid(unsigned int c)585 p_notid(unsigned int c)
586 {
587     return !C_isalnum(c) && c != '_';
588 }
589 
590 // Not a character in an HTML tag name
591 inline static bool
p_nottag(unsigned int c)592 p_nottag(unsigned int c)
593 {
594     return !C_isalnum(c) && c != '.' && c != '-';
595 }
596 
// HTML-escape a string, highlighting the words in it which appear in a list
// of terms.
//
// s	 The text to process (iterated as UTF-8).
// list	 Tab-separated list of terms to highlight.  A word matches if its
//	 lower-cased form is in the list, or failing that if "Z" followed by
//	 its stem is.
// bra	 Text to insert before each highlighted word.  If empty, each word is
//	 wrapped in <b style="..."> ... </b> instead, with the colours chosen
//	 by the matching term's position in list.
// ket	 Text to insert after each highlighted word; only used when bra is
//	 non-empty.
//
// Returns the escaped and highlighted text.
//
// The global stemmer is created on first use from option["stemmer"].
//
// FIXME: shares algorithm with indextext.cc!
static string
html_highlight(const string &s, const string &list,
	       const string &bra, const string &ket)
{
    if (!stemmer) {
	stemmer = new Xapian::Stem(option["stemmer"]);
    }

    string res;

    Utf8Iterator j(s);
    const Utf8Iterator s_end;
    while (true) {
	// Skip ahead to the start of the next word; stop if there isn't one.
	Utf8Iterator first = j;
	while (first != s_end && !is_wordchar(*first)) ++first;
	if (first == s_end) break;
	Utf8Iterator term_end;
	string term;
	string word;
	// Start of the text which hasn't been added to res yet.
	const char *l = j.raw();
	if (*first < 128 && C_isupper(*first)) {
	    // Try to read an acronym written as ASCII capitals separated by
	    // '.' (e.g. "U.S.A"), gathering just the letters into term.
	    j = first;
	    Xapian::Unicode::append_utf8(term, *j);
	    while (++j != s_end && *j == '.' && ++j != s_end && *j < 128 && C_isupper(*j)) {
		Xapian::Unicode::append_utf8(term, *j);
	    }
	    // Give up on this if we got fewer than two letters, or if a word
	    // character follows directly; term is then re-read below.
	    if (term.length() < 2 || (j != s_end && is_wordchar(*j))) {
		term.resize(0);
	    }
	    term_end = j;
	}
	if (term.empty()) {
	    // Read an ordinary word.
	    j = first;
	    while (is_wordchar(*j)) {
		Xapian::Unicode::append_utf8(term, *j);
		++j;
		if (j == s_end) break;
		// A '&' or '\'' is kept as part of the word only when a word
		// character comes straight after it.
		if (*j == '&' || *j == '\'') {
		    Utf8Iterator next = j;
		    ++next;
		    if (next == s_end || !is_wordchar(*next)) break;
		    term += *j;
		    j = next;
		}
	    }
	    term_end = j;
	    if (j != s_end && (*j == '+' || *j == '-' || *j == '#')) {
		// Look at a suffix made of '+' and '-' characters, or of '#'
		// characters (a run of '#' adds just one '#' to term).
		string::size_type len = term.length();
		if (*j == '#') {
		    term += '#';
		    do { ++j; } while (j != s_end && *j == '#');
		} else {
		    while (j != s_end && (*j == '+' || *j == '-')) {
			Xapian::Unicode::append_utf8(term, *j);
			++j;
		    }
		}
		// Drop the suffix again if it added more than 3 characters or
		// a word character follows it; otherwise the word extends to
		// the end of the suffix.
		if (term.size() - len > 3 || (j != s_end && is_wordchar(*j))) {
		    term.resize(len);
		} else {
		    term_end = j;
		}
	    }
	}
	j = term_end;
	term = Xapian::Unicode::tolower(term);
	// Look for the term as is first, then for its stemmed form.
	int match = word_in_list(term, list);
	if (match == -1) {
	    string stem = "Z";
	    stem += (*stemmer)(term);
	    match = word_in_list(stem, list);
	}
	if (match >= 0) {
	    // Output the text before the word, then the highlighted word.
	    res += html_escape(string(l, first.raw() - l));
	    if (!bra.empty()) {
		res += bra;
	    } else {
		static const char * colours[] = {
		    "ffff66", "99ff99", "99ffff", "ff66ff", "ff9999",
		    "990000", "009900", "996600", "006699", "990099"
		};
		// Cycle through the colours by position in the term list.
		size_t idx = match % (sizeof(colours) / sizeof(colours[0]));
		const char * bg = colours[idx];
		// Background colours containing an 'f' get black text, the
		// others white.
		if (strchr(bg, 'f')) {
		    res += "<b style=\"color:black;background-color:#";
		} else {
		    res += "<b style=\"color:white;background-color:#";
		}
		res += bg;
		res += "\">";
	    }
	    word.assign(first.raw(), j.raw() - first.raw());
	    res += html_escape(word);
	    if (!bra.empty()) {
		res += ket;
	    } else {
		res += "</b>";
	    }
	} else {
	    // No match: output everything up to the end of the word as is.
	    res += html_escape(string(l, j.raw() - l));
	}
    }
    // Output any text left after the last word.
    if (j != s_end) res += html_escape(string(j.raw(), j.left()));
    return res;
}
703 
704 #if 0
705 static void
706 print_query_string(const char *after)
707 {
708     if (after && strncmp(after, "&B=", 3) == 0) {
709 	char prefix = after[3];
710 	string::size_type start = 0, amp = 0;
711 	while (true) {
712 	    amp = url_query_string.find('&', amp);
713 	    if (amp == string::npos) {
714 		cout << url_query_string.substr(start);
715 		return;
716 	    }
717 	    amp++;
718 	    while (url_query_string[amp] == 'B' &&
719 		   url_query_string[amp + 1] == '=' &&
720 		   url_query_string[amp + 2] == prefix) {
721 		cout << url_query_string.substr(start, amp - start - 1);
722 		start = url_query_string.find('&', amp + 3);
723 		if (start == string::npos) return;
724 		amp = start + 1;
725 	    }
726 	}
727     }
728     cout << url_query_string;
729 }
730 #endif
731 
732 class Fields {
733     mutable Xapian::docid did_cached;
734     mutable map<string, string> fields;
735 
736     void read_fields(Xapian::docid did) const;
737 
738   public:
Fields()739     Fields() : did_cached(0) { }
740 
get_field(Xapian::docid did,const string & field) const741     const string & get_field(Xapian::docid did, const string & field) const {
742 	if (did != did_cached) read_fields(did);
743 	return fields[field];
744     }
745 };
746 
747 void
read_fields(Xapian::docid did) const748 Fields::read_fields(Xapian::docid did) const
749 {
750     fields.clear();
751     did_cached = did;
752     const string & data = db.get_document(did).get_data();
753 
754     // Parse document data.
755     string::size_type i = 0;
756     const string & names = option["fieldnames"];
757     if (!names.empty()) {
758 	// Each line is a field, with fieldnames taken from corresponding
759 	// entries in the tab-separated list specified by $opt{fieldnames}.
760 	string::size_type n = 0;
761 	do {
762 	    string::size_type n0 = n;
763 	    n = names.find('\t', n);
764 	    string::size_type i0 = i;
765 	    i = data.find('\n', i);
766 	    fields.insert(make_pair(names.substr(n0, n  - n0),
767 				    data.substr(i0, i - i0)));
768 	} while (++n && ++i);
769     } else {
770 	// Each line is a field, in the format NAME=VALUE.  We assume the field
771 	// name doesn't contain an "=".  Lines without an "=" are currently
772 	// just ignored.
773 	do {
774 	    string::size_type i0 = i;
775 	    i = data.find('\n', i);
776 	    string line = data.substr(i0, i - i0);
777 	    string::size_type j = line.find('=');
778 	    if (j != string::npos) {
779 		string & value = fields[line.substr(0, j)];
780 		if (!value.empty()) value += '\t';
781 		value.append(line, j + 1, string::npos);
782 	    }
783 	} while (++i);
784     }
785 }
786 
// Cache of the fields of the document most recently looked up.
static Fields fields;

// NOTE(review): the variables below aren't assigned in the part of the file
// visible here; judging by the $id, $hit, $percentage, $weight and
// $collapsed commands they presumably describe the hit currently being
// processed - confirm against the code which sets them.
static Xapian::docid q0;
static Xapian::doccount hit_no;
static int percent;
static Xapian::weight weight;
static Xapian::doccount collapsed;

static string print_caption(const string &fmt, const vector<string> &param);
795 
// Tags identifying each OmegaScript command.  Each CMD_xxx is paired with
// the command name "xxx" by the T() macro used to build func_tab; CMD_ is
// the tag for the empty command name.
enum tagval {
CMD_,
CMD_add,
CMD_addfilter,
CMD_allterms,
CMD_and,
CMD_cgi,
CMD_cgilist,
CMD_collapsed,
CMD_date,
CMD_dbname,
CMD_dbsize,
CMD_def,
CMD_defaultop,
CMD_div,
CMD_eq,
CMD_emptydocs,
CMD_env,
CMD_error,
CMD_field,
CMD_filesize,
CMD_filters,
CMD_filterterms,
CMD_find,
CMD_fmt,
CMD_freq,
CMD_ge,
CMD_gt,
CMD_highlight,
CMD_hit,
CMD_hitlist,
CMD_hitsperpage,
CMD_hostname,
CMD_html,
CMD_htmlstrip,
CMD_httpheader,
CMD_id,
CMD_if,
CMD_include,
CMD_last,
CMD_lastpage,
CMD_le,
CMD_length,
CMD_list,
CMD_log,
CMD_lookup,
CMD_lower,
CMD_lt,
CMD_map,
CMD_max,
CMD_min,
CMD_mod,
CMD_msize,
CMD_msizeexact,
CMD_mul,
CMD_muldiv,
CMD_ne,
CMD_nice,
CMD_not,
CMD_now,
CMD_opt,
CMD_or,
CMD_pack,
CMD_percentage,
CMD_prettyterm,
CMD_prettyurl,
CMD_query,
CMD_querydescription,
CMD_queryterms,
CMD_range,
CMD_record,
CMD_relevant,
CMD_relevants,
CMD_score,
CMD_set,
CMD_setmap,
CMD_setrelevant,
CMD_slice,
CMD_split,
CMD_stoplist,
CMD_sub,
CMD_substr,
CMD_suggestion,
CMD_terms,
CMD_thispage,
CMD_time,
CMD_topdoc,
CMD_topterms,
CMD_transform,
CMD_uniq,
CMD_unpack,
CMD_unstem,
CMD_upper,
CMD_url,
CMD_value,
CMD_version,
CMD_weight,
CMD_MACRO // special tag for macro evaluation
};
895 
896 struct func_attrib {
897     int tag;
898     int minargs, maxargs, evalargs;
899     char ensure;
900 };
901 
902 #define T(F,A,B,C,D) {STRINGIZE(F),{CMD_##F,A,B,C,D}}
903 struct func_desc {
904     const char *name;
905     struct func_attrib a;
906 };
907 
908 #define N -1
909 #define M 'M'
910 #define Q 'Q'
911 // NB when adding a new command which ensures M or Q, update the list in
912 // docs/omegascript.rst
913 static struct func_desc func_tab[] = {
914 //name minargs maxargs evalargs ensure
915 {"",{CMD_,	   N, N, 0, 0}},// commented out code
916 T(add,		   0, N, N, 0), // add a list of numbers
917 T(addfilter,	   1, 1, N, 0), // add filter term
918 T(allterms,	   0, 1, N, 0), // list of all terms matching document
919 T(and,		   1, N, 0, 0), // logical shortcutting and of a list of values
920 T(cgi,		   1, 1, N, 0), // return cgi parameter value
921 T(cgilist,	   1, 1, N, 0), // return list of values for cgi parameter
922 T(collapsed,	   0, 0, N, 0), // return number of hits collapsed into this
923 T(date,		   1, 2, N, 0), // convert time_t to strftime format
924 				// (default: YYYY-MM-DD)
925 T(dbname,	   0, 0, N, 0), // database name
926 T(dbsize,	   0, 0, N, 0), // database size (# of documents)
927 T(def,		   2, 2, 1, 0), // define a macro
928 T(defaultop,	   0, 0, N, 0), // default operator: "and" or "or"
929 T(div,		   2, 2, N, 0), // integer divide
930 T(emptydocs,	   0, 1, N, 0), // list of empty documents
931 T(env,		   1, 1, N, 0), // environment variable
932 T(error,	   0, 0, N, 0), // error message
933 T(eq,		   2, 2, N, 0), // test equality
934 T(field,	   1, 2, N, 0), // lookup field in record
935 T(filesize,	   1, 1, N, 0), // pretty printed filesize
936 T(filters,	   0, 0, N, 0), // serialisation of current filters
937 T(filterterms,	   1, 1, N, 0), // list of terms with a given prefix
938 T(find,		   2, 2, N, 0), // find entry in list
939 T(fmt,		   0, 0, N, 0), // name of current format
940 T(freq,		   1, 1, N, 0), // frequency of a term
941 T(ge,		   2, 2, N, 0), // test >=
942 T(gt,		   2, 2, N, 0), // test >
943 T(highlight,	   2, 4, N, 0), // html escape and highlight words from list
944 T(hit,		   0, 0, N, 0), // hit number of current mset entry (0-based)
945 T(hitlist,	   1, 1, 0, M), // display hitlist using format in argument
946 T(hitsperpage,	   0, 0, N, 0), // hits per page
947 T(hostname,	   1, 1, N, 0), // extract hostname from URL
948 T(html,		   1, 1, N, 0), // html escape string (<>&")
949 T(htmlstrip,	   1, 1, N, 0), // html strip tags string (s/<[^>]*>?//g)
950 T(httpheader,      2, 2, N, 0), // arbitrary HTTP header
951 T(id,		   0, 0, N, 0), // docid of current doc
952 T(if,		   2, 3, 1, 0), // conditional
953 T(include,	   1, 1, 1, 0), // include another file
954 T(last,		   0, 0, N, M), // hit number one beyond end of current page
955 T(lastpage,	   0, 0, N, M), // number of last hit page
956 T(le,		   2, 2, N, 0), // test <=
957 T(length,	   1, 1, N, 0), // length of list
958 T(list,		   2, 5, N, 0), // pretty print list
959 T(log,		   1, 2, 1, 0), // create a log entry
960 T(lookup,	   2, 2, N, 0), // lookup in named cdb file
961 T(lower,	   1, 1, N, 0), // convert string to lower case
962 T(lt,		   2, 2, N, 0), // test <
963 T(map,		   1, 2, 1, 0), // map a list into another list
964 T(max,		   1, N, N, 0), // maximum of a list of values
965 T(min,		   1, N, N, 0), // minimum of a list of values
966 T(mod,		   2, 2, N, 0), // integer modulus
967 T(msize,	   0, 0, N, M), // number of matches
968 T(msizeexact,	   0, 0, N, M), // is $msize exact?
969 T(mul,		   2, N, N, 0), // multiply a list of numbers
970 T(muldiv,	   3, 3, N, 0), // calculate A*B/C
971 T(ne,		   2, 2, N, 0), // test not equal
972 T(nice,		   1, 1, N, 0), // pretty print integer (with thousands sep)
973 T(not,		   1, 1, N, 0), // logical not
974 T(now,		   0, 0, N, 0), // current date/time as a time_t
975 T(opt,		   1, 2, N, 0), // lookup an option value
976 T(or,		   1, N, 0, 0), // logical shortcutting or of a list of values
977 T(pack,		   1, 1, N, 0), // convert a number to a 4 byte big endian binary string
978 T(percentage,	   0, 0, N, 0), // percentage score of current hit
979 T(prettyterm,	   1, 1, N, Q), // pretty print term name
980 T(prettyurl,	   1, 1, N, 0), // pretty version of URL
981 T(query,	   0, 0, N, Q), // query
982 T(querydescription,0, 0, N, M), // query.get_description() (run_query() adds filters so M)
983 T(queryterms,	   0, 0, N, Q), // list of query terms
984 T(range,	   2, 2, N, 0), // return list of values between start and end
985 T(record,	   0, 1, N, 0), // record contents of document
986 T(relevant,	   0, 1, N, Q), // is document relevant?
987 T(relevants,	   0, 0, N, Q), // return list of relevant documents
988 T(score,	   0, 0, N, 0), // score (0-10) of current hit
989 T(set,		   2, 2, N, 0), // set option value
990 T(setmap,	   1, N, N, 0), // set map of option values
991 T(setrelevant,     0, 1, N, Q), // set rset
992 T(slice,	   2, 2, N, 0), // slice a list using a second list
993 T(split,	   1, 2, N, 0), // split a string to give a list
994 T(stoplist,	   0, 0, N, Q), // return list of stopped terms
995 T(sub,		   2, 2, N, 0), // subtract
996 T(substr,	   2, 3, N, 0), // substring
997 T(suggestion,	   0, 0, N, Q), // misspelled word correction suggestion
998 T(terms,	   0, 0, N, M), // list of matching terms
999 T(thispage,	   0, 0, N, M), // page number of current page
1000 T(time,		   0, 0, N, M), // how long the match took (in seconds)
1001 T(topdoc,	   0, 0, N, M), // first document on current page of hit list
1002 				// (counting from 0)
1003 T(topterms,	   0, 1, N, M), // list of up to N top relevance feedback terms
1004 				// (default 16)
1005 T(transform,	   3, 3, N, 0), // transform with a regexp
1006 T(uniq,		   1, 1, N, 0), // removed duplicates from a sorted list
1007 T(unpack,	   1, 1, N, 0), // convert 4 byte big endian binary string to a number
1008 T(unstem,	   1, 1, N, Q), // return list of probabilistic terms from
1009 				// the query which stemmed to this term
1010 T(upper,	   1, 1, N, 0), // convert string to upper case
1011 T(url,		   1, 1, N, 0), // url encode argument
1012 T(value,	   1, 2, N, 0), // return document value
1013 T(version,	   0, 0, N, 0), // omega version string
1014 T(weight,	   0, 0, N, 0), // weight of the current hit
1015 { NULL,{0,	   0, 0, 0, 0}}
1016 };
1017 
1018 #undef T // Leaving T defined screws up Sun's C++ compiler!
1019 
// Bodies of user-defined OmegaScript macros, appended by $def in eval().
// A macro's func_attrib tag is CMD_MACRO plus its index in this vector, and
// eval() maps the tag back to the body text stored here when the macro is
// invoked.
static vector<string> macros;
1021 
1022 // Call write() repeatedly until all data is written or we get a
1023 // non-recoverable error.
1024 static ssize_t
write_all(int fd,const char * buf,size_t count)1025 write_all(int fd, const char * buf, size_t count)
1026 {
1027     while (count) {
1028 	ssize_t r = write(fd, buf, count);
1029 	if (rare(r < 0)) {
1030 	    if (errno == EINTR) continue;
1031 	    return r;
1032 	}
1033 	buf += r;
1034 	count -= r;
1035     }
1036     return 0;
1037 }
1038 
1039 static string
eval(const string & fmt,const vector<string> & param)1040 eval(const string &fmt, const vector<string> &param)
1041 {
1042     static map<string, const struct func_attrib *> func_map;
1043     if (func_map.empty()) {
1044 	struct func_desc *p;
1045 	for (p = func_tab; p->name != NULL; p++) {
1046 	    func_map[string(p->name)] = &(p->a);
1047 	}
1048     }
1049     string res;
1050     string::size_type p = 0, q;
1051     while ((q = fmt.find('$', p)) != string::npos) try {
1052 	res.append(fmt, p, q - p);
1053 	string::size_type code_start = q; // note down for error reporting
1054 	q++;
1055 	if (q >= fmt.size()) break;
1056 	unsigned char ch = fmt[q];
1057 	switch (ch) {
1058 	    // Magic sequences:
1059 	    // `$$' -> `$', `$(' -> `{', `$)' -> `}', `$.' -> `,'
1060 	    case '$':
1061 		res += '$';
1062 		p = q + 1;
1063 		continue;
1064 	    case '(':
1065 		res += '{';
1066 		p = q + 1;
1067 		continue;
1068 	    case ')':
1069 		res += '}';
1070 		p = q + 1;
1071 		continue;
1072 	    case '.':
1073 		res += ',';
1074 		p = q + 1;
1075 		continue;
1076 	    case '_':
1077 		ch = '0';
1078 		// FALL THRU
1079 	    case '1': case '2': case '3': case '4': case '5':
1080 	    case '6': case '7': case '8': case '9':
1081 		ch -= '0';
1082 		if (ch < param.size()) res += param[ch];
1083 		p = q + 1;
1084 		continue;
1085 	    case 'a': case 'b': case 'c': case 'd': case 'e': case 'f':
1086 	    case 'g': case 'h': case 'i': case 'j': case 'k': case 'l':
1087 	    case 'm': case 'n': case 'o': case 'p': case 'q': case 'r':
1088 	    case 's': case 't': case 'u': case 'v': case 'w': case 'x':
1089 	    case 'y': case 'z':
1090 	    case 'A': case 'B': case 'C': case 'D': case 'E': case 'F':
1091 	    case 'G': case 'H': case 'I': case 'J': case 'K': case 'L':
1092 	    case 'M': case 'N': case 'O': case 'P': case 'Q': case 'R':
1093 	    case 'S': case 'T': case 'U': case 'V': case 'W': case 'X':
1094 	    case 'Y': case 'Z':
1095 	    case '{':
1096 		break;
1097 	    default:
1098 		string msg = "Unknown $ code in: $" + fmt.substr(q);
1099 		throw msg;
1100 	}
1101 	p = find_if(fmt.begin() + q, fmt.end(), p_notid) - fmt.begin();
1102 	string var = fmt.substr(q, p - q);
1103 	map<string, const struct func_attrib *>::const_iterator func;
1104 	func = func_map.find(var);
1105 	if (func == func_map.end()) {
1106 	    throw "Unknown function `" + var + "'";
1107 	}
1108 	vector<string> args;
1109 	if (fmt[p] == '{') {
1110 	    q = p + 1;
1111 	    int nest = 1;
1112 	    while (true) {
1113 		p = fmt.find_first_of(",{}", p + 1);
1114 		if (p == string::npos)
1115 		    throw "missing } in " + fmt.substr(code_start);
1116 		if (fmt[p] == '{') {
1117 		    ++nest;
1118 		} else {
1119 		    if (nest == 1) {
1120 			// should we split the args
1121 			if (func->second->minargs != N) {
1122 			    args.push_back(fmt.substr(q, p - q));
1123 			    q = p + 1;
1124 			}
1125 		    }
1126 		    if (fmt[p] == '}' && --nest == 0) break;
1127 		}
1128 	    }
1129 	    if (func->second->minargs == N)
1130 		args.push_back(fmt.substr(q, p - q));
1131 	    p++;
1132 	}
1133 
1134 	if (func->second->minargs != N) {
1135 	    if ((int)args.size() < func->second->minargs)
1136 		throw "too few arguments to $" + var;
1137 	    if (func->second->maxargs != N &&
1138 		(int)args.size() > func->second->maxargs)
1139 		throw "too many arguments to $" + var;
1140 
1141 	    vector<string>::size_type n;
1142 	    if (func->second->evalargs != N)
1143 		n = func->second->evalargs;
1144 	    else
1145 		n = args.size();
1146 
1147 	    for (vector<string>::size_type j = 0; j < n; j++)
1148 		args[j] = eval(args[j], param);
1149 	}
1150 	if (func->second->ensure == 'Q' || func->second->ensure == 'M')
1151 	    ensure_query_parsed();
1152 	if (func->second->ensure == 'M') ensure_match();
1153 	string value;
1154 	switch (func->second->tag) {
1155 	    case CMD_:
1156 	        break;
1157 	    case CMD_add: {
1158 		int total = 0;
1159 		vector<string>::const_iterator i;
1160 		for (i = args.begin(); i != args.end(); i++)
1161 		    total += string_to_int(*i);
1162 		value = str(total);
1163 		break;
1164 	    }
1165 	    case CMD_addfilter:
1166 		add_bterm(args[0]);
1167 		break;
1168 	    case CMD_allterms: {
1169 		// list of all terms indexing document
1170 		int id = q0;
1171 		if (!args.empty()) id = string_to_int(args[0]);
1172 		Xapian::TermIterator term = db.termlist_begin(id);
1173 		for ( ; term != db.termlist_end(id); term++) {
1174 		    value += *term;
1175 		    value += '\t';
1176 		}
1177 
1178 		if (!value.empty()) value.erase(value.size() - 1);
1179 		break;
1180 	    }
1181 	    case CMD_and: {
1182 		value = "true";
1183 		for (vector<string>::const_iterator i = args.begin();
1184 		     i != args.end(); i++) {
1185 		    if (eval(*i, param).empty()) {
1186 			value.resize(0);
1187 			break;
1188 		    }
1189 	        }
1190 		break;
1191 	    }
1192 	    case CMD_cgi: {
1193 		MCI i = cgi_params.find(args[0]);
1194 		if (i != cgi_params.end()) value = i->second;
1195 		break;
1196 	    }
1197 	    case CMD_cgilist: {
1198 		pair<MCI, MCI> g;
1199 		g = cgi_params.equal_range(args[0]);
1200 		for (MCI i = g.first; i != g.second; i++) {
1201 		    value += i->second;
1202 		    value += '\t';
1203 		}
1204 		if (!value.empty()) value.erase(value.size() - 1);
1205 		break;
1206 	    }
1207 	    case CMD_collapsed: {
1208 		value = str(collapsed);
1209 		break;
1210 	    }
1211 	    case CMD_date:
1212 		value = args[0];
1213 		if (!value.empty()) {
1214 		    char buf[64] = "";
1215 		    time_t date = string_to_int(value);
1216 		    if (date != (time_t)-1) {
1217 			struct tm *then;
1218 			then = gmtime(&date);
1219 			string date_fmt = "%Y-%m-%d";
1220 			if (args.size() > 1) date_fmt = eval(args[1], param);
1221 			strftime(buf, sizeof buf, date_fmt.c_str(), then);
1222 		    }
1223 		    value = buf;
1224 		}
1225 		break;
1226 	    case CMD_dbname:
1227 		value = dbname;
1228 		break;
1229 	    case CMD_dbsize: {
1230 		static Xapian::doccount dbsize;
1231 		if (!dbsize) dbsize = db.get_doccount();
1232 		value = str(dbsize);
1233 		break;
1234 	    }
1235 	    case CMD_def: {
1236 		func_attrib *fa = new func_attrib;
1237 		fa->tag = CMD_MACRO + macros.size();
1238 		fa->minargs = 0;
1239 		fa->maxargs = 9;
1240 		fa->evalargs = N; // FIXME: or 0?
1241 		fa->ensure = 0;
1242 
1243 		macros.push_back(args[1]);
1244 		func_map[args[0]] = fa;
1245 		break;
1246 	    }
1247 	    case CMD_defaultop:
1248 		if (default_op == Xapian::Query::OP_AND) {
1249 		    value = "and";
1250 		} else {
1251 		    value = "or";
1252 		}
1253 		break;
1254 	    case CMD_div: {
1255 		int denom = string_to_int(args[1]);
1256 		if (denom == 0) {
1257 		    value = "divide by 0";
1258 		} else {
1259 		    value = str(string_to_int(args[0]) /
1260 				string_to_int(args[1]));
1261 		}
1262 		break;
1263 	    }
1264 	    case CMD_eq:
1265 		if (args[0] == args[1]) value = "true";
1266 		break;
1267 	    case CMD_emptydocs: {
1268 		string t;
1269 		if (!args.empty())
1270 		    t = args[0];
1271 		Xapian::PostingIterator i;
1272 		for (i = db.postlist_begin(t); i != db.postlist_end(t); ++i) {
1273 		    if (i.get_doclength() != 0) continue;
1274 		    if (!value.empty()) value += '\t';
1275 		    value += str(*i);
1276 		}
1277 		break;
1278 	    }
1279 	    case CMD_env: {
1280 		char *env = getenv(args[0].c_str());
1281 		if (env != NULL) value = env;
1282 		break;
1283 	    }
1284 	    case CMD_error:
1285 		if (error_msg.empty() && enquire == NULL && !dbname.empty()) {
1286 		    error_msg = "Database `" + dbname + "' couldn't be opened";
1287 		}
1288 		value = error_msg;
1289 		break;
1290 	    case CMD_field: {
1291 		Xapian::docid did = q0;
1292 		if (args.size() > 1) did = string_to_int(args[1]);
1293 		value = fields.get_field(did, args[0]);
1294 		break;
1295 	    }
1296 	    case CMD_filesize: {
1297 		// FIXME: rounding?  i18n?
1298 		int size = string_to_int(args[0]);
1299 		int intpart = size;
1300 		int fraction = -1;
1301 		const char * format = 0;
1302 		if (size < 0) {
1303 		    // Negative size -> empty result.
1304 		} else if (size == 1) {
1305 		    format = "%d byte";
1306 		} else if (size < 1024) {
1307 		    format = "%d bytes";
1308 		} else {
1309 		    if (size < 1024*1024) {
1310 			format = "%d.%cK";
1311 		    } else {
1312 			size /= 1024;
1313 			if (size < 1024*1024) {
1314 			    format = "%d.%cM";
1315 			} else {
1316 			    size /= 1024;
1317 			    format = "%d.%cG";
1318 			}
1319 		    }
1320 		    intpart = unsigned(size) / 1024;
1321 		    fraction = unsigned(size) % 1024;
1322 		}
1323 		if (format) {
1324 		    char buf[200];
1325 		    int len;
1326 		    if (fraction == -1) {
1327 			len = my_snprintf(buf, sizeof(buf), format, intpart);
1328 		    } else {
1329 			fraction = (fraction * 10 / 1024) + '0';
1330 			len = my_snprintf(buf, sizeof(buf), format, intpart, fraction);
1331 		    }
1332 		    if (len < 0 || (unsigned)len > sizeof(buf)) len = sizeof(buf);
1333 		    value.assign(buf, len);
1334 		}
1335 		break;
1336 	    }
1337 	    case CMD_filters:
1338 		value = filters;
1339 		break;
1340 	    case CMD_filterterms: {
1341 		Xapian::TermIterator term = db.allterms_begin();
1342 		term.skip_to(args[0]);
1343 		while (term != db.allterms_end()) {
1344 		    string t = *term;
1345 		    if (!startswith(t, args[0])) break;
1346 		    value += t;
1347 		    value += '\t';
1348 		    ++term;
1349 		}
1350 
1351 		if (!value.empty()) value.erase(value.size() - 1);
1352 		break;
1353 	    }
1354 	    case CMD_find: {
1355 		string l = args[0], s = args[1];
1356 		string::size_type i = 0, j = 0;
1357 		size_t count = 0;
1358 		while (j != l.size()) {
1359 		    j = l.find('\t', i);
1360 		    if (j == string::npos) j = l.size();
1361 		    if (j - i == s.length()) {
1362 			if (memcmp(s.data(), l.data() + i, j - i) == 0) {
1363 			    value = str(count);
1364 			    break;
1365 			}
1366 		    }
1367 		    ++count;
1368 		    i = j + 1;
1369 		}
1370 		break;
1371 	    }
1372 	    case CMD_fmt:
1373 		value = fmtname;
1374 		break;
1375 	    case CMD_freq:
1376 		try {
1377 		    value = str(mset.get_termfreq(args[0]));
1378 		} catch (const Xapian::InvalidOperationError&) {
1379 		    // An MSet will raise this error if it's empty and not
1380 		    // associated with a search.
1381 		    value = str(db.get_termfreq(args[0]));
1382 		}
1383 		break;
1384             case CMD_ge:
1385 		if (string_to_int(args[0]) >= string_to_int(args[1]))
1386 		    value = "true";
1387 		break;
1388             case CMD_gt:
1389 		if (string_to_int(args[0]) > string_to_int(args[1]))
1390 		    value = "true";
1391 		break;
1392 	    case CMD_highlight: {
1393 		string bra, ket;
1394 		if (args.size() > 2) {
1395 		    bra = args[2];
1396 		    if (args.size() > 3) {
1397 			ket = args[3];
1398 		    } else {
1399 			string::const_iterator i;
1400 			i = find_if(bra.begin() + 2, bra.end(), p_nottag);
1401 			ket = "</";
1402 			ket.append(bra, 1, i - bra.begin() - 1);
1403 			ket += '>';
1404 		    }
1405 		}
1406 
1407 		value = html_highlight(args[0], args[1], bra, ket);
1408 		break;
1409 	    }
1410 	    case CMD_hit:
1411 		// 0-based mset index
1412 		value = str(hit_no);
1413 		break;
1414 	    case CMD_hitlist:
1415 #if 0
1416 		const char *q;
1417 		int ch;
1418 
1419 		url_query_string = "?DB=";
1420 		url_query_string += dbname;
1421 		url_query_string += "&P=";
1422 		q = query_string.c_str();
1423 		while ((ch = *q++) != '\0') {
1424 		    switch (ch) {
1425 		     case '+':
1426 			url_query_string += "%2b";
1427 			break;
1428 		     case '"':
1429 			url_query_string += "%22";
1430 			break;
1431 		     case ' ':
1432 			ch = '+';
1433 			/* fall through */
1434 		     default:
1435 			url_query_string += ch;
1436 		    }
1437 		}
1438 	        // add any boolean terms
1439 		for (FMCI i = filter_map.begin(); i != filter_map.end(); i++) {
1440 		    url_query_string += "&B=";
1441 		    url_query_string += i->second;
1442 		}
1443 #endif
1444 		for (hit_no = topdoc; hit_no < last; hit_no++)
1445 		    value += print_caption(args[0], param);
1446 		hit_no = 0;
1447 		break;
1448 	    case CMD_hitsperpage:
1449 		value = str(hits_per_page);
1450 		break;
1451 	    case CMD_hostname: {
1452 	        value = args[0];
1453 		// remove URL scheme and/or path
1454 		string::size_type i = value.find("://");
1455 		if (i == string::npos) i = 0; else i += 3;
1456 		value = value.substr(i, value.find('/', i) - i);
1457 		// remove user@ or user:password@
1458 		i = value.find('@');
1459 		if (i != string::npos) value.erase(0, i + 1);
1460 		// remove :port
1461 		i = value.find(':');
1462 		if (i != string::npos) value.resize(i);
1463 		break;
1464 	    }
1465 	    case CMD_html:
1466 	        value = html_escape(args[0]);
1467 		break;
1468 	    case CMD_htmlstrip:
1469 	        value = html_strip(args[0]);
1470 		break;
1471 	    case CMD_httpheader:
1472 		if (!suppress_http_headers) {
1473 		    cout << args[0] << ": " << args[1] << endl;
1474 		    if (!set_content_type && args[0].length() == 12 &&
1475 			    strcasecmp(args[0].c_str(), "Content-Type") == 0) {
1476 			set_content_type = true;
1477 		    }
1478 		}
1479 	        break;
1480 	    case CMD_id:
1481 		// document id
1482 		value = str(q0);
1483 		break;
1484 	    case CMD_if:
1485 		if (!args[0].empty())
1486 		    value = eval(args[1], param);
1487 		else if (args.size() > 2)
1488 		    value = eval(args[2], param);
1489 		break;
1490 	    case CMD_include:
1491 	        value = eval_file(args[0]);
1492 	        break;
1493 	    case CMD_last:
1494 		value = str(last);
1495 		break;
1496 	    case CMD_lastpage: {
1497 		int l = mset.get_matches_estimated();
1498 		if (l > 0) l = (l - 1) / hits_per_page + 1;
1499 		value = str(l);
1500 		break;
1501 	    }
1502             case CMD_le:
1503 		if (string_to_int(args[0]) <= string_to_int(args[1]))
1504 		    value = "true";
1505 		break;
1506             case CMD_length:
1507 		if (args[0].empty()) {
1508 		    value = "0";
1509 		} else {
1510 		    size_t length = count(args[0].begin(), args[0].end(), '\t');
1511 		    value = str(length + 1);
1512 		}
1513 		break;
1514 	    case CMD_list: {
1515 		if (!args[0].empty()) {
1516 		    string pre, inter, interlast, post;
1517 		    switch (args.size()) {
1518 		     case 2:
1519 			inter = interlast = args[1];
1520 			break;
1521 		     case 3:
1522 			inter = args[1];
1523 			interlast = args[2];
1524 			break;
1525 		     case 4:
1526 			pre = args[1];
1527 			inter = interlast = args[2];
1528 			post = args[3];
1529 			break;
1530 		     case 5:
1531 			pre = args[1];
1532 			inter = args[2];
1533 			interlast = args[3];
1534 			post = args[4];
1535 			break;
1536 		    }
1537 		    value += pre;
1538 		    string list = args[0];
1539 		    string::size_type split = 0, split2;
1540 		    while ((split2 = list.find('\t', split)) != string::npos) {
1541 			if (split) value += inter;
1542 			value.append(list, split, split2 - split);
1543 			split = split2 + 1;
1544 		    }
1545 		    if (split) value += interlast;
1546 		    value.append(list, split, string::npos);
1547 		    value += post;
1548 		}
1549 		break;
1550 	    }
1551 	    case CMD_log: {
1552 		if (!vet_filename(args[0])) break;
1553 		string logfile = log_dir + args[0];
1554 	        int fd = open(logfile.c_str(), O_CREAT|O_APPEND|O_WRONLY, 0644);
1555 		if (fd == -1) break;
1556 		vector<string> noargs;
1557 		noargs.resize(1);
1558 		string line;
1559 		if (args.size() > 1) {
1560 		    line = args[1];
1561 		} else {
1562 		    line = DEFAULT_LOG_ENTRY;
1563 		}
1564 		line = eval(line, noargs);
1565 		line += '\n';
1566 		(void)write_all(fd, line.data(), line.length());
1567 		close(fd);
1568 		break;
1569 	    }
1570 	    case CMD_lookup: {
1571 		if (!vet_filename(args[0])) break;
1572 		string cdbfile = cdb_dir + args[0];
1573 	        int fd = open(cdbfile.c_str(), O_RDONLY);
1574 		if (fd == -1) break;
1575 
1576 		struct cdb cdb;
1577 		cdb_init(&cdb, fd);
1578 
1579 		if (cdb_find(&cdb, args[1].data(), args[1].length()) > 0) {
1580 		    size_t datalen = cdb_datalen(&cdb);
1581 		    const void *dat = cdb_get(&cdb, datalen, cdb_datapos(&cdb));
1582 		    if (q) {
1583 			value.assign(static_cast<const char *>(dat), datalen);
1584 		    }
1585 		}
1586 
1587 		cdb_free(&cdb);
1588 		close(fd); // FIXME: cache fds?
1589 		break;
1590 	    }
1591 	    case CMD_lower:
1592 		value = Xapian::Unicode::tolower(args[0]);
1593 		break;
1594             case CMD_lt:
1595 		if (string_to_int(args[0]) < string_to_int(args[1]))
1596 		    value = "true";
1597 		break;
1598 	    case CMD_map:
1599 		if (!args[0].empty()) {
1600 		    string l = args[0], pat = args[1];
1601 		    vector<string> new_args(param);
1602 		    string::size_type i = 0, j;
1603 		    while (true) {
1604 			j = l.find('\t', i);
1605 			new_args[0] = l.substr(i, j - i);
1606 			value += eval(pat, new_args);
1607 			if (j == string::npos) break;
1608 			value += '\t';
1609 			i = j + 1;
1610 		    }
1611 		}
1612 	        break;
1613 	    case CMD_max: {
1614 		vector<string>::const_iterator i = args.begin();
1615 		int val = string_to_int(*i++);
1616 		for (; i != args.end(); i++) {
1617 		    int x = string_to_int(*i);
1618 		    if (x > val) val = x;
1619 	        }
1620 		value = str(val);
1621 		break;
1622 	    }
1623 	    case CMD_min: {
1624 		vector<string>::const_iterator i = args.begin();
1625 		int val = string_to_int(*i++);
1626 		for (; i != args.end(); i++) {
1627 		    int x = string_to_int(*i);
1628 		    if (x < val) val = x;
1629 	        }
1630 		value = str(val);
1631 		break;
1632 	    }
1633 	    case CMD_msize:
1634 		// number of matches
1635 		value = str(mset.get_matches_estimated());
1636 		break;
1637 	    case CMD_msizeexact:
1638 		// is msize exact?
1639 		if (mset.get_matches_lower_bound()
1640 		    == mset.get_matches_upper_bound())
1641 		    value = "true";
1642 		break;
1643 	    case CMD_mod: {
1644 		int denom = string_to_int(args[1]);
1645 		if (denom == 0) {
1646 		    value = "divide by 0";
1647 		} else {
1648 		    value = str(string_to_int(args[0]) %
1649 				string_to_int(args[1]));
1650 		}
1651 		break;
1652 	    }
1653 	    case CMD_mul: {
1654 		vector<string>::const_iterator i = args.begin();
1655 		int total = string_to_int(*i++);
1656 		while (i != args.end())
1657 		    total *= string_to_int(*i++);
1658 		value = str(total);
1659 		break;
1660 	    }
1661 	    case CMD_muldiv: {
1662 		int denom = string_to_int(args[2]);
1663 		if (denom == 0) {
1664 		    value = "divide by 0";
1665 		} else {
1666 		    int num = string_to_int(args[0]) * string_to_int(args[1]);
1667 		    value = str(num / denom);
1668 		}
1669 		break;
1670 	    }
1671             case CMD_ne:
1672 		if (args[0] != args[1]) value = "true";
1673 		break;
1674 	    case CMD_nice: {
1675 		string::const_iterator i = args[0].begin();
1676 		int len = args[0].length();
1677 		while (len) {
1678 		    value += *i++;
1679 		    if (--len && len % 3 == 0) value += option["thousand"];
1680 		}
1681 		break;
1682 	    }
1683 	    case CMD_not:
1684 		if (args[0].empty()) value = "true";
1685 		break;
1686 	    case CMD_now: {
1687 		char buf[64];
1688 		my_snprintf(buf, sizeof(buf), "%lu", (unsigned long)time(NULL));
1689 		// MSVC's snprintf omits the zero byte if the string if
1690 		// sizeof(buf) long.
1691 		buf[sizeof(buf) - 1] = '\0';
1692 		value = buf;
1693 		break;
1694 	    }
1695 	    case CMD_opt:
1696 		if (args.size() == 2) {
1697 		    value = option[args[0] + "," + args[1]];
1698 		} else {
1699 		    value = option[args[0]];
1700 		}
1701 		break;
1702 	    case CMD_or: {
1703 		for (vector<string>::const_iterator i = args.begin();
1704 		     i != args.end(); i++) {
1705 		    value = eval(*i, param);
1706 		    if (!value.empty()) break;
1707 	        }
1708 		break;
1709 	    }
1710 	    case CMD_pack:
1711 		value = int_to_binary_string(string_to_int(args[0]));
1712 		break;
1713 	    case CMD_percentage:
1714 		// percentage score
1715 		value = str(percent);
1716 		break;
1717 	    case CMD_prettyterm:
1718 		value = pretty_term(args[0]);
1719 		break;
1720 	    case CMD_prettyurl:
1721 		value = args[0];
1722 		url_prettify(value);
1723 		break;
1724 	    case CMD_query:
1725 		value = query_string;
1726 		break;
1727 	    case CMD_querydescription:
1728 		value = query.get_description();
1729 		break;
1730 	    case CMD_queryterms:
1731 		value = queryterms;
1732 		break;
1733 	    case CMD_range: {
1734 		int start = string_to_int(args[0]);
1735 		int end = string_to_int(args[1]);
1736 	        while (start <= end) {
1737 		    value += str(start);
1738 		    if (start < end) value += '\t';
1739 		    start++;
1740 		}
1741 		break;
1742 	    }
1743 	    case CMD_record: {
1744 		int id = q0;
1745 		if (!args.empty()) id = string_to_int(args[0]);
1746 		value = db.get_document(id).get_data();
1747 		break;
1748 	    }
1749 	    case CMD_relevant: {
1750 		// document id if relevant; empty otherwise
1751 		int id = q0;
1752 		if (!args.empty()) id = string_to_int(args[0]);
1753 		map<Xapian::docid, bool>::iterator i = ticked.find(id);
1754 		if (i != ticked.end()) {
1755 		    i->second = false; // icky side-effect
1756 		    value = str(id);
1757 		}
1758 		break;
1759 	    }
1760 	    case CMD_relevants:	{
1761 		for (map <Xapian::docid, bool>::const_iterator i = ticked.begin();
1762 		     i != ticked.end(); i++) {
1763 		    if (i->second) {
1764 			value += str(i->first);
1765 			value += '\t';
1766 		    }
1767 		}
1768 		if (!value.empty()) value.erase(value.size() - 1);
1769 		break;
1770 	    }
1771 	    case CMD_score:
1772 	        // Score (0 to 10)
1773 		value = str(percent / 10);
1774 		break;
1775 	    case CMD_set:
1776 		option[args[0]] = args[1];
1777 		break;
1778 	    case CMD_setmap: {
1779 		string base = args[0] + ',';
1780 		if (args.size() % 2 != 1)
1781 		    throw string("$setmap requires an odd number of arguments");
1782 		for (unsigned int i = 1; i + 1 < args.size(); i += 2) {
1783 		    option[base + args[i]] = args[i + 1];
1784 		}
1785 		break;
1786 	    }
1787 	    case CMD_setrelevant: {
1788 		string::size_type i = 0, j;
1789 		while (true) {
1790 		    j = args[0].find_first_not_of("0123456789", i);
1791 		    Xapian::docid id = atoi(args[0].substr(i, j - i).c_str());
1792 		    if (id) {
1793 			rset.add_document(id);
1794 			ticked[id] = true;
1795 		    }
1796 		    if (j == string::npos) break;
1797 		    i = j + 1;
1798 		}
1799 		break;
1800 	    }
1801 	    case CMD_slice: {
1802 		string list = args[0], pos = args[1];
1803 		vector<string> items;
1804 		string::size_type i = 0, j;
1805 		while (true) {
1806 		    j = list.find('\t', i);
1807 		    items.push_back(list.substr(i, j - i));
1808 		    if (j == string::npos) break;
1809 		    i = j + 1;
1810 		}
1811 		i = 0;
1812 		bool have_added = false;
1813 		while (true) {
1814 		    j = pos.find('\t', i);
1815 		    int item = string_to_int(pos.substr(i, j - i));
1816 		    if (item >= 0 && size_t(item) < items.size()) {
1817 			if (have_added) value += '\t';
1818 			value += items[item];
1819 			have_added = true;
1820 		    }
1821 		    if (j == string::npos) break;
1822 		    i = j + 1;
1823 		}
1824 	        break;
1825 	    }
1826 	    case CMD_split: {
1827 		string split;
1828 		if (args.size() == 1) {
1829 		    split = " ";
1830 		    value = args[0];
1831 		} else {
1832 		    split = args[0];
1833 		    value = args[1];
1834 		}
1835 		string::size_type i = 0;
1836 		while (true) {
1837 		    if (split.empty()) {
1838 			++i;
1839 			if (i >= value.size()) break;
1840 		    } else {
1841 			i = value.find(split, i);
1842 			if (i == string::npos) break;
1843 		    }
1844 		    value.replace(i, split.size(), 1, '\t');
1845 		    ++i;
1846 		}
1847 	        break;
1848 	    }
1849 	    case CMD_stoplist: {
1850 		Xapian::TermIterator i = qp.stoplist_begin();
1851 		Xapian::TermIterator end = qp.stoplist_end();
1852 		while (i != end) {
1853 		    if (!value.empty()) value += '\t';
1854 		    value += *i;
1855 		    ++i;
1856 		}
1857 		break;
1858 	    }
1859 	    case CMD_sub:
1860 		value = str(string_to_int(args[0]) - string_to_int(args[1]));
1861 		break;
1862 	    case CMD_substr: {
1863 		int start = string_to_int(args[1]);
1864 		if (start < 0) {
1865 		    if (static_cast<size_t>(-start) >= args[0].size()) {
1866 			start = 0;
1867 		    } else {
1868 			start = static_cast<int>(args[0].size()) + start;
1869 		    }
1870 		} else {
1871 		    if (static_cast<size_t>(start) >= args[0].size()) break;
1872 		}
1873 		size_t len = string::npos;
1874 		if (args.size() > 2) {
1875 		    int int_len = string_to_int(args[2]);
1876 		    if (int_len >= 0) {
1877 			len = size_t(int_len);
1878 		    } else {
1879 			len = args[0].size() - start;
1880 			if (static_cast<size_t>(-int_len) >= len) {
1881 			    len = 0;
1882 			} else {
1883 			    len -= static_cast<size_t>(-int_len);
1884 			}
1885 		    }
1886 		}
1887 		value = args[0].substr(start, len);
1888 		break;
1889 	    }
1890 	    case CMD_suggestion:
1891 		value = qp.get_corrected_query_string();
1892 		break;
1893 	    case CMD_terms:
1894 		if (enquire) {
1895 		    // list of matching terms
1896 		    Xapian::TermIterator term = enquire->get_matching_terms_begin(q0);
1897 		    while (term != enquire->get_matching_terms_end(q0)) {
1898 			// check term was in the typed query so we ignore
1899 			// boolean filter terms
1900 			if (termset.find(*term) != termset.end()) {
1901 			    value += *term;
1902 			    value += '\t';
1903 			}
1904 			++term;
1905 		    }
1906 
1907 		    if (!value.empty()) value.erase(value.size() - 1);
1908 		}
1909 		break;
1910 	    case CMD_thispage:
1911 		value = str(topdoc / hits_per_page + 1);
1912 		break;
1913 	    case CMD_time:
1914 		if (secs >= 0) {
1915 		    char buf[64];
1916 		    my_snprintf(buf, sizeof(buf), "%.6f", secs);
1917 		    // MSVC's snprintf omits the zero byte if the string if
1918 		    // sizeof(buf) long.
1919 		    buf[sizeof(buf) - 1] = '\0';
1920 		    value = buf;
1921 		}
1922 		break;
1923 	    case CMD_topdoc:
1924 		// first document on current page of hit list (counting from 0)
1925 		value = str(topdoc);
1926 		break;
1927 	    case CMD_topterms:
1928 		if (enquire) {
1929 		    int howmany = 16;
1930 		    if (!args.empty()) howmany = string_to_int(args[0]);
1931 		    if (howmany < 0) howmany = 0;
1932 
1933 		    // List of expand terms
1934 		    Xapian::ESet eset;
1935 		    OmegaExpandDecider decider(db, &termset);
1936 
1937 		    if (!rset.empty()) {
1938 			set_expansion_scheme(*enquire, option);
1939 			eset = enquire->get_eset(howmany * 2, rset, 0,
1940 						 expand_param_k, &decider);
1941 		    } else if (mset.size()) {
1942 			// invent an rset
1943 			Xapian::RSet tmp;
1944 
1945 			int c = 5;
1946 			// FIXME: what if mset does not start at first match?
1947 			Xapian::MSetIterator m = mset.begin();
1948 			for ( ; m != mset.end(); ++m) {
1949 			    tmp.add_document(*m);
1950 			    if (--c == 0) break;
1951 			}
1952 
1953 			set_expansion_scheme(*enquire, option);
1954 			eset = enquire->get_eset(howmany * 2, tmp, 0,
1955 						 expand_param_k, &decider);
1956 		    }
1957 
1958 		    // Don't show more than one word with the same stem.
1959 		    set<string> stems;
1960 		    Xapian::ESetIterator i;
1961 		    for (i = eset.begin(); i != eset.end(); ++i) {
1962 			string term(*i);
1963 			string stem = (*stemmer)(term);
1964 			if (stems.find(stem) != stems.end()) continue;
1965 			stems.insert(stem);
1966 			value += term;
1967 			value += '\t';
1968 			if (--howmany == 0) break;
1969 		    }
1970 		    if (!value.empty()) value.erase(value.size() - 1);
1971 		}
1972 		break;
1973 	    case CMD_transform:
1974 		omegascript_transform(value, args);
1975 		break;
1976 	    case CMD_uniq: {
1977 		const string &list = args[0];
1978 		if (list.empty()) break;
1979 		string::size_type split = 0, split2;
1980 		string prev;
1981 		do {
1982 		    split2 = list.find('\t', split);
1983 		    string item = list.substr(split, split2 - split);
1984 		    if (split == 0) {
1985 			value = item;
1986 		    } else if (item != prev) {
1987 			value += '\t';
1988 			value += item;
1989 		    }
1990 		    prev = item;
1991 		    split = split2 + 1;
1992 		} while (split2 != string::npos);
1993 		break;
1994 	    }
1995 	    case CMD_unpack:
1996 		value = str(binary_string_to_int(args[0]));
1997 		break;
1998 	    case CMD_unstem: {
1999 		const string &term = args[0];
2000 		Xapian::TermIterator i = qp.unstem_begin(term);
2001 		Xapian::TermIterator end = qp.unstem_end(term);
2002 		while (i != end) {
2003 		    if (!value.empty()) value += '\t';
2004 		    value += *i;
2005 		    ++i;
2006 		}
2007 		break;
2008 	    }
2009 	    case CMD_upper:
2010 		value = Xapian::Unicode::toupper(args[0]);
2011 		break;
2012 	    case CMD_url:
2013 		url_encode(value, args[0]);
2014 		break;
2015 	    case CMD_value: {
2016 		Xapian::docid id = q0;
2017 		Xapian::valueno value_no = string_to_int(args[0]);
2018 		if (args.size() > 1) id = string_to_int(args[1]);
2019 		value = db.get_document(id).get_value(value_no);
2020 		break;
2021 	    }
2022 	    case CMD_version:
2023 		value = PACKAGE_STRING;
2024 		break;
2025 	    case CMD_weight:
2026 		value = double_to_string(weight);
2027 		break;
2028 	    default: {
2029 		args.insert(args.begin(), param[0]);
2030 		int macro_no = func->second->tag - CMD_MACRO;
2031 		assert(macro_no >= 0 && (unsigned int)macro_no < macros.size());
2032 		// throw "Unknown function `" + var + "'";
2033 		value = eval(macros[macro_no], args);
2034 		break;
2035 	    }
2036 	}
2037         res += value;
2038     } catch (const Xapian::Error & e) {
2039 	// FIXME: this means we only see the most recent error in $error
2040 	// - is that the best approach?
2041 	error_msg = e.get_msg();
2042     }
2043 
2044     res.append(fmt, p, string::npos);
2045     return res;
2046 }
2047 
2048 static string
eval_file(const string & fmtfile)2049 eval_file(const string &fmtfile)
2050 {
2051     string err;
2052     if (vet_filename(fmtfile)) {
2053 	string file = template_dir + fmtfile;
2054 	string fmt;
2055 	if (load_file(file, fmt)) {
2056 	    vector<string> noargs;
2057 	    noargs.resize(1);
2058 	    return eval(fmt, noargs);
2059 	}
2060 	err = strerror(errno);
2061     } else {
2062 	err = "name contains `..'";
2063     }
2064 
2065     // FIXME: report why!
2066     string msg = string("Couldn't read format template `") + fmtfile + '\'';
2067     if (!err.empty()) msg += " (" + err + ')';
2068     throw msg;
2069 }
2070 
2071 extern string
pretty_term(string term)2072 pretty_term(string term)
2073 {
2074     // Just leave empty strings and single characters alone.
2075     if (term.length() <= 1) return term;
2076 
2077     // Assume unprefixed terms are unstemmed.
2078     if (!C_isupper(term[0])) return term;
2079 
2080     // FIXME: keep this for now in case people are still generating 'R' terms?
2081     // But if we assumed unprefixed terms are unstemmed, what use is this?
2082     if (term[0] == 'R') {
2083 	term.erase(0, 1);
2084 	term[0] = C_toupper(term[0]);
2085 	return term;
2086     }
2087 
2088     // Handle stemmed terms.
2089     bool stemmed = (term[0] == 'Z');
2090     if (stemmed) {
2091 	// First of all, check if a term in the query stemmed to this one.
2092 	Xapian::TermIterator u = qp.unstem_begin(term);
2093 	// There might be multiple words with the same stem, but we only want
2094 	// one so just take the first.
2095 	if (u != qp.unstem_end(term)) return *u;
2096 
2097 	// Remove the 'Z'.
2098 	term.erase(0, 1);
2099     }
2100 
2101     bool add_quotes = false;
2102 
2103     // Check if the term has a prefix.
2104     if (C_isupper(term[0])) {
2105 	// See if we have this prefix in the termprefix_to_userprefix map.  If
2106 	// so, just reverse the mapping (e.g. turn 'Sfish' into 'subject:fish').
2107 	string prefix;
2108 	size_t prefix_len = prefix_from_term(prefix, term);
2109 
2110 	map<string, string>::const_iterator i;
2111 	i = termprefix_to_userprefix.find(prefix);
2112 	if (i != termprefix_to_userprefix.end()) {
2113 	    string user_prefix = i->second;
2114 	    user_prefix += ':';
2115 	    term.replace(0, prefix_len, user_prefix);
2116 	} else {
2117 	    // We don't have a prefix mapping for this, so just set a flag to
2118 	    // add quotes around the term.
2119 	    add_quotes = true;
2120 	}
2121     }
2122 
2123     if (stemmed) term += '.';
2124 
2125     if (add_quotes) {
2126 	term.insert(0, "\"");
2127 	term.append("\"");
2128     }
2129 
2130     return term;
2131 }
2132 
2133 static string
print_caption(const string & fmt,const vector<string> & param)2134 print_caption(const string &fmt, const vector<string> &param)
2135 {
2136     q0 = *(mset[hit_no]);
2137 
2138     weight = mset[hit_no].get_weight();
2139     percent = mset.convert_to_percent(mset[hit_no]);
2140     collapsed = mset[hit_no].get_collapse_count();
2141 
2142     return eval(fmt, param);
2143 }
2144 
2145 void
parse_omegascript()2146 parse_omegascript()
2147 {
2148     try {
2149 	const char * p = getenv("SERVER_PROTOCOL");
2150 	if (p && strcmp(p, "INCLUDED") == 0) {
2151 	    // We're being included in another page, so suppress headers.
2152 	    suppress_http_headers = true;
2153 	}
2154 
2155 	string output = eval_file(fmtname);
2156 	if (!set_content_type && !suppress_http_headers) {
2157 	    cout << "Content-Type: text/html" << endl;
2158 	    set_content_type = true;
2159 	}
2160 	if (!suppress_http_headers) cout << endl;
2161 	cout << output;
2162     } catch (...) {
2163 	// Ensure the headers have been output so that any exception gets
2164 	// reported rather than giving a server error.
2165 	if (!set_content_type && !suppress_http_headers) {
2166 	    cout << "Content-Type: text/html" << endl;
2167 	    set_content_type = true;
2168 	}
2169 	if (!suppress_http_headers) cout << endl;
2170 	throw;
2171     }
2172 }
2173 
2174 static void
ensure_query_parsed()2175 ensure_query_parsed()
2176 {
2177     if (query_parsed) return;
2178     query_parsed = true;
2179 
2180     MCI val;
2181     pair<MCI, MCI> g;
2182 
2183     // Should we discard the existing R-set recorded in R CGI parameters?
2184     bool discard_rset = true;
2185 
2186     // Should we force the first page of hits (and ignore [ > < # and TOPDOC
2187     // CGI parameters)?
2188     bool force_first_page = true;
2189 
2190     string v;
2191     // get list of terms from previous iteration of query
2192     val = cgi_params.find("xP");
2193     if (val == cgi_params.end()) val = cgi_params.find("OLDP");
2194     if (val != cgi_params.end()) {
2195 	v = val->second;
2196     } else {
2197 	// if xP not given, default to keeping the rset and don't force page 1
2198 	discard_rset = false;
2199 	force_first_page = false;
2200     }
2201     querytype result = set_probabilistic(v);
2202     switch (result) {
2203 	case BAD_QUERY:
2204 	    break;
2205 	case NEW_QUERY:
2206 	    break;
2207 	case SAME_QUERY:
2208         case EXTENDED_QUERY:
2209 	    // If we've changed database, force the first page of hits
2210 	    // and discard the R-set (since the docids will have changed)
2211 	    val = cgi_params.find("xDB");
2212 	    if (val != cgi_params.end() && val->second != dbname) break;
2213 	    if (result == SAME_QUERY && force_first_page) {
2214 		val = cgi_params.find("xFILTERS");
2215 		if (val != cgi_params.end() && val->second != filters) {
2216 		    // Filters have changed since last query.
2217 		} else {
2218 		    force_first_page = false;
2219 		}
2220 	    }
2221 	    discard_rset = false;
2222 	    break;
2223     }
2224 
2225     if (!force_first_page) {
2226 	// Work out which mset element is the first hit we want
2227 	// to display
2228 	val = cgi_params.find("TOPDOC");
2229 	if (val != cgi_params.end()) {
2230 	    topdoc = atol(val->second.c_str());
2231 	}
2232 
2233 	// Handle next, previous, and page links
2234 	if (cgi_params.find(">") != cgi_params.end()) {
2235 	    topdoc += hits_per_page;
2236 	} else if (cgi_params.find("<") != cgi_params.end()) {
2237 	    if (topdoc >= hits_per_page)
2238 		topdoc -= hits_per_page;
2239 	    else
2240 		topdoc = 0;
2241 	} else if ((val = cgi_params.find("[")) != cgi_params.end() ||
2242 		   (val = cgi_params.find("#")) != cgi_params.end()) {
2243 	    long page = atol(val->second.c_str());
2244 	    // Do something sensible for page 0 (we count pages from 1).
2245 	    if (page == 0) page = 1;
2246 	    topdoc = (page - 1) * hits_per_page;
2247 	}
2248 
2249 	// raw_search means don't snap TOPDOC to a multiple of HITSPERPAGE.
2250 	// Normally we snap TOPDOC like this so that things work nicely if
2251 	// HITSPERPAGE is in a <select> or on radio buttons.  If we're
2252 	// postprocessing the output of omega and want variable sized pages,
2253 	// this is unhelpful.
2254 	bool raw_search = false;
2255 	val = cgi_params.find("RAWSEARCH");
2256 	if (val != cgi_params.end()) {
2257 	    raw_search = bool(atol(val->second.c_str()));
2258 	}
2259 
2260 	if (!raw_search) topdoc = (topdoc / hits_per_page) * hits_per_page;
2261     }
2262 
2263     if (!discard_rset) {
2264 	// put documents marked as relevant into the rset
2265 	g = cgi_params.equal_range("R");
2266 	for (MCI i = g.first; i != g.second; i++) {
2267 	    const string & value = i->second;
2268 	    for (size_t j = 0; j < value.size(); j = value.find('.', j)) {
2269 		while (value[j] == '.') ++j;
2270 		Xapian::docid d = atoi(value.c_str() + j);
2271 		if (d) {
2272 		    rset.add_document(d);
2273 		    ticked[d] = true;
2274 		}
2275 	    }
2276 	}
2277     }
2278 }
2279 
2280 // run query if we haven't already
2281 static void
ensure_match()2282 ensure_match()
2283 {
2284     if (done_query) return;
2285 
2286     secs = RealTime::now();
2287     run_query();
2288     if (secs != -1)
2289 	secs = RealTime::now() - secs;
2290 
2291     done_query = true;
2292     last = mset.get_matches_lower_bound();
2293     if (last == 0) {
2294 	// Otherwise topdoc ends up being -6 if it's non-zero!
2295 	topdoc = 0;
2296     } else {
2297 	if (topdoc >= last)
2298 	    topdoc = ((last - 1) / hits_per_page) * hits_per_page;
2299 	// last is the count of documents up to the end of the current page
2300 	// (as returned by $last)
2301 	if (topdoc + hits_per_page < last)
2302 	    last = topdoc + hits_per_page;
2303     }
2304 }
2305 
2306 // OmegaExpandDecider methods.
2307 
OmegaExpandDecider(const Xapian::Database & db_,set<string> * querytermset)2308 OmegaExpandDecider::OmegaExpandDecider(const Xapian::Database & db_,
2309 				       set<string> * querytermset)
2310     : db(db_)
2311 {
2312     // We'll want the stemmer for testing matches anyway.
2313     if (!stemmer)
2314 	stemmer = new Xapian::Stem(option["stemmer"]);
2315     if (querytermset) {
2316 	set<string>::const_iterator i;
2317 	for (i = querytermset->begin(); i != querytermset->end(); ++i) {
2318 	    string term(*i);
2319 	    if (term.empty()) continue;
2320 
2321 	    unsigned char ch = term[0];
2322 	    bool stemmed = (ch == 'Z');
2323 	    if (stemmed) {
2324 	       term.erase(0, 1);
2325 	       if (term.empty()) continue;
2326 	       ch = term[0];
2327 	    }
2328 
2329 	    if (C_isupper(ch)) {
2330 		string prefix;
2331 		size_t prefix_len = prefix_from_term(prefix, term);
2332 		term.erase(0, prefix_len);
2333 	    }
2334 
2335 	    if (!stemmed) term = (*stemmer)(term);
2336 
2337 	    exclude_stems.insert(term);
2338 	}
2339     }
2340 }
2341 
2342 bool
operator ()(const string & term) const2343 OmegaExpandDecider::operator()(const string & term) const
2344 {
2345     unsigned char ch = term[0];
2346 
2347     // Reject terms with a prefix.
2348     if (C_isupper(ch)) return false;
2349 
2350     {
2351 	MyStopper stopper;
2352 	// Don't suggest stopwords.
2353 	if (stopper(term)) return false;
2354     }
2355 
2356     // Reject small numbers.
2357     if (term.size() < 4 && C_isdigit(ch)) return false;
2358 
2359     // Reject terms containing a space.
2360     if (term.find(' ') != string::npos) return false;
2361 
2362     // Skip terms with stems in the exclude_stems set, to avoid suggesting
2363     // terms which are already in the query in some form.
2364     string stem = (*stemmer)(term);
2365     if (exclude_stems.find(stem) != exclude_stems.end())
2366 	return false;
2367 
2368     // Ignore terms that only occur once (hapaxes) since they aren't
2369     // useful for finding related documents - they only occur in a
2370     // document that's already been marked as relevant.
2371     // FIXME: add an expand option to ignore terms where
2372     // termfreq == rtermfreq.
2373     if (db.get_termfreq(term) <= 1) return false;
2374 
2375     return true;
2376 }
2377