1 //
2 // defaults.cc
3 //
4 // defaults: default values for the ht programs through the
5 //           HtConfiguration class
6 //
7 // Part of the ht://Dig package   <http://www.htdig.org/>
8 // Copyright (c) 1995-2004 The ht://Dig Group
9 // For copyright details, see the file COPYING in your distribution
10 // or the GNU Library General Public License (LGPL) version 2 or later
11 // <http://www.gnu.org/copyleft/lgpl.html>
12 //
13 // $Id: defaults.cc,v 1.112 2004/06/12 13:39:12 lha Exp $
14 //
15 
16 #ifdef HAVE_CONFIG_H
17 #include "htconfig.h"
18 #endif /* HAVE_CONFIG_H */
19 
20 #include "HtConfiguration.h"
21 
22 // Fields and their values:
23 //	Attribute name
24 //	Default value ("" becomes "no default" in .html docs)
25 //	Type (boolean, number, integer, string, string list, quoted string list,
26 //				pattern list)
27 //	Commands using attribute (all, htdig, htsearch, htfuzzy,
28 //				htdump, htload, htnotify, htpurge)
29 //	Block (Global, Server, URL)
30 //	Versions for which attribute is present
31 //	Class	(Extra Output, External:Parsers, External:Protocols,
32 //		File Layout,
33 //		Indexing:Connection, Indexing:Out, Indexing:What, Indexing:Where,
34 //		Presentation:Files, Presentation:How, Presentation:Text,
35 //		Searching:Method, Searching:Ranking, Searching:UI,
36 //		URLs)
37 //	Example
38 //	Description
39 
40 ConfigDefaults	defaults[] =
41 {
42 
43 { "accents_db", "${database_base}.accents.db",  \
44 	"string", "htfuzzy htsearch", "", "all", "File Layout", "accents_db: ${database_base}.uml.db", " \
45 	The database file used for the fuzzy \"accents\" search \
46 	algorithm. This database is created by \
47 	<a href=\"htfuzzy.html\">htfuzzy</a> and used by \
48 	<a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \
49 " }, \
50 { "accept_language", "",  \
51 	"string list", "htdig", "Server", "3.2.0b4", "Indexing:Out", "accept_language: en-us en it", " \
52 	This attribute allows you to restrict the set of natural languages \
53 	that are preferred as a response to an HTTP request performed by the \
54 	digger. This can be done by putting one or more language tags \
55 	(as defined by RFC 1766) in the preferred order, separated by spaces. \
56 	By doing this, when the server performs a content negotiation based \
57 	on the 'accept-language' given by the HTTP user agent, a different \
58 	content can be shown depending on the value of this attribute. If \
59 	set to an empty list, no language will be sent and the server default \
60 	will be returned. \
61 " }, \
62 { "add_anchors_to_excerpt", "true",  \
63 	"boolean", "htsearch", "", "3.1.0", "Presentation:How", "add_anchors_to_excerpt: no", " \
64 	If set to true, the first occurrence of each matched \
65 	word in the excerpt will be linked to the closest \
66 	anchor in the document. This only has effect if the \
67 	<strong>EXCERPT</strong> variable is used in the output \
68 	template and the excerpt is actually going to be displayed. \
69 " }, \
70 { "allow_double_slash", "false",  \
71 	"boolean", "htdig", "", "3.2.0b4", "Indexing:Out", "allow_double_slash: true", " \
72 	If set to true, strings of multiple slashes ('/') in URL paths \
73 	will be left intact, rather than being collapsed. This is necessary \
74 	for some search engine URLs which use slashes to separate fields rather \
75 	than to separate directory components.  However, it can lead to multiple database \
76 	entries referring to the same file, and it causes '/foo//../' to \
77 	be equivalent to '/foo/', rather than to '/'. \
78 " }, \
79 { "allow_in_form", "",  \
80 	"string list", "htsearch", "", "3.1.0", "Searching:UI", "allow_in_form: search_algorithm search_results_header", " \
81 	Allows the specified config file attributes to be specified \
82 	in search forms as separate fields. This could be used to \
83 	allow form writers to design their own headers and footers \
84 	and specify them in the search form. Another example would \
85 	be to offer a menu of search_algorithms in the form. \
86 	<table> \
87 	<tr> \
88 	<td nowrap> \
89 	<code> \
90 	&nbsp;&nbsp;&lt;SELECT NAME=\"search_algorithm\"&gt;<br> \
91 	&nbsp;&nbsp;&lt;OPTION VALUE=\"exact:1 prefix:0.6 synonyms:0.5 endings:0.1\" SELECTED&gt;fuzzy<br> \
92 	&nbsp;&nbsp;&lt;OPTION VALUE=\"exact:1\"&gt;exact<br> \
93 	&nbsp;&nbsp;&lt;/SELECT&gt; \
94 	</code></td> \
95 	</tr> \
96 	</table> \
97 	The general idea behind this is to make an input parameter out \
98 	of any configuration attribute that's not already automatically \
99 	handled by an input parameter. You can even make up your own \
100 	configuration attribute names, for purposes of passing data from \
101 	the search form to the results output. You're not restricted to \
102 	the existing attribute names. The attributes listed in the \
103 	allow_in_form list will be settable in the search form using \
104 	input parameters of the same name, and will be propagated to \
105 	the follow-up search form in the results template using template \
106 	variables of the same name in upper-case. \
107 	You can also make select lists out of any of these input \
108 	parameters, in the follow-up search form, using the \
109 	<a href=\"#build_select_lists\">build_select_lists</a> \
110 	configuration attribute. \
111 	<br>WARNING: Extreme care should be taken with this option, as \
112 	allowing CGI scripts to set file names can open security holes.\
113 " }, \
114 { "allow_numbers", "false",  \
115 	"boolean", "htdig htsearch", "", "all", "Indexing:What", "allow_numbers: true", " \
116 	If set to true, numbers are considered words. This \
117 	means that searches can be done on strings of digits as well as \
118 	regular words. All the same rules apply to numbers as \
119 	to words.  This does not cause numbers containing a decimal point or \
120 	commas to be treated as a single entity. \
121 	When allow_numbers is false, words are still \
122 	allowed to contain digits, but they must also contain at \
123 	least one alphabetic character or \
124 	<a href=\"#extra_word_characters\">extra word</a> character. \
125 	To disallow digits in words, add the digits to \
126 	<a href=\"#valid_punctuation\">valid_punctuation</a>. \
127 " }, \
128 { "allow_space_in_url", "false",  \
129 	"boolean", "htdig", "", "3.2.0b6", "Indexing:Where", "allow_space_in_url: true", " \
130 	If set to true, htdig will handle URLs that contain \
131 	embedded spaces. Technically, this is a violation of \
132 	RFC 2396, which says spaces should be stripped out \
133 	(as htdig does by default).  However, many web browsers \
134 	and HTML code generators violate this standard already, \
135 	so enabling this attribute allows htdig to handle these \
136 	non-compliant URLs.  Even with this attribute set, htdig \
137 	still strips out all white space (leading, trailing and \
138 	embedded), except that space characters embedded within \
139 	the URL will be encoded as %20. \
140 " }, \
141 { "allow_virtual_hosts", "true",  \
142 	"boolean", "htdig", "", "3.0.8b2", "Indexing:Where", "allow_virtual_hosts: false", " \
143 	If set to true, htdig will index virtual web sites as \
144 	expected. If false, all URL host names will be \
145 	normalized into whatever the DNS server claims the IP \
146 	address to map to. If this option is set to false, \
147 	there is no way to index either \"soft\" or \"hard\" \
148 	virtual web sites. \
149 " }, \
150 { "anchor_target", "",  \
151 	"string", "htsearch", "", "3.1.6", "Presentation:How", "anchor_target: body", " \
152 	When the first matched word in the excerpt is linked \
153 	to the closest anchor in the document, this string \
154 	can be set to specify a target in the link so the \
155 	resulting page is displayed in the desired frame. \
156 	This value will only be used if the \
157 	<a href=\"#add_anchors_to_excerpt\">add_anchors_to_excerpt</a> \
158 	attribute is set to true, the <strong>EXCERPT</strong> \
159 	variable is used in the output template and the \
160 	excerpt is actually displayed with a link. \
161 " }, \
162 { "any_keywords", "false",  \
163 	"boolean", "htsearch", "", "3.2.0b2", "Searching:Method", "any_keywords: yes", " \
164 	If set to true, the words in the <strong>keywords</strong> \
165 	input parameter in the search form will be joined with logical \
166 	ORs rather than ANDs, so that any of the words provided will do. \
167 	Note that this has nothing to do with limiting the search to \
168 	words in META keywords tags. See the <a href=\"hts_form.html\"> \
169 	search form</a> documentation for details on this. \
170 " }, \
171 { "author_factor", "1",  \
172 	"number", "htsearch", "", "3.2.0b4", "Searching:Ranking", "author_factor: 1", " \
173 	Weighting applied to words in a &lt;meta name=\"author\" ... &gt; \
174 	tag.<br> \
175 	See also <a href=\"#heading_factor\">heading_factor</a>. \
176 " }, \
177 { "authorization", "",  \
178 	"string", "htdig", "URL", "3.1.4", "Indexing:Out", "authorization: myusername:mypassword", " \
179 	This tells htdig to send the supplied \
180 	<em>username</em><strong>:</strong><em>password</em> with each HTTP request. \
181 	The credentials will be encoded using the \"Basic\" authentication \
182 	scheme. There <em>must</em> be a colon (:) between the username and \
183 	password.<br> \
184 	This attribute can also be specified on htdig's command line using \
185 	the -u option, and will be blotted out so it won't show up in a \
186 	process listing. If you use it directly in a configuration file, \
187 	be sure to protect it so it is readable only by you, and do not \
188 	use that same configuration file for htsearch. \
189 " }, \
190 { "backlink_factor", "0.1",  \
191 	"number", "htsearch", "", "3.1.0", "Searching:Ranking", "backlink_factor: 501.1", " \
192 	This is a weight of \"how important\" a page is, based on \
193 	the number of URLs pointing to it. It's actually \
194 	multiplied by the ratio of the incoming URLs (backlinks) \
195 	and outgoing URLs (links on the page), to balance out pages \
196 	with lots of links to pages that link back to them. The ratio \
197 	gives lower weight to \"link farms\", which often have many \
198 	links to them.  This factor can \
199 	be changed without changing the database in any way. \
200 	However, setting this value to something other than 0 \
201 	incurs a slowdown on search results. \
202 " }, \
203 { "bad_extensions", ".wav .gz .z .sit .au .zip .tar .hqx .exe .com .gif .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css",  \
204 	"string list", "htdig", "URL", "all", "Indexing:Where", "bad_extensions: .foo .bar .bad", " \
205 	This is a list of extensions on URLs which are \
206 	considered non-parsable. This list is used mainly to \
207 	supplement the MIME-types that the HTTP server provides \
208 	with documents. Some HTTP servers do not have a correct \
209 	list of MIME-types and so can advertise certain \
210 	documents as text while they are some binary format. \
211 	If the list is empty, then all extensions are acceptable, \
212 	provided they pass other criteria for acceptance or rejection. \
213 	See also <a href=\"#valid_extensions\">valid_extensions</a>. \
214 " }, \
215 { "bad_local_extensions", ".php .shtml .cgi",  \
216 	"string list", "htdig", "URL", "all", "Indexing:Where", "bad_local_extensions: .foo .bar .bad", " \
217 	This is a list of extensions on URLs which must be retrieved \
218 	using the URL's true transport mechanism (such as HTTP). \
219 	If <a href=\"#local_urls\">local_urls</a> is specified, URLs not \
220 	ending with these extensions may instead be retrieved through \
221 	the local filesystem for efficiency. \
222 " },
223 { "bad_querystr", "",  \
224 	"pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&amp;passwd=required", " \
225 	This is a list of CGI query strings to be excluded from \
226 	indexing. This can be used in conjunction with CGI-generated \
227 	portions of a website to control which pages are \
228 	indexed. \
229 " }, \
230 { "bad_word_list", "${common_dir}/bad_words",  \
231 	"string", "htdig htsearch", "", "all", "Indexing:What,Searching:Method", "bad_word_list: ${common_dir}/badwords.txt", " \
232 	This specifies a file which contains words which should \
233 	be excluded when digging or searching. This list should \
234 	include the most common words or other words that you \
235 	don't want to be able to search on (things like <em> \
236 	sex</em> or <em>smut</em> are examples of these.)<br> \
237 	The file should contain one word per line. A sample \
238 	bad words file is located in the <code>contrib/examples</code> \
239 	directory. \
240 " }, \
241 { "bin_dir", BIN_DIR,  \
242 	"string", "all", "", "all", "File Layout", "bin_dir: /usr/local/bin", " \
243 	This is the directory in which the executables \
244 	related to ht://Dig are installed. It is never used \
245 	directly by any of the programs, but other attributes \
246 	can be defined in terms of this one. \
247 	<p> \
248 	The default value of this attribute is determined at \
249 	compile time. \
250 	</p> \
251 " }, \
252 { "boolean_keywords", "and or not",  \
253 	"string list", "htsearch", "", "3.1.6", "Presentation:How", "boolean_keywords: et ou non", " \
254 	These three strings are used as the keywords used in \
255 	constructing the \
256 	<a href=\"hts_templates.html#LOGICAL_WORDS\">LOGICAL_WORDS</a> \
257 	template variable, \
258 	and in parsing the <a href=\"hts_form.html#words\">words</a> input \
259 	parameter when the <a href=\"hts_form.html#method\">method</a> \
260 	parameter or <a href=\"#match_method\">match_method</a> attribute \
261 	is set to <code>boolean</code>. \
262 	See also the \
263 	<a href=\"#boolean_syntax_errors\">boolean_syntax_errors</a> attribute. \
264 " },
265 { "boolean_syntax_errors", "Expected \
266 	'a search word, a quoted phrase or a boolean expression between ()' \
267 		'at the end' 'instead of' 'end of expression' quotes",  \
268 	"quoted string list", "htsearch", "", "3.1.6", "Presentation:How",
269 	"boolean_syntax_errors: Attendait \"un mot\" \"&agrave; la fin\" \
270 	\"au lieu de\" \"fin d'expression\" \"guillemet\"", " \
271 	These six strings are used as the keywords used to \
272 	construct various syntax error messages for errors encountered in \
273 	parsing the <a href=\"hts_form.html#words\">words</a> input \
274 	parameter when the <a href=\"hts_form.html#method\">method</a> parameter \
275 	or <a href=\"#match_method\">match_method</a> attribute \
276 	is set to <code>boolean</code>. \
277 	They are used in conjunction with the \
278 	<a href=\"#boolean_keywords\">boolean_keywords</a> attribute, and \
279 	comprise all \
280 	English-specific parts of these error messages.  The order in which \
281 	the strings are put together may not be ideal, or even grammatically \
282 	correct, for all languages, but they can be used to make fairly \
283 	intelligible messages in many languages. \
284 " },
285 { "build_select_lists", "",  \
286 	"quoted string list", "htsearch", "", "3.2.0b1", "Searching:UI", "build_select_lists: \
287 		MATCH_LIST matchesperpage matches_per_page_list \\<br> \
288 				1 1 1 matches_per_page \"Previous Amount\" \\<br> \
289 		RESTRICT_LIST,multiple restrict restrict_names 2 1 2 restrict \"\" \\<br> \
290 		FORMAT_LIST,radio format template_map 3 2 1 template_name \"\"", " \
291 	This list allows you to define any htsearch input parameter as \
292 	a select list for use in templates, provided you also define \
293 	the corresponding name list attribute which enumerates all the \
294 	choices to put in the list. It can be used for existing input \
295 	parameters, as well as any you define using the \
296 	<a href=\"#allow_in_form\">allow_in_form</a> \
297 	attribute. The entries in this list each consist of an octuple, \
298 	a set of eight strings defining the variables and how they are to \
299 	be used to build a select list. The attribute can contain many \
300 	of these octuples. The strings in the string list are merely \
301 	taken eight at a time. For each octuple of strings specified in \
302 	build_select_lists, the elements have the following meaning:  \
303 	<ol> \
304 	   <li>the name of the template variable to be defined as a list, \
305 	   optionally followed by a comma and the type of list, and \
306 	   optional formatting codes \
307 	   <li>the input parameter name that the select list will set  \
308 	   <li>the name of the user-defined attribute containing the \
309 	   name list \
310 	   <li>the tuple size used in the name list above  \
311 	   <li>the index into a name list tuple for the value  \
312 	   <li>the index for the corresponding label on the selector \
313 	   <li>the configuration attribute where the default value for \
314 	   this input parameter is defined \
315 	   <li>the default label, if not an empty string, which will be \
316 	   used as the label for an additional list item for the current \
317 	   input parameter value if it doesn't match any value in the \
318 	   given list \
319 	</ol> \
320 	See the <a href=\"hts_selectors.html\">select list documentation</a> \
321 	for more information on this attribute. \
322 " }, \
323 { "caps_factor", "1",  \
324 	"number", "htsearch", "", "??", "Searching:Ranking", "caps_factor: 1", " \
325 	TO BE COMPLETED<br> \
326 	See also <a href=\"#heading_factor\">heading_factor</a>. \
327 " }, \
328 { "case_sensitive", "true",  \
329 	"boolean", "htdig", "", "3.1.0b2", "Indexing:Where", "case_sensitive: false", " \
330 	This specifies whether ht://Dig should consider URLs \
331 	case-sensitive or not. If your server is case-insensitive, \
332 	you should probably set this to false. <br> \
333 	Even if this is false, \
334 	<a href=\"#common_url_parts\">common_url_parts</a>, \
335 	<a href=\"#url_part_aliases\">url_part_aliases</a> and \
336 	<a href=\"#url_rewrite_rules\">url_rewrite_rules</a> \
337 	are all still case sensitive, and \
338 	<a href=\"#server_aliases\">server_aliases</a> \
339 	is still case insensitive. \
340 " }, \
341 { "check_unique_date", "false",  \
342 	"boolean", "htdig", "Global", "3.2.0b3", "", "check_unique_date: false", " \
343 	Include the modification date of the page in the MD5 hash, to reduce the \
344 	problem with identical but physically separate pages in different parts of the tree pointing to \
345 	different pages.  \
346 " }, \
347 { "check_unique_md5", "false",  \
348 	"boolean", "htdig", "Global", "3.2.0b3", "", "check_unique_md5: false", " \
349 	Uses the MD5 hash of pages to reject aliases, prevents multiple entries \
350 	in the index caused by such things as symbolic links. \
351 	Note: May not do the right thing for incremental update \
352 " }, \
353 { "collection_names", "", \
354 	"string list", "htsearch", "", "3.2.0b2", "", "collection_names: htdig_docs htdig_bugs", " \
355 	This is a list of config file names that are used for searching multiple databases. \
356 	Simply put, htsearch will loop through the databases specified by each of these config \
357 	files and present the result of the search on all of the databases. \
358 	The corresponding config files are looked up in the <a href=\"#config_dir\">config_dir</a> directory. \
359 	Each listed config file <strong>must</strong> exist, as well as the corresponding databases. \
360 " }, \
361 { "common_dir", COMMON_DIR,  \
362 	"string", "all", "", "all", "File Layout", "common_dir: /tmp", " \
363 	Specifies the directory for files that will or can be \
364 	shared among different search databases. The default \
365 	value for this attribute is defined at compile time. \
366 " }, \
367 { "common_url_parts", "http:// http://www. ftp:// ftp://ftp. /pub/ .html .htm .shtml /index.html /index.htm .com/ .com mailto:",  \
368 	"string list", "all", "", "3.1.0", "URLs", "common_url_parts: http://www.htdig.org/ml/ \\<br> \
369 .html \\<br> \
370 http://dev.htdig.org/ \\<br> \
371 http://www.htdig.org/", " \
372 	Sub-strings often found in URLs stored in the \
373 	database.  These are replaced in the database by an \
374 	internal space-saving encoding.  If a string \
375 	specified in <a href=\"#url_part_aliases\">url_part_aliases</a>, \
376 	overlaps any string in common_url_parts, the \
377 	common_url_parts string is ignored.<br> \
378 	Note that when this attribute is changed, the \
379 	database should be rebuilt, unless the effect of \
380 	\"changing\" the affected URLs in the database is \
381 	wanted.<br> \
382 " }, \
383 { "compression_level", "6",  \
384 	"integer", "htdig", "", "3.1.0", "Indexing:How", "compression_level: 0", " \
385 	If non-zero and the \
386 	<a href=\"http://www.cdrom.com/pub/infozip/zlib/\">zlib</a> \
387 	compression library was available when compiled, \
388 	this attribute controls the amount of compression used in the \
389 	<a href=\"#doc_excerpt\">doc_excerpt</a> file. \
390 	<br/>This must be in the range 0-9, and must be non-zero when \
391 	<a href=\"#wordlist_compress_zlib\">wordlist_compress_zlib</a> \
392 	is used. \
393 " }, \
394 { "config", "",  \
395 	"string", "all", "", "??", "File Layout", "", " \
396 	Name of configuration file to load. \
397 	For security reasons, restrictions are placed on the values which \
398 	can be specified on the command line to \
399 	<a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \
400 	The default value of this attribute is determined at \
401 	compile time. \
402 " }, \
403 { "config_dir", CONFIG_DIR,  \
404 	"string", "all", "", "all", "File Layout", "config_dir: /var/htdig/conf", " \
405 	This is the directory which contains all configuration \
406 	files related to ht://Dig. It is never used \
407 	directly by any of the programs, but other attributes \
408 	or the <a href=\"#include\">include</a> directive \
409 	can be defined in terms of this one. \
410 	<p> \
411 	The default value of this attribute is determined at \
412 	compile time. \
413 	</p> \
414 " },
415 { "content_classifier", "${bin_dir}/HtFileType",  \
416 	"string", "htdig", "", "3.2.0b4", "Indexing:What", "content_classifier: file -i -b", " \
417 	When ht://Dig can't determine the type of a <code>file://</code> \
418 	URL from its extension, this program is used to determine the type. \
419 	The program is called with one argument, the name of (possibly a \
420 	temporary copy of) the file. \
421 	<p> \
422 	See also <a href=\"#mime_types\">mime_types</a>.\
423 	</p> \
424 " }, \
425 { "cookies_input_file", "",  \
426 	"string", "htdig", "", "3.2.0b4", "Indexing:Connection", "cookies_input_file: ${common_dir}/cookies.txt", " \
427 	Specifies the location of the file used for importing cookies \
428 	for the crawl. These cookies will be preloaded into htdig's \
429 	in-memory cookie jar, but aren't written back to the file. \
430 	Cookies are specified according to Netscape's format \
431 	(tab-separated fields). If this attribute is left blank, \
432 	no cookie file will be read. \
433 	For more information, see the sample cookies.txt file in the \
434 	ht://Dig source distribution. \
435 " }, \
436 { "create_image_list", "false",  \
437 	"boolean", "htdig", "", "all", "Extra Output", "create_image_list: yes", " \
438 	If set to true, a file with all the image URLs that \
439 	were seen will be created, one URL per line. This list \
440 	will not be in any order and there will be lots of \
441 	duplicates, so after htdig has completed, it should be \
442 	piped through <code>sort -u</code> to get a unique list. \
443 " }, \
444 { "create_url_list", "false",  \
445 	"boolean", "htdig", "", "all", "Extra Output", "create_url_list: yes", " \
446 	If set to true, a file with all the URLs that were seen \
447 	will be created, one URL per line. This list will not \
448 	be in any order and there will be lots of duplicates, \
449 	so after htdig has completed, it should be piped \
450 	through <code>sort -u</code> to get a unique list. \
451 " }, \
452 { "database_base", "${database_dir}/db",  \
453 	"string", "all", "", "all", "File Layout", "database_base: ${database_dir}/sales", " \
454 	This is the common prefix for files that are specific \
455 	to a search database. Many different attributes use \
456 	this prefix to specify filenames. Several search \
457 	databases can share the same directory by just changing \
458 	this value for each of the databases. \
459 " }, \
460 { "database_dir", DATABASE_DIR,  \
461 	"string", "all", "", "all", "File Layout", "database_dir: /var/htdig", " \
462 	This is the directory which contains all database and \
463 	other files related to ht://Dig. It is never used \
464 	directly by any of the programs, but other attributes \
465 	are defined in terms of this one. \
466 	<p> \
467 	The default value of this attribute is determined at \
468 	compile time. \
469 	</p> \
470 " }, \
471 { "date_factor", "0",  \
472 	"number", "htsearch", "", "3.1.0", "Searching:Ranking", "date_factor: 0.35", " \
473 	This factor gives higher \
474 	rankings to newer documents and lower rankings to older \
475 	documents. Before setting this factor, it's advised to \
476 	make sure your servers are returning accurate dates \
477 	(check the dates returned in the long format). \
478 	Additionally, setting this to a nonzero value incurs a \
479 	small performance hit on searching. \
480 " }, \
481 { "date_format", "",  \
482 	"string", "htsearch", "", "3.1.2", "Presentation:How", "date_format: %Y-%m-%d", " \
483 	This format string determines the output format for \
484 	modification dates of documents in the search results. \
485 	It is interpreted by your system's <em>strftime</em> \
486 	function. Please refer to your system's manual page \
487 	for this function, for a description of available \
488 	format codes. If this format string is empty, as it \
489 	is by default,  \
490 	<a href=\"htsearch.html\" target=\"_top\">htsearch</a> \
491 	will pick a format itself. In this case, the <a \
492 	href=\"#iso_8601\">iso_8601</a> attribute can be used \
493 	to modify the appearance of the date. \
494 " }, \
495 { "description_factor", "150",  \
496 	"number", "htsearch", "", "3.1.0b3", "Searching:Ranking", "description_factor: 350", " \
497 	Plain old \"descriptions\" are the text of a link pointing \
498 	to a document. This factor gives weight to the words of \
499 	these descriptions of the document. Not surprisingly, \
500 	these can be pretty accurate summaries of a document's \
501 	content. See also <a href=\"#heading_factor\">heading_factor</a> \
502 	and <a href=\"#meta_description_factor\">meta_description_factor</a>. \
503 " }, \
504 { "description_meta_tag_names", "description",  \
505 	"string list", "htdig", "", "3.1.6", "Searching:Ranking", "description_meta_tag_names: \"description htdig-description\"", " \
506 	The words in this list are used to search for descriptions in HTML \
507 	<em>META</em> tags. This list can contain any number of strings \
508 	that each will be seen as the name for whatever description \
509 	convention is used. While words in any of the specified \
510 	description contents will be indexed, only the last meta tag \
511 	containing a description will be kept for the \
512 	<a href=\"hts_templates.html#METADESCRIPTION\">METADESCRIPTION</a> \
513 	variable in search results. The order in \
514 	which the names are specified in this configuration attribute \
515 	is irrelevant, as it is the order in which the tags appear in \
516 	the documents that matters.<br> The <em>META</em> tags have the \
517 	following format:<br> \
518 	<tt> &nbsp;&nbsp;&lt;META name=\"<em>somename</em>\" \
519 	                       content=\"<em>somevalue</em>\"&gt; </tt><br> \
520 	See also <a href=\"#meta_description_factor\">meta_description_factor</a>. \
521 " }, \
522 { "disable_cookies", "true",  \
523 	"boolean", "htdig", "Server", "3.2.0b4", "Indexing:Connection", "disable_cookies: true", " \
524         This option, if set to true, will disable HTTP cookies. \
525 " }, \
526 { "doc_db", "${database_base}.docdb",  \
527 	"string", "all", "", "all", "File Layout", "doc_db: ${database_base}documents.db", " \
528 	This file will contain a Berkeley database of documents \
529 	indexed by document number. It contains all the information \
530 	gathered for each document, except the document excerpts \
531 	which are stored in the <a href=\"#doc_excerpt\"><em> \
532 	doc_excerpt</em></a> file. \
533 " }, \
534 { "doc_excerpt", "${database_base}.excerpts",  \
535 	"string", "all", "", "3.2.0b1", "File Layout", "doc_excerpt: ${database_base}excerpts.db", " \
536 	This file will contain a Berkeley database of document excerpts \
537 	indexed by document number. It contains all the text \
538 	gathered for each document, so this file can become \
539 	rather large if <a href=\"#max_head_length\"><em> \
540 	max_head_length</em></a> is set to a large value. \
541 	The size can be reduced by setting the \
542 	<a href=\"#compression_level\"><em>compression_level</em></a>, \
543 	if supported on your system. \
544 " }, \
545 { "doc_index", "${database_base}.docs.index",  \
546 	"string", "htdig", "", "all", "File Layout", "doc_index: documents.index.db", " \
547 	This file contains a mapping of document numbers to URLs and is \
548 	used by htdig during indexing. It is used on updates if it exists. \
549 " }, \
550 { "doc_list", "${database_base}.docs",  \
551 	"string", "htdig htdump htload", "", "all", "File Layout", "doc_list: /tmp/documents.text", " \
552 	This file is basically a text version of the file \
553 	specified in <em><a href=\"#doc_db\">doc_db</a></em>. Its \
554 	only use is to have a human readable database of all \
555 	documents. The file is easy to parse with tools like \
556 	perl or tcl. \
557 " }, \
558 { "endday", "",  \
559 	"integer", "htsearch", "", "3.1.6", "Searching:Method", "endday: 31", " \
560 	Day component of last date allowed as last-modified date \
561 	of returned documents. \
562 	This is most usefully specified as a \
563 	<a href=\"hts_form.html#startyear\">CGI argument</a>. \
564 	See also <a href=\"#startyear\">startyear</a>. \
565 " }, \
566 { "end_ellipses", "<strong><code> ...</code></strong>",  \
567 	"string", "htsearch", "", "all", "Presentation:Text", "end_ellipses: ...", " \
568 	When excerpts are displayed in the search output, this \
569 	string will be appended to the excerpt if there is text \
570 	following the text displayed. This is just a visual \
571 	reminder to the user that the excerpt is only part of \
572 	the complete document. \
573 " }, \
574 { "end_highlight", "</strong>",  \
575 	"string", "htsearch", "", "3.1.4", "Presentation:Text", "end_highlight: &lt;/font&gt;", " \
576 	When excerpts are displayed in the search output, matched \
577 	words will be highlighted using <a href=\"#start_highlight\"> \
578 	start_highlight</a> and this string. \
579 	You should ensure that highlighting tags are balanced, \
580 	that is, this string should close any formatting \
581 	tag opened by start_highlight. \
582 " }, \
583 { "endings_affix_file", "${common_dir}/english.aff",  \
584 	"string", "htfuzzy", "", "all", "File Layout", "endings_affix_file: /var/htdig/affix_rules", " \
585 	Specifies the location of the file which contains the \
586 	affix rules used to create the endings search algorithm \
587 	databases. Consult the documentation on \
588 	<a href=\"htfuzzy.html\">htfuzzy</a> for more information on the \
589 	format of this file. \
590 " }, \
591 { "endings_dictionary", "${common_dir}/english.0",  \
592 	"string", "htfuzzy", "", "all", "File Layout", "endings_dictionary: /var/htdig/dictionary", " \
593 	Specifies the location of the file which contains the \
594 	dictionary used to create the endings search algorithm \
595 	databases. Consult the documentation on \
596 	<a href=\"htfuzzy.html\">htfuzzy</a> for more information on the \
597 	format of this file. \
598 " }, \
599 { "endings_root2word_db", "${common_dir}/root2word.db",  \
600 	"string", "htfuzzy htsearch", "", "all", "File Layout", "endings_root2word_db: /var/htdig/r2w.db", " \
601 	This attribute specifies the database filename to be \
602 	used in the 'endings' fuzzy search algorithm. The \
603 	database maps word roots to all legal words with that \
604 	root. For more information about this and other fuzzy \
605 	search algorithms, consult the \
606 	<a href=\"htfuzzy.html\">htfuzzy</a> documentation.<br> \
607 	Note that the default value uses the \
608 	<a href=\"#common_dir\">common_dir</a> attribute instead of the \
609 	<a href=\"#database_dir\">database_dir</a> attribute. \
610 	This is because this database can be shared with \
611 	different search databases. \
612 " }, \
613 { "endings_word2root_db", "${common_dir}/word2root.db",  \
614 	"string", "htfuzzy htsearch", "", "all", "File Layout", "endings_word2root_db: /var/htdig/w2r.bm", " \
615 	This attribute specifies the database filename to be \
616 	used in the 'endings' fuzzy search algorithm. The \
617 	database maps words to their root. For more information \
618 	about this and other fuzzy search algorithms, consult \
619 	the <a href=\"htfuzzy.html\">htfuzzy</a> \
620 	documentation.<br> \
621 	Note that the default value uses the \
622 	<a href=\"#common_dir\">common_dir</a> attribute instead of the \
623 	<a href=\"#database_dir\">database_dir</a> attribute. \
624 	This is because this database can be shared with \
625 	different search databases. \
626 " }, \
627 { "endmonth", "",  \
628 	"integer", "htsearch", "", "3.1.6", "Searching:Method", "endmonth: 12", " \
629 	Month component of last date allowed as last-modified date \
630 	of returned documents. \
631 	This is most usefully specified as a \
632 	<a href=\"hts_form.html#startyear\">CGI argument</a>. \
633 	See also <a href=\"#startyear\">startyear</a>. \
634 " }, \
635 { "endyear", "",  \
636 	"integer", "htsearch", "", "3.1.6", "Searching:Method", "endyear: 2002", " \
637 	Year component of last date allowed as last-modified date \
638 	of returned documents. \
639 	This is most usefully specified as a \
640 	<a href=\"hts_form.html#startyear\">CGI argument</a>. \
641 	See also <a href=\"#startyear\">startyear</a>. \
642 " }, \
643 { "excerpt_length", "300",  \
644 	"integer", "htsearch", "", "all", "Presentation:How", "excerpt_length: 500", " \
645 	This is the maximum number of characters the displayed \
646 	excerpt will be limited to. The first matched word will \
647 	be highlighted in the middle of the excerpt so that there is \
648 	some surrounding context.<br> \
649 	The <em><a href=\"#start_ellipses\"> \
650 	start_ellipses</a></em> and \
651 	<em><a href=\"#end_ellipses\">end_ellipses</a></em> are used to \
652 	indicate that the document contains text before and \
653 	after the displayed excerpt respectively. \
654 	The <em><a href=\"#start_highlight\">start_highlight</a></em> and \
655 	<em><a href=\"#end_highlight\">end_highlight</a></em> are used to \
656 	specify what formatting tags are used to highlight matched words. \
657 " }, \
658 { "excerpt_show_top", "false",  \
659 	"boolean", "htsearch", "", "all", "Presentation:How", "excerpt_show_top: yes", " \
660 	If set to true, the excerpt of a match will always show \
661 	the top of the matching document. If it is false (the \
662 	default), the excerpt will attempt to show the part of \
663 	the document that actually contains one of the words. \
664 " }, \
665 { "exclude", "",  \
666 	"pattern list", "htsearch", "", "3.2.0b4", "Searching:Method", "exclude: myhost.com/mailarchive/", " \
667 	If a URL contains any of the space separated patterns, it will be \
668 	discarded in the searching phase. This is used to exclude certain \
669 	URLs from search results. The list can be specified from within \
670 	the configuration file, and can be overridden with the \"exclude\" \
671 	input parameter in the search form. \
672 " }, \
673 { "exclude_urls", "/cgi-bin/ .cgi",  \
674 	"pattern list", "htdig", "URL", "all", "Indexing:Where", "exclude_urls: students.html cgi-bin", " \
675 	If a URL contains any of the space separated patterns, \
676 	it will be rejected. This is used to exclude \
677 	common things such as an infinite virtual web-tree \
678 	which starts with cgi-bin. \
679 " }, \
680 { "external_parsers", "",  \
681 	"quoted string list", "htdig", "", "3.0.7", "External:Parsers", "external_parsers: text/html /usr/local/bin/htmlparser \\<br> \
682 	application/pdf /usr/local/bin/parse_doc.pl \\<br> \
683 	application/msword-&gt;text/plain \"/usr/local/bin/mswordtotxt -w\" \\<br> \
684 	application/x-gunzip-&gt;user-defined /usr/local/bin/ungzipper", " \
685 	This attribute is used to specify a list of \
686 	content-type/parsers that are to be used to parse \
687 	documents that cannot be parsed by any of the internal \
688 	parsers. The list of external parsers is examined \
689 	before the builtin parsers are checked, so this can be \
690 	used to override the internal behavior without \
691 	recompiling htdig.<br> \
692 	 The external parsers are specified as pairs of \
693 	strings. The first string of each pair is the \
694 	content-type that the parser can handle while the \
695 	second string of each pair is the path to the external \
696 	parsing program. If quoted, it may contain parameters, \
697 	separated by spaces.<br> \
698 	 External parsing can also be done with external \
699 	converters, which convert one content-type to \
700 	another. To do this, instead of just specifying \
701 	a single content-type as the first string \
702 	of a pair, you specify two types, in the form \
703 	<em>type1</em><strong>-&gt;</strong><em>type2</em>, \
704 	as a single string with no spaces. The second \
705 	string will define an external converter \
706 	rather than an external parser, to convert \
707 	the first type to the second. If the second \
708 	type is <strong>user-defined</strong>, then \
709 	it's up to the converter script to put out a \
710 	\"Content-Type:&nbsp;<em>type</em>\" header followed \
711 	by a blank line, to indicate to htdig what type it \
712 	should expect for the output, much like what a CGI \
713 	script would do. The resulting content-type must \
714 	be one that htdig can parse, either internally, \
715 	or with another external parser or converter.<br> \
716 	 Only one external parser or converter can be \
717 	specified for any given content-type. However, \
718 	an external converter for one content-type can be \
719 	chained to the internal parser for the same type, \
720 	by appending <strong>-internal</strong> to the \
721 	second type string (e.g. text/html-&gt;text/html-internal) \
722 	to perform external preprocessing on documents of \
723 	this type before internal parsing. \
724 	There are two internal parsers, for text/html and \
725 	text/plain.<p> \
726 	The parser program takes four command-line \
727 	parameters, not counting any parameters already \
728 	given in the command string:<br> \
729 	<em>infile content-type URL configuration-file</em><br> \
730 	<table border=\"1\"> \
731 	  <tr> \
732 		<th> Parameter </th> \
733 		<th> Description </th> \
734 		<th> Example </th> \
735 	  </tr> \
736 	  <tr> \
737 		<td valign=\"top\"> infile </td> \
738 		<td> A temporary file with the contents to be parsed.  </td> \
739 		<td> /var/tmp/htdext.14242 </td> \
740 	  </tr> \
741 	  <tr> \
742 		<td valign=\"top\"> content-type </td> \
743 		<td> The MIME-type of the contents.  </td> \
744 		<td> text/html </td> \
745 	  </tr> \
746 	  <tr> \
747 		<td valign=\"top\"> URL </td> \
748 		<td> The URL of the contents.  </td> \
749 		<td> http://www.htdig.org/attrs.html </td> \
750 	  </tr> \
751 	  <tr> \
752 		<td valign=\"top\"> configuration-file </td> \
753 		<td> The configuration-file in effect.  </td> \
754 		<td> /etc/htdig/htdig.conf </td> \
755 	  </tr> \
756 	</table><p> \
757 	The external parser is to write information for \
758 	htdig on its standard output. Unless it is an \
759 	external converter, which will output a document \
760 	of a different content-type, then its output must \
761 	follow the format described here.<br> \
762 	 The output consists of records, each record terminated \
763 	with a newline. Each record is a series of (unless \
764 	expressly allowed to be empty) non-empty tab-separated \
765 	fields. The first field is a single character \
766 	that specifies the record type. The rest of the fields \
767 	are determined by the record type. \
768 	<table border=\"1\"> \
769 	  <tr> \
770 		<th> Record type </th> \
771 		<th> Fields </th> \
772 		<th> Description </th> \
773 	  </tr> \
774 	  <tr> \
775 		<th rowspan=\"3\" valign=\"top\"> w </th> \
776 		<td valign=\"top\"> word </td> \
777 		<td> A word that was found in the document.  </td> \
778 	  </tr> \
779 	  <tr> \
780 		<td valign=\"top\"> location </td> \
781 		<td> \
782 		  A number indicating the normalized location of \
783 		  the word within the document. The number has to \
784 		  fall in the range 0-1000 where 0 means the top of \
785 		  the document. \
786 		</td> \
787 	  </tr> \
788 	  <tr> \
789 		<td valign=\"top\"> heading level </td> \
790 		<td> \
791 		  A heading level that is used to compute the \
792 		  weight of the word depending on its context in \
793 		  the document itself. The levels are in the range of \
794 		  0-11 and are defined as follows: \
795 		  <dl compact> \
796 			<dt> 0 </dt> <dd> Normal text </dd> \
797 			<dt> 1 </dt> <dd> Title text </dd> \
798 			<dt> 2 </dt> <dd> Heading 1 text </dd> \
799 			<dt> 3 </dt> <dd> Heading 2 text </dd> \
800 			<dt> 4 </dt> <dd> Heading 3 text </dd> \
801 			<dt> 5 </dt> <dd> Heading 4 text </dd> \
802 			<dt> 6 </dt> <dd> Heading 5 text </dd> \
803 			<dt> 7 </dt> <dd> Heading 6 text </dd> \
804 			<dt> 8 </dt> <dd> text alternative to images </dd> \
805 			<dt> 9 </dt> <dd> Keywords </dd> \
806 			<dt> 10 </dt> <dd> Meta-description </dd> \
807 			<dt> 11 </dt> <dd> Author </dd> \
808 		  </dl> \
809 		</td> \
810 	  </tr> \
811 	  <tr> \
812 		<th rowspan=\"2\" valign=\"top\"> u </th> \
813 		<td valign=\"top\"> document URL </td> \
814 		<td> \
815 		  A hyperlink to another document that is \
816 		  referenced by the current document.  It must be \
817 		  complete and non-relative, using the URL parameter to \
818 		  resolve any relative references found in the document. \
819 		</td> \
820 	  </tr> \
821 	  <tr> \
822 		<td valign=\"top\"> hyperlink description </td> \
823 		<td> \
824 		  For HTML documents, this would be the text \
825 		  between the &lt;a href...&gt; and &lt;/a&gt; \
826 		  tags. \
827 		</td> \
828 	  </tr> \
829 	  <tr> \
830 		<th valign=\"top\"> t </th> \
831 		<td valign=\"top\"> title </td> \
832 		<td> The title of the document </td> \
833 	  </tr> \
834 	  <tr> \
835 		<th valign=\"top\"> h </th> \
836 		<td valign=\"top\"> head </td> \
837 		<td> \
838 		  The top of the document itself. This is used to \
839 		  build the excerpt. This should only contain \
840 		  normal ASCII text \
841 		</td> \
842 	  </tr> \
843 	  <tr> \
844 		<th valign=\"top\"> a </th> \
845 		<td valign=\"top\"> anchor </td> \
846 		<td> \
847 		  The label that identifies an anchor that can be \
848 		  used as a target in an URL. This really only \
849 		  makes sense for HTML documents. \
850 		</td> \
851 	  </tr> \
852 	  <tr> \
853 		<th valign=\"top\"> i </th> \
854 		<td valign=\"top\"> image URL </td> \
855 		<td> \
856 		  An URL that points at an image that is part of \
857 		  the document. \
858         </td> \
859 	  </tr> \
860 	  <tr> \
861 		<th rowspan=\"3\" valign=\"top\"> m </th> \
862 		<td valign=\"top\"> http-equiv </td> \
863 		<td> \
864 		  The HTTP-EQUIV attribute of a \
865 		  <a href=\"meta.html\"><em>META</em> tag</a>. \
866 		  May be empty. \
867 		</td> \
868 	  </tr> \
869 	  <tr> \
870 		<td valign=\"top\"> name </td> \
871 		<td> \
872 		  The NAME attribute of this \
873 		  <a href=\"meta.html\"><em>META</em> tag</a>. \
874 		  May be empty. \
875 		</td> \
876 	  </tr> \
877 	  <tr> \
878 		<td valign=\"top\"> contents </td> \
879 		<td> \
880 		  The CONTENT attribute of this \
881 		  <a href=\"meta.html\"><em>META</em> tag</a>. \
882 		  May be empty. \
883 		</td> \
884 	  </tr> \
885 	</table> \
886 	<p><em>See also FAQ questions <a href=\"FAQ.html#q4.8\">4.8</a> and \
887 	<a href=\"FAQ.html#q4.9\">4.9</a> for more examples.</em></p> \
888 " }, \
889 { "external_protocols", "", \
890 	"quoted string list", "htdig", "", "3.2.0b1", "External:Protocols", "external_protocols: https /usr/local/bin/handler.pl \\<br> \
891 	ftp /usr/local/bin/ftp-handler.pl", " \
892 	This attribute is a bit like \
893 	<a href=\"#external_parsers\">external_parsers</a> since it specifies \
894 	a list of protocols/handlers that are used to download documents \
895 	that cannot be retrieved using the internal methods. This enables \
896 	htdig to index documents with URL schemes it does not understand, \
897 	or to use more advanced authentication for the documents it is \
898 	retrieving. This list is checked before HTTP or other methods, \
899 	so this can override the internal behavior without writing additional \
900 	code for htdig.<br> \
901 	  The external protocols are specified as pairs of strings, the first \
902 	being the URL scheme that the script can handle while the second \
903 	is the path to the script itself. If the second is \
904 	quoted, then additional command-line arguments may be given.<br> \
905 	If the external protocol does not contain a colon (:), it is assumed \
906 	to have the standard format \
907 	\"protocol://[user[:password]@]address[:port]/path\". \
908 	If it ends with a colon, then it is assumed to have the simpler format \
909 	\"protocol:path\". If it ends with \"://\" then the standard form is \
910 	again assumed. <br> \
911 	  The program takes three command-line parameters, not counting any \
912 	parameters already given in the command string:<br> \
913 	<em>protocol URL configuration-file</em><br> \
914 	<table border=\"1\"> \
915 	  <tr> \
916 		<th> Parameter </th> \
917 		<th> Description </th> \
918 		<th> Example </th> \
919 	  </tr> \
920 	  <tr> \
921 		<td valign=\"top\"> protocol </td> \
922 		<td> The URL scheme to be used.  </td> \
923 		<td> https </td> \
924 	  </tr> \
925 	  <tr> \
926 		<td valign=\"top\"> URL </td> \
927 		<td> The URL to be retrieved.  </td> \
928 		<td> https://www.htdig.org:8008/attrs.html </td> \
929 	  </tr> \
930 	  <tr> \
931 		<td valign=\"top\"> configuration-file </td> \
932 		<td> The configuration-file in effect.  </td> \
933 		<td> /etc/htdig/htdig.conf </td> \
934 	  </tr> \
935 	</table><p> \
936 	The external protocol script is to write information for htdig on the  \
937 	standard output. The output must follow the form described here. The \
938 	output consists of a header followed by a blank line, followed by \
939 	the contents of the document. Each record in the header is terminated \
940 	with a newline.  Each record is a series of (unless expressly \
941 	allowed to be empty) non-empty tab-separated fields. The first field \
942 	is a single character that specifies the record type. The rest of \
943 	the fields are determined by the record type. \
944 	<table border=\"1\"> \
945 	  <tr> \
946 		<th> Record type </th> \
947 		<th> Fields </th> \
948 		<th> Description </th> \
949 	  </tr> \
950 	  <tr> \
951 		<th valign=\"top\"> s </th> \
952 		<td valign=\"top\"> status code </td> \
953 		<td> \
954 		  An HTTP-style status code, e.g. 200, 404. Typical codes include: \
955 		    <dl compact> \
956 			<dt> 200 </dt> \
957 			    <dd> Successful retrieval </dd> \
958 			<dt> 304 </dt> \
959 			<dd> \
960 			  Not modified (for example, if the document hasn\'t \
961 			  changed since the last dig) \
962 			</dd> \
963 			<dt> 301 </dt> \
964 			    <dd> Redirect (to another URL) </dd> \
965 			<dt> 401 </dt> \
966 			    <dd> Not authorized </dd> \
967 			<dt> 404 </dt> \
968 			    <dd> Not found </dd> \
969 		    </dl> \
970 		</td> \
971 	  </tr> \
972 	  <tr> \
973 		<th valign=\"top\"> r </th> \
974 		<td valign=\"top\"> reason </td> \
975 		<td> \
976 		  A text string describing the status code, \
977 		  e.g. \"Redirect\" or \"Not Found.\" \
978 		</td> \
979 	  </tr> \
980 	  <tr> \
981 		<th valign=\"top\"> m </th> \
982 		<td valign=\"top\"> modification time </td> \
983 		<td> \
984 		  The modification time of this document. While the code is \
985 		  fairly flexible about the time/date formats it accepts, it \
986 		  is recommended to use something standard, like \
987 		  RFC1123: Sun, 06 Nov 1994 08:49:37 GMT, or \
988 		  ISO-8601:  1994-11-06 08:49:37 GMT. \
989 		</td> \
990 	  </tr> \
991 	  <tr> \
992 		<th valign=\"top\"> t </th> \
993 		<td valign=\"top\"> content-type </td> \
994 		<td> \
995 		  A valid MIME type for the document, like text/html or text/plain. \
996 		</td> \
997 	  </tr> \
998 	  <tr> \
999 		<th valign=\"top\"> l </th> \
1000 		<td valign=\"top\"> content-length </td> \
1001 		<td> \
1002 		  The length of the document on the server, which may not \
1003 		  necessarily be the length of the buffer returned. \
1004 		</td> \
1005 	  </tr> \
1006 	  <tr> \
1007 		<th valign=\"top\"> u </th> \
1008 		<td valign=\"top\"> url </td> \
1009 		<td> \
1010 		  The URL of the document, or in the case of a redirect, the \
1011 		  URL that should be indexed as a result of the redirect. \
1012 		</td> \
1013 	  </tr> \
1014       </table>	   \
1015 " }, \
1016 { "extra_word_characters", "",  \
1017 	"string", "htdig htsearch", "", "3.1.2", "Indexing:What", "extra_word_characters: _", " \
1018 	These characters are considered part of a word. \
1019 	In contrast to the characters in the \
1020 	<a href=\"#valid_punctuation\">valid_punctuation</a> \
1021 	attribute, they are treated just like letter \
1022 	characters.  See also the <a href=\"#allow_numbers\">allow_numbers</a>\
1023 	attribute.<br> \
1024 	Note that the <a href=\"#locale\">locale</a> attribute \
1025 	is normally used to configure which characters \
1026 	constitute letter characters.<br> \
1027 	Note also that it is an error to have characters in both \
1028 	extra_word_characters and \
1029 	<a href=\"#valid_punctuation\">valid_punctuation</a>. \
1030 	To add one of the characters in the default valid_punctuation to \
1031 	extra_word_characters, an explicit valid_punctuation entry must be \
1032 	added to the configuration file.<br> \
1033 	See also the comments about special characters at \
1034 	<a href=\"#valid_punctuation\">valid_punctuation</a>. \
1035 " }, \
1036 { "head_before_get", "true",  \
1037 	"boolean", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "head_before_get: false", " \
1038     If set to true, an HTTP/1.1 <em>HEAD</em> \
1039     call is made in order to retrieve header information about a document. \
1040     If the status code and the content-type returned show that the \
1041     document is parsable, then a subsequent 'GET' call is made.  In \
1042     general, it is recommended that this attribute be set to 'true', \
1043     as it can really improve performance (especially when used with \
1044     persistent connections).  This is particularly so during an \
1045     incremental dig, since in this case 'htdig' can ask the server if the \
1046     document has been modified since the last dig. However, there are a few \
1047     cases when it is better to switch it off: \
1048     <ul> \
1049         <li>the majority of documents are parsable (HTML or a type for which \
1050         an external parser has been provided) and must be retrieved anyway \
1051         (initial dig);</li> \
1052         <li>the server does not support the HEAD method or it is \
1053         disabled;</li> \
1054         <li>in some cases <a href=\"#persistent_connections\">persistent_connections</a> may \
1055         not work properly and either the 'head_before_get' attribute or the \
1056         'persistent_connections' attribute must be turned off.</li> \
1057     </ul> \
1058 " }, \
1059 { "heading_factor", "5",  \
1060 	"number", "htsearch", "", "3.2.0b1", "Searching:Ranking", "heading_factor: 20", " \
1061 			This is a factor which will be used to multiply the \
1062 			weight of words between &lt;h1&gt; and &lt;/h1&gt; \
1063 			tags, as well as headings of levels &lt;h2&gt; through \
1064 			&lt;h6&gt;. It is used to assign the level of importance \
1065 			to headings. Setting a factor to 0 will cause words \
1066 			in these headings to be ignored. The number may be a \
1067 	floating point number. See also \
1068 	<a href=\"#author_factor\">author_factor</a> \
1069 	<a href=\"#backlink_factor\">backlink_factor</a> \
1070 	<a href=\"#caps_factor\">caps_factor</a> \
1071 	<a href=\"#date_factor\">date_factor</a> \
1072 	<a href=\"#description_factor\">description_factor</a> \
1073 	<a href=\"#keywords_factor\">keywords_factor</a> \
1074 	<a href=\"#meta_description_factor\">meta_description_factor</a> \
1075 	<a href=\"#text_factor\">text_factor</a> \
1076 	<a href=\"#title_factor\">title_factor</a> \
1077 	<a href=\"#url_text_factor\">url_text_factor</a> \
1078 " }, \
1079 { "htnotify_prefix_file", "", \
1080     "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_prefix_file: ${common_dir}/notify_prefix.txt", " \
1081 	Specifies the file containing text to be inserted in each mail  \
1082 	message sent by htnotify before the list of expired webpages. If omitted,  \
1083 	nothing is inserted. \
1084 " }, \
1085 { "htnotify_replyto", "", \
1086     "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_replyto: design-group@foo.com", " \
1087 	This specifies the email address that htnotify email messages \
1088 	include in the Reply-to: field. \
1089 " }, \
1090 { "htnotify_sender", "webmaster@www",  \
1091 	"string", "htnotify", "", "all", "Extra Output", "htnotify_sender: bigboss@yourcompany.com", " \
1092 	This specifies the email address that htnotify email \
1093 	messages get sent out from. The address is forged using \
1094 	/usr/lib/sendmail. Check htnotify/htnotify.cc for \
1095 	details on how this is done. \
1096 " }, \
1097 { "htnotify_suffix_file", "", \
1098     "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_suffix_file: ${common_dir}/notify_suffix.txt", " \
1099 	Specifies the file containing text to be inserted in each mail message  \
1100 	sent by htnotify after the list of expired webpages. If omitted, htnotify  \
1101 	will insert a standard message. \
1102 " }, \
1103 { "htnotify_webmaster",	 "ht://Dig Notification Service", \
1104     "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_webmaster: Notification Service", " \
1105 	This provides a name for the From field, in addition to the email \
1106 	address for the email messages sent out by htnotify. \
1107 " }, \
1108 { "http_proxy", "",  \
1109 	"string", "htdig", "URL", "3.0", "Indexing:Connection", "http_proxy: http://proxy.bigbucks.com:3128", " \
1110 	When this attribute is set, all HTTP document \
1111 	retrievals will be done using the HTTP-PROXY protocol. \
1112 	The URL specified in this attribute points to the host \
1113 	and port where the proxy server resides.<br> \
1114 	Later, this should be able to be overridden by the \
1115 	<code>http_proxy</code> environment variable, but it currently cannot.\
1116 	The use of a proxy server greatly improves performance \
1117 	of the indexing process.<br> \
1118 	See also \
1119 	<a href=\"#http_proxy_authorization\">http_proxy_authorization</a> and \
1120 	<a href=\"#http_proxy_exclude\">http_proxy_exclude</a>. \
1121 " }, \
1122 { "http_proxy_authorization", "",  \
1123 	"string", "htdig", "URL", "3.2.0b4", "Indexing:Connection", "http_proxy_authorization: myusername:mypassword", " \
1124 	This tells htdig to send the supplied \
1125 	<em>username</em><strong>:</strong><em>password</em> with each HTTP request, \
1126 	when using a proxy with authorization requested. \
1127 	The credentials will be encoded using the \"Basic\" authentication \
1128 	scheme. There <em>must</em> be a colon (:) between the username and \
1129 	password.<br> \
1130 	If you use this option, be sure to protect the configuration file \
1131 	so it is readable only by you, and do not \
1132 	use that same configuration file for htsearch. \
1133 " }, \
1134 { "http_proxy_exclude", "", \
1135 	"pattern list", "htdig", "", "3.1.0b3", "Indexing:Connection", "http_proxy_exclude: http://intranet.foo.com/", " \
1136 	When this is set, URLs matching this will not use the \
1137 	proxy. This is useful when you have a mixture of sites \
1138 	near to the digging server and far away. \
1139 " }, \
1140 { "ignore_alt_text", "false",  \
1141 	"boolean", "htdig", "", "3.1.6", "Indexing:What", "ignore_alt_text: true", " \
1142 	If set, this causes the text of the ALT field in an &lt;IMG...&gt; tag \
1143 	not to be indexed as part of the text of the document, nor included in \
1144 	excerpts. \
1145 " }, \
1146 { "ignore_dead_servers", "true",  \
1147 	"boolean", "htdig", "", "3.1.6", "Indexing:Connection", "ignore_dead_servers: false", " \
1148 	Determines whether htdig will continue to index URLs from a \
1149 	server after an attempted connection to the server fails as \
1150 	&quot;no host found&quot; or &quot;host not found (port).&quot; If \
1151 	set to false, htdig will try <em>every</em> URL from that server. \
1152 " }, \
1153 { "image_list", "${database_base}.images",  \
1154 	"string", "htdig", "", "all", "Extra Output", "image_list: allimages", " \
1155 	This is the file that a list of image URLs gets written \
1156 	to by <a href=\"htdig.html\">htdig</a> when the \
1157 	<a href=\"#create_image_list\">create_image_list</a> attribute is set to \
1158 	true. As image URLs are seen, they are just appended to \
1159 	this file, so after htdig finishes it is probably a \
1160 	good idea to run <code>sort -u</code> on the file to \
1161 	eliminate duplicates from the file. \
1162 " }, \
1163 { "image_url_prefix", IMAGE_URL_PREFIX,  \
1164 	"string", "htsearch", "", "all", "Presentation:Text", "image_url_prefix: /images/htdig", " \
1165 	This specifies the directory portion of the URL used \
1166 	to display star images. This attribute isn't directly \
1167 	used by htsearch, but is used in the default URL for \
1168 	the <a href=\"#star_image\">star_image</a> and \
1169 	<a href=\"#star_blank\">star_blank</a> attributes, and \
1170 	other attributes may be defined in terms of this one. \
1171 	<p> \
1172 	The default value of this attribute is determined at \
1173 	compile time. \
1174 	</p> \
1175 " }, \
1176 { "include", "", \
1177 	"string", "all", "", "3.1.0", "", "include: ${config_dir}/htdig.conf", " \
1178 	This is not quite a configuration attribute, but \
1179 	rather a directive. It can be used within one \
1180 	configuration file to include the definitions of \
1181 	another file. The last definition of an attribute \
1182 	is the one that applies, so after including a file, \
1183 	any of its definitions can be overridden with \
1184 	subsequent definitions. This can be useful when \
1185 	setting up many configurations that are mostly the \
1186 	same, so all the common attributes can be maintained \
1187 	in a single configuration file. The include directives \
1188 	can be nested, but watch out for nesting loops. \
1189 " }, \
1190 { "iso_8601", "false",  \
1191 	"boolean", "htsearch htnotify", "", "3.1.0b2", "Presentation:How,Extra Output", "iso_8601: true", " \
1192 	This sets whether dates should be output in ISO 8601 \
1193 	format. For example, this was written on: 1998-10-31 11:28:13 EST. \
1194 	See also the <a \
1195 	href=\"#date_format\">date_format</a> attribute, which \
1196 	can override any date format that \
1197 	<a href=\"htsearch.html\" target=\"_top\">htsearch</a> \
1198 	picks by default.<br> \
1199 	This attribute also affects the format of the date \
1200 	<a href=\"htnotify.html\">htnotify</a> expects to find \
1201 	in a <strong>htdig-notification-date</strong> field. \
1202 " }, \
1203 { "keywords", "", \
1204     "string list", "htsearch", "", "??", "Searching:Method", "keywords: documentation", " \
1205 	Keywords which <strong>must</strong> be found on all pages returned, \
1206     	even if the \"or\" (\"Any\") <a href=\"#method\">method</a> is \
1207 	selected. \
1208 " }, \
1209 { "keywords_factor", "100",  \
1210 	"number", "htsearch", "", "all", "Searching:Ranking", "keywords_factor: 12", " \
1211 	This is a factor which will be used to multiply the \
1212 	weight of words in the list of \
1213 	<a href=\"#keywords_meta_tag_names\">meta keywords</a> of a document. \
1214 	The number may be a floating point number. See also the \
1215 	<a href=\"#heading_factor\">heading_factor</a> attribute. \
1216 " }, \
1217 { "keywords_meta_tag_names", "keywords htdig-keywords",  \
1218 	"string list", "htdig", "", "3.0.6", "Indexing:What", "keywords_meta_tag_names: keywords description", " \
1219 	The words in this list are used to search for keywords \
1220 	in HTML <em>META</em> tags. This list can contain any \
1221 	number of strings that each will be seen as the name \
1222 	for whatever keyword convention is used.<br> \
1223 	The <em>META</em> tags have the following format:<br> \
1224 <code> \
1225 &nbsp;&nbsp;&lt;META name=\"<em>somename</em>\" content=\"<em>somevalue</em>\"&gt; \
1226 </code> \
1227 " }, \
1228 { "limit_normalized", "",  \
1229 	"pattern list", "htdig", "", "3.1.0b2", "Indexing:Where", "limit_normalized: http://www.mydomain.com", " \
1230 	This specifies a set of patterns that all URLs have to \
1231 	match against in order for them to be included in the \
1232 	search. Unlike the limit_urls_to attribute, this is done \
1233 	<strong>after</strong> the URL is normalized and the \
1234 	<a href=\"#server_aliases\">server_aliases</a> \
1235 	attribute is applied. This allows filtering after any \
1236 	hostnames and DNS aliases are resolved. Otherwise, this \
1237 	attribute is the same as the <a \
1238 	href=\"#limit_urls_to\">limit_urls_to</a> attribute. \
1239 " }, \
1240 { "limit_urls_to", "${start_url}",  \
1241 	"pattern list", "htdig", "", "all", "Indexing:Where", "limit_urls_to: .sdsu.edu kpbs [.*\\.html]", " \
1242 	This specifies a set of patterns that all URLs have to \
1243 	match against in order for them to be included in the \
1244 	search. Any number of strings can be specified, \
1245 	separated by spaces. If multiple patterns are given, at \
1246 	least one of the patterns has to match the URL.<br> \
1247 	Matching, by default, is a case-sensitive string match on the URL \
1248 	to be used, unless the <a href=\"#case_sensitive\">case_sensitive</a> \
1249 	attribute is false. The match will be performed <em>after</em> \
1250 	the relative references have been converted to a valid \
1251 	URL. This means that the URL will <em>always</em> start \
1252 	with a transport specifier (<code>http://</code> if none is \
1253 	specified).<br> \
1254 	Granted, this is not the perfect way of doing this, \
1255 	but it is simple enough and it covers most cases.<br> \
1256 	To limit URLs in htsearch, use \
1257 	<a href=\"#restrict\">restrict</a>. \
1258 " }, \
1259 { "local_default_doc", "index.html",  \
1260 	"string list", "htdig", "Server", "3.0.8b2", "Indexing:Where", "local_default_doc: default.html default.htm index.html index.htm", " \
1261 	Set this to the default documents in a directory used by the \
1262 	server. This is used for local filesystem access, \
1263 	using <a href=\"#local_urls\">local_urls</a>, to \
1264 	translate URLs like http://foo.com/ into something like \
1265 	/home/foo.com/index.html \
1266 	(see also <a href=\"#remove_default_doc\">remove_default_doc</a>). \
1267 	<br>The list should only contain names that the local server \
1268 	recognizes as default documents for directory URLs, as defined \
1269 	by the DirectoryIndex setting in Apache's srm.conf, for example. \
1270 	As of version 3.1.5, this can be a string list rather than a single \
1271 	name, and htdig will use the first name that works. Since this \
1272 	requires a loop, setting the most common name first will improve \
1273 	performance.  Special characters can be embedded in these names \
1274 	using %xx hex encoding. \
1275 " }, \
1276 { "local_urls", "",  \
1277 	"string list", "htdig", "", "3.0.8b2", "Indexing:Where", "local_urls: http://www.foo.com/=/usr/www/htdocs/", " \
1278 	Set this to tell ht://Dig to access certain URLs through \
1279 	local filesystems. At first ht://Dig will try to access \
1280 	pages with URLs matching the patterns through the \
1281 	filesystems specified. If it cannot find the file, or \
1282 	if it doesn't recognize the file name extension, it will \
1283 	try the URL through HTTP instead. Note the example--the \
1284 	equal sign and the final slashes in both the URL and the \
1285 	directory path are critical. \
1286 	<br>The fallback to HTTP can be disabled by setting the \
1287 	<a href=\"#local_urls_only\">local_urls_only</a> attribute to true. \
1288 	To access user directory URLs through the local filesystem, \
1289 	set <a href=\"#local_user_urls\">local_user_urls</a>.  \
1290 	File types which need processing by the HTTP server may be \
1291 	specified by the \
1292 	<a href=\"#bad_local_extensions\">bad_local_extensions</a> \
1293 	attribute. \
1294 	As of version 3.1.5, you can provide multiple mappings of a given \
1295 	URL to different directories, and htdig will use the first \
1296 	mapping that works. \
1297 	Special characters can be embedded in these names using %xx hex encoding. \
1298 	For example, you can use %3D to embed an \"=\" sign in an URL pattern. \
1299 	<br> \
1300 	See also <a href=\"#local_default_doc\">local_default_doc</a>. \
1301 " }, \
1302 { "local_urls_only", "false",  \
1303 	"boolean", "htdig", "", "3.1.4", "Indexing:Where", "local_urls_only: true", " \
1304 	Set this to tell ht://Dig to access files only through the  \
1305 	local filesystem, for URLs matching the patterns in the \
1306 	<a href=\"#local_urls\">local_urls</a> or \
1307 	<a href=\"#local_user_urls\">local_user_urls</a> attribute. If it \
1308 	cannot find the file, it will give up rather than trying HTTP or \
1309 	another protocol.  With this option, even <code>file://</code> urls \
1310 	are not retrieved, except throught the local_urls mechanism.\
1311 " }, \
1312 { "local_user_urls", "",  \
1313 	"string list", "htdig", "", "3.0.8b2", "Indexing:Where", "local_user_urls: http://www.my.org/=/home/,/www/", " \
1314 	Set this to access user directory URLs through the local \
1315 	filesystem. If you leave the \"path\" portion out, it will \
1316 	look up the user's home directory in /etc/password (or NIS \
1317 	or whatever). As with <a href=\"#local_urls\">local_urls</a>, \
1318 	if the files are not found, ht://Dig will try with HTTP or the \
1319 	appropriate protocol. Again, note the \
1320 	example's format. To map http://www.my.org/~joe/foo/bar.html \
1321 	to /home/joe/www/foo/bar.html, try the example below. \
1322 	<br>The fallback to HTTP can be disabled by setting the \
1323 	<a href=\"#local_urls_only\">local_urls_only</a> attribute to true. \
1324 	As of version 3.1.5, you can provide multiple mappings of a given \
1325 	URL to different directories, and htdig will use the first \
1326 	mapping that works. \
1327 	Special characters can be embedded in these names using %xx hex encoding. \
1328 	For example, you can use %3D to embed an \"=\" sign in an URL pattern. \
1329 " }, \
1330 { "locale", "C",  \
1331 	"string", "htdig", "", "3.0", "Indexing:What,Presentation:How", "locale: en_US", " \
1332 	Set this to whatever locale you want your search \
1333 	database cover. It affects the way international \
1334 	characters are dealt with. On most systems a list of \
1335 	legal locales can be found in /usr/lib/locale. Also \
1336 	check the <strong>setlocale(3C)</strong> man page. \
1337 	Note that depending the locale you choose, and whether \
1338 	your system's locale implementation affects floating \
1339 	point input, you may need to specify the decimal point \
1340 	as a comma rather than a period. This will affect \
1341 	settings of <a href=\"#search_algorithm\">search_algorithm</a> \
1342 	and any of the scoring factors. \
1343 " }, \
1344 { "logging", "false",  \
1345 	"boolean", "htsearch", "", "3.1.0b2", "Extra Output", "logging: true", " \
1346 	This sets whether htsearch should use the syslog() to log \
1347 	search requests. If set, this will log requests with a \
1348 	default level of LOG_INFO and a facility of LOG_LOCAL5. For \
1349 	details on redirecting the log into a separate file or other \
1350 	actions, see the <strong>syslog.conf(5)</strong> man \
1351 	page. To set the level and facility used in logging, change \
1352 	LOG_LEVEL and LOG_FACILITY in the include/htconfig.h file \
1353 	before compiling. \
1354 	<dl> \
1355 	  <dt> \
1356 	    Each line logged by htsearch contains the following: \
1357 	  </dt> \
1358 	  <dd> \
1359 	    REMOTE_ADDR [config] (match_method) [words] \
1360 	    [logicalWords] (matches/matches_per_page) - \
1361 	    page, HTTP_REFERER \
1362 	  </dd> \
1363 	</dl> \
1364 	where any of the above are null or empty, it \
1365 	either puts in '-' or 'default' (for config). \
1366 " }, \
1367 { "maintainer", "bogus@unconfigured.htdig.user",  \
1368 	"string", "htdig", "Server", "all", "Indexing:Out", "maintainer: ben.dover@uptight.com", " \
1369 	This should be the email address of the person in \
1370 	charge of the digging operation. This string is added \
1371 	to the user-agent: field when the digger sends a \
1372 	request to a server. \
1373 " }, \
1374 { "match_method", "and",  \
1375 	"string", "htsearch", "", "3.0", "Searching:Method", "match_method: boolean", " \
1376 	This is the default method for matching that htsearch \
1377 	uses. The valid choices are: \
1378 	<ul> \
1379 	  <li> or </li> \
1380 	  <li> and </li> \
1381 	  <li> boolean </li> \
1382 	</ul> \
1383 	This attribute will only be used if the HTML form that \
1384 	calls htsearch didn't have the \
1385 	<a href=\"hts_form.html#method\">method</a> value set. \
1386 " }, \
1387 { "matches_per_page", "10",  \
1388 	"integer", "htsearch", "", "3.0", "Searching:Method", "matches_per_page: 999", " \
1389 	If this is set to a relatively small number, the \
1390 	matches will be shown in pages instead of all at once. \
1391 	This attribute will only be used if the HTML form that \
1392 	calls htsearch didn't have the \
1393 	<a href=\"hts_form.html#matchesperpage\">matchesperpage</a> value set. \
1394 " }, \
1395 { "max_connection_requests", "-1", \
1396 	"integer", "htdig", "", "3.2.0b1", "Indexing:Connection", "max_connection_requests: 100", " \
1397 	This attribute tells htdig to limit the number of requests it will \
1398 	send to a server using a single, persistent HTTP connection. This \
1399 	only applies when the \
1400 	<a href=\"#persistent_connections\">persistent_connections</a> \
1401 	attribute is set. You may set the limit as high as you want, \
1402 	but it must be at least 1. A value of -1 specifies no limit. \
1403 	Requests in the queue for a server will be combined until either \
1404 	the limit is reached, or the queue is empty. \
1405 " }, \
1406 { "max_description_length", "60",  \
1407 	"integer", "htdig", "", "all", "Indexing:What", "max_description_length: 40", " \
1408 	While gathering descriptions of URLs, \
1409 	<a href=\"htdig.html\">htdig</a> will only record \
1410 	up to this many bytes of hyperlink descriptions for use in the \
1411 	<a href=\"hts_templates.html#DESCRIPTION\">DESCRIPTION</a> template \
1412 	variable.  This is used mostly to deal with broken HTML. (If a \
1413 	hyperlink is not terminated with a &lt;/a&gt; the \
1414 	description will go on until the end of the document.) \
1415 " }, \
1416 { "max_descriptions", "5",  \
1417 	"integer", "htdig", "", "all", "Indexing:What", "max_descriptions: 1", " \
1418 	While gathering <a href=\"#description_factor\">descriptions</a> of \
1419 	URLs for the \
1420 	<a href=\"hts_templates.html#DESCRIPTIONS\">DESCRIPTIONS</a> template \
1421 	variable, <a href=\"htdig.html\">htdig</a> will only record up to this \
1422 	number of descriptions, in the order in which it encounters \
1423 	them. This is used to prevent the database entry for a document \
1424 	from growing out of control if the document has a huge number \
1425 	of links to it. <br> \
1426 	Note that all descriptions are used for indexing. \
1427 " }, \
1428 { "max_doc_size", "100000",  \
1429 	"integer", "htdig", "URL", "3.0", "Indexing:What", "max_doc_size: 5000000", " \
1430 	This is the upper limit to the amount of data retrieved \
1431 	for documents (in bytes). This is mainly used to prevent \
1432 	unreasonable memory consumption since each document \
1433 	will be read into memory by <a href=\"htdig.html\"> \
1434 	htdig</a>. \
1435 " }, \
1436 { "max_excerpts", "1",  \
1437 	"integer", "htsearch", "URL", "3.1.6", "Presentation:How", "max_excerpts: 10", " \
1438 	This value determines the maximum number of excerpts \
1439 	that can be displayed for one matching document in the \
1440 	search results. \
1441 " }, \
1442 { "max_head_length", "512",  \
1443 	"integer", "htdig", "", "all", "Indexing:How", "max_head_length: 50000", " \
1444 	For each document retrieved, the top of the document is \
1445 	stored. This attribute determines the size of this \
1446 	block (in bytes). The text that will be stored is only the text; \
1447 	no markup is stored.<br> \
1448 	We found that storing 50,000 bytes will store about \
1449 	95% of all the documents completely. This really \
1450 	depends on how much storage is available and how much \
1451 	you want to show.  Currently, this is must not be 0. \
1452 " }, \
1453 { "max_hop_count", "999999",  \
1454 	"integer", "htdig", "", "all", "Indexing:Where", "max_hop_count: 4", " \
1455 	Instead of limiting the indexing process by URL \
1456 	pattern, it can also be limited by the number of hops \
1457 	or clicks a document is removed from the starting URL. \
1458 	<br> \
1459 	The starting page or pages will have hop count 0. \
1460 " }, \
1461 { "max_keywords", "-1",  \
1462 	"integer", "htdig", "", "3.2.0b1", "Indexing:What", "max_keywords: 10", " \
1463 	This attribute can be used to limit the number of keywords \
1464 	per document that htdig will accept from meta keywords tags. \
1465 	A value of -1 or less means no limit. This can help combat meta \
1466 	keyword spamming, by limiting the amount of keywords that will be \
1467 	indexed, but it will not completely prevent irrelevant matches \
1468 	in a search if the first few keywords in an offending document \
1469 	are not relevant to its contents. \
1470 " }, \
1471 { "max_meta_description_length", "512",  \
1472 	"integer", "htdig", "", "3.1.0b1", "Indexing:How", "max_meta_description_length: 1000", " \
1473 	While gathering descriptions from meta description tags, \
1474 	<a href=\"htdig.html\">htdig</a> will only store up to  \
1475 	this much of the text (in bytes) for each document to fill the \
1476 	<a href=\"hts_templates.html#METADESCRIPTION\">METADESCRIPTION</a> \
1477 	template variable.  All words in the meta description are still \
1478 	used for indexing. \
1479 " }, \
1480 { "max_prefix_matches", "1000",  \
1481 	"integer", "htsearch", "", "3.1.0b1", "Searching:Method", "max_prefix_matches: 100", " \
1482 	The Prefix <a href=\"#search_algorithm\">fuzzy algorithm</a> \
1483 	could potentially match a \
1484 	very large number of words. This value limits the \
1485 	number of words each prefix can match. Note \
1486 	that this does not limit the number of documents that \
1487 	are matched in any way. \
1488 " }, \
1489 { "max_retries", "3",  \
1490 	"integer", "htdig", "", "3.2.0b1", "Indexing:Connection", "max_retries: 6", " \
1491 	 This option set the maximum number of retries when retrieving a document \
1492 	 fails (mainly for reasons of connection). \
1493 " }, \
1494 { "max_stars", "4",  \
1495 	"integer", "htsearch", "", "all", "Presentation:How", "max_stars: 6", " \
1496 	When stars are used to display the score of a match, \
1497 	this value determines the maximum number of stars that \
1498 	can be displayed. \
1499 " }, \
1500 { "maximum_page_buttons", "${maximum_pages}",  \
1501 	"integer", "htsearch", "", "3.2.0b3", "Presentation:How", "maximum_page_buttons: 20", " \
1502 	This value limits the number of page links that will be \
1503 	included in the page list at the bottom of the search \
1504 	results page. By default, it takes on the value of the \
1505 	<a href=\"#maximum_pages\">maximum_pages</a> \
1506 	attribute, but you can set it to something lower to allow \
1507 	more pages than buttons. In this case, pages above this \
1508 	number will have no corresponding button. \
1509 " }, \
1510 { "maximum_pages", "10",  \
1511 	"integer", "htsearch", "", "all", "Presentation:How", "maximum_pages: 20", " \
1512 	This value limits the number of page links that will be \
1513 	included in the page list at the bottom of the search \
1514 	results page. As of version 3.1.4, this will limit the \
1515 	total number of matching documents that are shown. \
1516 	You can make the number of page buttons smaller than the \
1517 	number of allowed pages by setting the \
1518 	<a href=\"#maximum_page_buttons\">maximum_page_buttons</a> \
1519 	attribute. \
1520 " }, \
1521 { "maximum_word_length", "32",  \
1522 	"integer", "htdig htsearch htfuzzy", "", "3.1.3", "Indexing:What", "maximum_word_length: 15", " \
1523 	This sets the maximum length of words that will be \
1524 	indexed. Words longer than this value will be silently \
1525 	truncated when put into the index, or searched in the \
1526 	index. \
1527 " }, \
1528 { "md5_db", "${database_base}.md5hash.db",  \
1529 	"string", "htdig", "", "3.2.0b3", "File Layout", "md5_db: ${database_base}.md5.db", " \
1530 	This file holds a database of md5 and date hashes of pages to \
1531 	catch and eliminate duplicates of pages. See also the \
1532 	<a href=\"#check_unique_md5\">check_unique_md5</a> and \
1533 	<a href=\"#check_unique_date\">check_unique_date</a> attributes. \
1534 " }, \
1535 { "meta_description_factor", "50",  \
1536 	"number", "htsearch", "", "3.1.0b1", "Searching:Ranking", "meta_description_factor: 20", " \
1537 	This is a factor which will be used to multiply the \
1538 	weight of words in any META description tags in a document. \
1539 	The number may be a floating point number. See also the \
1540 	<a href=\"#heading_factor\">heading_factor</a> attribute and the \
1541 	<a href=\"#description_factor\">description_factor</a> attribute. \
1542 " }, \
1543 { "metaphone_db", "${database_base}.metaphone.db",  \
1544 	"string", "htfuzzy htsearch", "", "all", "File Layout", "metaphone_db: ${database_base}.mp.db", " \
1545 	The database file used for the fuzzy \"metaphone\" search \
1546 	algorithm. This database is created by \
1547 	<a href=\"htfuzzy.html\">htfuzzy</a> and used by \
1548 	<a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \
1549 " }, \
1550 { "method_names", "and All or Any boolean Boolean",  \
1551 	"quoted string list", "htsearch", "", "all", "Searching:UI", "method_names: or Or and And", " \
1552 	These values are used to create the <strong> \
1553 	method</strong> menu. It consists of pairs. The first \
1554 	element of each pair is one of the known methods, the \
1555 	second element is the text that will be shown in the \
1556 	menu for that method. This text needs to be quoted if \
1557 	it contains spaces. \
1558 	See the <a href=\"hts_selectors.html\">select list documentation</a> \
1559 	for more information on how this attribute is used. \
1560 " }, \
1561 { "mime_types", "${config_dir}/mime.types", \
1562 	"string", "htdig", "", "3.2.0b1", "Indexing:Where", "mime_types: /etc/mime.types", " \
1563 	This file is used by htdig for local file access and resolving \
1564 	file:// URLs to ensure the files are parsable. If you are running \
1565 	a webserver with its own MIME file, you should set this attribute \
1566 	to point to that file. \
1567 	<p> \
1568 	See also <a href=\"#content_classifier\">content_classifier</a>.\
1569 "}, \
1570 { "minimum_prefix_length", "1",  \
1571 	"integer", "htsearch", "", "3.1.0b1", "Searching:Method", "minimum_prefix_length: 2", " \
1572 	This sets the minimum length of prefix matches used by the \
1573 	\"prefix\" fuzzy matching algorithm. Words shorter than this \
1574 	will not be used in prefix matching. \
1575 " }, \
1576 { "minimum_speling_length", "5",  \
1577 	"integer", "htsearch", "", "3.2.0b1", "Searching:Method", "minimum_speling_length: 3", " \
1578 	This sets the minimum length of words used by the \
1579 	\"speling\" fuzzy matching algorithm. Words shorter than this \
1580 	will not be used in this fuzzy matching. \
1581 " }, \
1582 { "minimum_word_length", "3",  \
1583 	"integer", "htdig htsearch", "", "all", "Indexing:What", "minimum_word_length: 2", " \
1584 	This sets the minimum length of words that will be \
1585 	indexed. Words shorter than this value will be silently \
1586 	ignored but still put into the excerpt.<br> \
1587 	Note that by making this value less than 3, a lot more \
1588 	words that are very frequent will be indexed. It might \
1589 	be advisable to add some of these to the \
1590 	<a href=\"#bad_word_list\">bad_words list</a>. \
1591 " }, \
1592 { "multimatch_factor", "1",  \
1593 	"number", "htsearch", "", "3.1.6", "Searching:Ranking", "multimatch_factor: 1000", " \
1594     	This factor gives higher rankings to documents that have more than \
1595 	one matching search word when the <strong>or</strong> \
1596 	<a href=\"#match_method\">match_method</a> is used. \
1597 	In version 3.1.6, the matching words' combined scores were multiplied \
1598 	by this factor for each additional matching word.  Currently, this \
1599 	multiplier is applied at most once. \
1600 " },
1601 { "next_page_text", "[next]",  \
1602 	"string", "htsearch", "", "3.1.0", "Presentation:Text", "next_page_text: &lt;img src=\"/htdig/buttonr.gif\"&gt;", " \
1603 	The text displayed in the hyperlink to go to the next \
1604 	page of matches. \
1605 " }, \
1606 { "no_excerpt_show_top", "false",  \
1607 	"boolean", "htsearch", "", "3.1.0b3", "Presentation:How", "no_excerpt_show_top: yes", " \
1608 	If no excerpt is available, this option will act the \
1609 	same as <a \
1610 	href=\"#excerpt_show_top\">excerpt_show_top</a>, that is, \
1611 	it will show the top of the document. \
1612 " }, \
1613 { "no_excerpt_text", "<em>(None of the search words were found in the top of this document.)</em>",  \
1614 	"string", "htsearch", "", "3.0", "Presentation:Text", "no_excerpt_text:", " \
1615 	This text will be displayed in place of the excerpt if \
1616 	there is no excerpt available. If this attribute is set \
1617 	to nothing (blank), the excerpt label will not be \
1618 	displayed in this case. \
1619 " }, \
1620 { "no_next_page_text", "${next_page_text}",  \
1621 	"string", "htsearch", "", "3.0", "Presentation:Text", "no_next_page_text:", " \
1622 	The text displayed where there would normally be a \
1623 	hyperlink to go to the next page of matches. \
1624 " }, \
1625 { "no_page_list_header", "",  \
1626 	"string", "htsearch", "", "3.0", "Presentation:Text", "no_page_list_header: &lt;hr noshade size=2&gt;All results on this page.&lt;br&gt;", " \
1627 	This text will be used as the value of the PAGEHEADER \
1628 	variable, for use in templates or the \
1629 	<a href=\"#search_results_footer\">search_results_footer</a> \
1630 	file, when all search results fit on a single page. \
1631 " }, \
1632 { "no_page_number_text", "",  \
1633 	"quoted string list", "htsearch", "", "3.0", "Presentation:Text", "no_page_number_text: \
1634 				  &lt;strong&gt;1&lt;/strong&gt; &lt;strong&gt;2&lt;/strong&gt; \\<br> \
1635 				  &lt;strong&gt;3&lt;/strong&gt; &lt;strong&gt;4&lt;/strong&gt; \\<br> \
1636 				  &lt;strong&gt;5&lt;/strong&gt; &lt;strong&gt;6&lt;/strong&gt; \\<br> \
1637 				  &lt;strong&gt;7&lt;/strong&gt; &lt;strong&gt;8&lt;/strong&gt; \\<br> \
1638 				  &lt;strong&gt;9&lt;/strong&gt; &lt;strong&gt;10&lt;/strong&gt; \
1639 ", " \
1640 	The text strings in this list will be used when putting \
1641 	together the PAGELIST variable, for use in templates or \
1642 	the <a href=\"#search_results_footer\">search_results_footer</a> \
1643 	file, when search results fit on more than page. The PAGELIST \
1644 	is the list of links at the bottom of the search results page. \
1645 	There should be as many strings in the list as there are \
1646 	pages allowed by the <a href=\"#maximum_page_buttons\">maximum_page_buttons</a> \
1647 	attribute. If there are not enough, or the list is empty, \
1648 	the page numbers alone will be used as the text for the links. \
1649 	An entry from this list is used for the current page, as the \
1650 	current page is shown in the page list without a hypertext link, \
1651 	while entries from the <a href=\"#page_number_text\"> \
1652 	page_number_text</a> list are used for the links to other pages. \
1653 	The text strings can contain HTML tags to highlight page numbers \
1654 	or embed images. The strings need to be quoted if they contain \
1655 	spaces. \
1656 " }, \
1657 { "no_prev_page_text", "${prev_page_text}",  \
1658 	"string", "htsearch", "", "3.0", "Presentation:Text", "no_prev_page_text:", " \
1659 	The text displayed where there would normally be a \
1660 	hyperlink to go to the previous page of matches. \
1661 " }, \
1662 { "no_title_text", "filename",  \
1663 	"string", "htsearch", "", "3.1.0", "Presentation:Text", "no_title_text: \"No Title Found\"", " \
1664 	This specifies the text to use in search results when no \
1665 	title is found in the document itself. If it is set to \
1666 	filename, htsearch will use the name of the file itself, \
1667 	enclosed in brackets (e.g. [index.html]). \
1668 " }, \
1669 { "noindex_end", "<!--/htdig_noindex--> </SCRIPT>",  \
1670 	"quoted string list", "htdig", "", "3.1.0", "Indexing:What", "noindex_end: &lt;/SCRIPT&gt;", " \
1671 	This string marks the end of a section of an HTML file that should be \
1672 	completely ignored when indexing.  Note that text between noindex_start\
1673 	and noindex_end isn't even counted as white space; the text \
1674 	\"<code>foo<!--htdig_noindex-->something<!--/htdig_noindex-->bar</code>\" \
1675 	matches the word \"foobar\", not the phrase \"foo bar\".  White space \
1676 	following noindex_end <em>is</em> counted as white space. See also \
1677 	<a href=\"#noindex_start\">noindex_start</a>. \
1678 " }, \
1679 { "noindex_start", "<!--htdig_noindex--> <SCRIPT",  \
1680 	"quoted string list", "htdig", "", "3.1.0", "Indexing:What", "noindex_start: &lt;SCRIPT", " \
1681 	These strings mark the start of a section of an HTML file that should \
1682 	be completely ignored when indexing. They work together with \
1683 	<a href=\"#noindex_end\">noindex_end</a>.  Once a string in \
1684 	noindex_start is found, text is ignored until the string at the \
1685 	<em>same position</em> within <a href=\"#noindex_end\">noindex_end</a> \
1686 	is encountered.  The sections marked off this way cannot overlap. \
1687 	As in the first default pattern, this can be SGML comment \
1688 	declarations that can be inserted anywhere in the documents to exclude \
1689 	different sections from being indexed. However, existing tags can also \
1690 	be used; this is especially useful to exclude some sections from being \
1691 	indexed where the files to be indexed can not be edited. The second \
1692 	default pattern shows how SCRIPT sections in 'uneditable' documents \
1693 	can be skipped; note how noindex_start does not contain an ending \
1694 	&gt;: this allows for all SCRIPT tags to be matched regardless of \
1695 	attributes defined (different types or languages). \
1696 	Note that the match for this string is case insensitive. \
1697 " }, \
1698 { "nothing_found_file", "${common_dir}/nomatch.html",  \
1699 	"string", "htsearch", "", "all", "Presentation:Files", "nothing_found_file: /www/searching/nothing.html", " \
1700 	This specifies the file which contains the <code> \
1701 	HTML</code> text to display when no matches were found. \
1702 	The file should contain a complete <code>HTML</code> \
1703 	document.<br> \
1704 	Note that this attribute could also be defined in \
1705 	terms of <a href=\"#database_base\">database_base</a> to \
1706 	make is specific to the current search database. \
1707 " }, \
1708 { "nph", "false",  \
1709 	"boolean", "htsearch", "", "3.2.0b2", "Presentation:How", "nph: true", " \
1710 	This attribute determines whether htsearch sends out full HTTP \
1711 	headers as required for an NPH (non-parsed header) CGI. Some \
1712 	servers assume CGIs will act in this fashion, for example MS \
1713 	IIS. If your server does not send out full HTTP headers, you \
1714 	should set this to true. \
1715 " }, \
1716 { "page_list_header", "<hr noshade size=2>Pages:<br>",  \
1717 	"string", "htsearch", "", "3.0", "Presentation:Text", "page_list_header:", " \
1718 	This text will be used as the value of the PAGEHEADER \
1719 	variable, for use in templates or the \
1720 	<a href=\"#search_results_footer\">search_results_footer</a> \
1721 	file, when all search results fit on more than one page. \
1722 " }, \
1723 { "page_number_separator", "\" \"",  \
1724 	"quoted string list", "htsearch", "", "3.1.4", "Presentation:Text", "page_number_separator: \"&lt;/td&gt; &lt;td&gt;\"", " \
1725 	The text strings in this list will be used when putting \
1726 	together the PAGELIST variable, for use in templates or \
1727 	the <a href=\"#search_results_footer\">search_results_footer</a> \
1728 	file, when search results fit on more than page. The PAGELIST \
1729 	is the list of links at the bottom of the search results page. \
1730 	The strings in the list will be used in rotation, and will \
1731 	separate individual entries taken from \
1732 	<a href=\"#page_number_text\">page_number_text</a> and \
1733 	<a href=\"#no_page_number_text\">no_page_number_text</a>. \
1734 	There can be as many or as few strings in the list as you like. \
1735 	If there are not enough for the number of pages listed, it goes \
1736 	back to the start of the list. If the list is empty, a space is \
1737 	used. The text strings can contain HTML tags. The strings need \
1738 	to be quoted if they contain spaces, or to specify an empty string. \
1739 " }, \
1740 { "page_number_text", "",  \
1741 	"quoted string list", "htsearch", "", "3.0", "Presentation:Text", "page_number_text: \
1742 				  &lt;em&gt;1&lt;/em&gt; &lt;em&gt;2&lt;/em&gt; \\<br> \
1743 				  &lt;em&gt;3&lt;/em&gt; &lt;em&gt;4&lt;/em&gt; \\<br> \
1744 				  &lt;em&gt;5&lt;/em&gt; &lt;em&gt;6&lt;/em&gt; \\<br> \
1745 				  &lt;em&gt;7&lt;/em&gt; &lt;em&gt;8&lt;/em&gt; \\<br> \
1746 				  &lt;em&gt;9&lt;/em&gt; &lt;em&gt;10&lt;/em&gt; \
1747 ", " \
1748 	The text strings in this list will be used when putting \
1749 	together the PAGELIST variable, for use in templates or \
1750 	the <a href=\"#search_results_footer\">search_results_footer</a> \
1751 	file, when search results fit on more than page. The PAGELIST \
1752 	is the list of links at the bottom of the search results page. \
1753 	There should be as many strings in the list as there are \
1754 	pages allowed by the <a href=\"#maximum_page_buttons\">maximum_page_buttons</a> \
1755 	attribute. If there are not enough, or the list is empty, \
1756 	the page numbers alone will be used as the text for the links. \
1757 	Entries from this list are used for the links to other pages, \
1758 	while an entry from the <a href=\"#no_page_number_text\"> \
1759 	no_page_number_text</a> list is used for the current page, as the \
1760 	current page is shown in the page list without a hypertext link. \
1761 	The text strings can contain HTML tags to highlight page numbers \
1762 	or embed images. The strings need to be quoted if they contain \
1763 	spaces. \
1764 " }, \
1765 { "persistent_connections", "true",  \
1766 	"boolean", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "persistent_connections: false", " \
1767 	If set to true, when servers make it possible, htdig can take advantage \
1768 	of persistent connections, as defined by HTTP/1.1 (<em>RFC2616</em>). This permits \
1769 	to reduce the number of open/close operations of connections, when retrieving \
1770 	a document with HTTP. \
1771 " }, \
1772 { "plural_suffix", "s", \
1773 	"string", "htsearch", "", "3.2.0b2", "Presentation: Text", "plural_suffix: en", " \
1774 	Specifies the value of the PLURAL_MATCHES template \
1775 	variable used in the header, footer and template files. \
1776 	This can be used for localization for non-English languages \
1777 	where 's' is not the appropriate suffix. \
1778 " }, \
1779 { "prefix_match_character", "*",  \
1780 	"string", "htsearch", "", "3.1.0b1", "Searching:Method", "prefix_match_character: ing", " \
1781 	A null prefix character means that prefix matching should be \
1782 	applied to every search word. Otherwise prefix matching is \
1783 	done on any search word ending with the characters specified \
1784 	in this string, with the string being stripped off before \
1785 	looking for matches. The \"prefix\" algorithm must be enabled \
1786 	in <a href=\"#search_algorithm\">search_algorithm</a> \
1787 	for this to work. You may also want to set the <a \
1788 	href=\"#max_prefix_matches\">max_prefix_matches</a> and <a \
1789 	href=\"#minimum_prefix_length\">minimum_prefix_length</a> attributes \
1790 	to get it working as you want.<br> As a special case, in version \
1791 	3.1.6 and later, if this string is non-null and is entered alone \
1792 	as a search word, it is taken as a wildcard that matches all \
1793 	documents in the database. If this string is null, the wildcard \
1794 	for this special case will be <strong>*</strong>. This wildcard \
1795 	doesn't require the prefix algorithm to be enabled. \
1796 " }, \
1797 { "prev_page_text", "[prev]",  \
1798 	"string", "htsearch", "", "3.0", "Presentation:Text", "prev_page_text: &lt;img src=\"/htdig/buttonl.gif\"&gt;", " \
1799 	The text displayed in the hyperlink to go to the \
1800 	previous page of matches. \
1801 " }, \
1802 { "regex_max_words", "25",  \
1803 	"integer", "htsearch", "", "3.2.0b1", "Searching:Method", "regex_max_words: 10", " \
1804 	The \"regex\" <a href=\"#search_algorithm\">fuzzy algorithm</a> \
1805 	could potentially match a \
1806 	very large number of words. This value limits the \
1807 	number of words each regular expression can match. Note \
1808 	that this does not limit the number of documents that \
1809 	are matched in any way. \
1810 " }, \
1811 { "remove_bad_urls", "true",  \
1812 	"boolean", "htpurge", "Server", "all", "Indexing:How", "remove_bad_urls: true", " \
1813 	If TRUE, htpurge will remove any URLs which were marked \
1814 	as unreachable by htdig from the database. If FALSE, it \
1815 	will not do this. When htdig is run in initial mode, \
1816 	documents which were referred to but could not be \
1817 	accessed should probably be removed, and hence this \
1818 	option should then be set to TRUE, however, if htdig is \
1819 	run to update the database, this may cause documents on \
1820 	a server which is temporarily unavailable to be \
1821 	removed. This is probably NOT what was intended, so \
1822 	hence this option should be set to FALSE in that case. \
1823 " }, \
1824 { "remove_default_doc", "index.html",  \
1825 	"string list", "htdig", "", "3.1.0", "Indexing:How", "remove_default_doc: default.html default.htm index.html index.htm", " \
1826 	Set this to the default documents in a directory used by the \
1827 	servers you are indexing. These document names will be stripped \
1828 	off of URLs when they are normalized, if one of these names appears \
1829 	after the final slash, to translate URLs like \
1830 	http://foo.com/index.html into http://foo.com/<br> \
1831 	Note that you can disable stripping of these names during \
1832 	normalization by setting the list to an empty string. \
1833 	The list should only contain names that all servers you index \
1834 	recognize as default documents for directory URLs, as defined \
1835 	by the DirectoryIndex setting in Apache's srm.conf, for example. \
	This does not apply to  file:///  or  ftp://  URLs. \
1837 	<br>See also <a href=\"#local_default_doc\">local_default_doc</a>. \
1838 " }, \
1839 { "remove_unretrieved_urls", "false",  \
1840 	"boolean", "htpurge", "Server", "3.2.0b1", "Indexing:How", "remove_unretrieved_urls: true", " \
1841 	If TRUE, htpurge will remove any URLs which were discovered \
1842 	and included as stubs in the database but not yet retrieved. If FALSE, it \
	will not do this. When htdig is run in initial mode with no restrictions \
	on hopcount or maximum documents, these stubs should probably be removed, \
	so this attribute should be set to true. However, if you are hoping to \
	index a small set of documents and \
	eventually get to the rest, you should probably leave this as false. \
1847 " }, \
1848 { "restrict", "",  \
1849 	"pattern list", "htsearch", "", "3.2.0b4", "Searching:Method", "restrict: http://www.acme.com/widgets/", " \
1850 	This specifies a set of patterns that all URLs have to \
1851 	match against in order for them to be included in the search \
1852 	results. Any number of strings can be specified, separated by \
1853 	spaces. If multiple patterns are given, at least one of the \
1854 	patterns has to match the URL. The list can be specified \
1855 	from within the configuration file, and can be overridden \
1856 	with the \"restrict\" input parameter in the search form. Note \
1857 	that the restrict list does not take precedence over the \
1858 	<a href=\"#exclude\">exclude</a> list - if a URL matches patterns \
1859 	in both lists it is still excluded from the search results. \
1860 	<br>To restrict URLs in htdig, use \
1861 	<a href=\"#limit_urls_to\">limit_urls_to</a>. \
1862 " }, \
1863 { "robotstxt_name", "htdig",  \
1864 	"string", "htdig", "Server", "3.0.7", "Indexing:Out", "robotstxt_name: myhtdig", " \
1865 	Sets the name that htdig will look for when parsing \
1866 	robots.txt files. This can be used to make htdig appear \
1867 	as a different spider than ht://Dig. Useful to \
1868 	distinguish between a private and a global index. \
1869 " }, \
1870 { "script_name", "",  \
1871 	"string", "htsearch", "", "3.1.4", "Presentation:Text", "script_name: /search/results.shtml", " \
1872 	Overrides the value of the SCRIPT_NAME \
1873 	environment attribute. This is useful if \
1874 	htsearch is not being called directly as a CGI \
1875 	program, but indirectly from within a dynamic \
1876 	.shtml page using SSI directives. Previously, \
1877 	you needed a wrapper script to do this, but \
1878 	this configuration attribute makes wrapper \
1879 	scripts obsolete for SSI and possibly for \
1880 	other server scripting languages, as \
1881 	well. (You still need a wrapper script when \
1882 	using PHP, though.)<br> \
1883 	Check out the <code>contrib/scriptname</code> \
1884 	directory for a small example. Note that this \
1885 	attribute also affects the value of the <a \
1886 	href=\"hts_templates.html#CGI\">CGI</a> variable \
1887 	used in htsearch templates. \
1888 " }, \
1889 { "search_algorithm", "exact:1",  \
1890 	"string list", "htsearch", "", "all", "Searching:Method", "search_algorithm: exact:1 soundex:0.3", " \
1891 			Specifies the search algorithms and their weight to use \
1892 			when searching. Each entry in the list consists of the \
1893 			algorithm name, followed by a colon (:) followed by a \
1894 			weight multiplier. The multiplier is a floating point \
1895 			number between 0 and 1. Note that depending on your \
1896 			<a href=\"#locale\">locale</a> setting, and whether your \
1897 			system's locale implementation affects floating point \
1898 			input, you may need to specify the decimal point as a \
			comma rather than a period.<br> \
			<strong>Note:</strong> If the exact  \
1901 			method is not listed, the search may not work since the  \
1902 			original terms will not be used.<br> \
1903 			Current algorithms supported are: \
1904 			<dl> \
1905 			  <dt> \
1906 				exact \
1907 			  </dt> \
1908 			  <dd> \
1909 				The default exact word matching algorithm. This \
1910 				will find only exactly matched words. \
1911 			  </dd> \
1912 			  <dt> \
1913 				soundex \
1914 			  </dt> \
1915 			  <dd> \
1916 				Uses a slightly modified <a href=\"http://www.sog.org.uk/cig/vol6/605tdrake.pdf\">soundex</a> algorithm to match \
1917 				words. This requires that the soundex database be \
1918 				present. It is generated with the \
1919 				<a href=\"htfuzzy.html\">htfuzzy</a> program. \
1920 			  </dd> \
1921 			  <dt> \
1922 				metaphone \
1923 			  </dt> \
1924 			  <dd> \
1925 				Uses the metaphone algorithm for matching words. \
				This algorithm is more specific to the English \
1927 				language than soundex. It requires the metaphone \
1928 				database, which is generated with the <a \
1929 				href=\"htfuzzy.html\">htfuzzy</a> program. \
1930 			  </dd> \
1931 			  <dt> \
1932 				accents \
1933 			  </dt> \
1934 			  <dd> \
1935 				Uses the accents algorithm for matching words. \
1936 				This algorithm will treat all accented letters \
1937 				as equivalent to their unaccented counterparts. \
1938 				It requires the accents database, which is \
1939 				generated with the <a \
1940 				href=\"htfuzzy.html\">htfuzzy</a> program. \
1941 			  </dd> \
1942 			  <dt> \
1943 				endings \
1944 			  </dt> \
1945 			  <dd> \
1946 				This algorithm uses language specific word endings \
1947 				to find matches. Each word is first reduced to its \
1948 				word root and then all known legal endings are used \
1949 				for the matching. This algorithm uses two databases \
1950 				which are generated with <a href=\"htfuzzy.html\"> \
1951 				htfuzzy</a>. \
1952 			  </dd> \
1953 			  <dt> \
1954 				synonyms \
1955 			  </dt> \
1956 			  <dd> \
1957 				Performs a dictionary lookup on all the words. This \
1958 				algorithm uses a database generated with the <a \
1959 				href=\"htfuzzy.html\">htfuzzy</a> program. \
1960 			  </dd> \
1961 			<dt> \
1962 			substring \
1963 			</dt> \
1964 			<dd> \
1965 			  Matches all words containing the queries as \
1966 			  substrings. Since this requires checking every word in \
1967 			  the database, this can really slow down searches \
			  considerably. \
			</dd> \
1970 			<dt> \
1971 			  prefix \
1972 			</dt> \
1973 			<dd> \
1974 			  Matches all words beginning with the query \
1975 			  strings. Uses the option <a \
1976 			  href=\"#prefix_match_character\">prefix_match_character</a> \
1977 			  to decide whether a query requires prefix \
1978 			  matching. For example \"abc*\" would perform prefix \
1979 			  matching on \"abc\" since * is the default \
1980 			  prefix_match_character. \
1981 			</dd> \
1982 			<dt> \
1983 			regex \
1984 			</dt> \
1985 			<dd> \
1986 			  Matches all words that match the patterns given as regular  \
1987 			  expressions. Since this requires checking every word in \
1988 			  the database, this can really slow down searches \
1989 			  considerably.  The config file used for searching \
1990 			  must include the regex meta-characters (^$\\[-]|.*) \
1991 			  included in <a href=\"#extra_word_characters\">extra_word_characters</a>, \
			  while the config file used for digging should not.\
			</dd> \
1994 			<dt> \
1995 			speling \
1996 			</dt> \
1997 			<dd> \
1998 			  A simple fuzzy algorithm that tries to find one-off spelling  \
1999 			  mistakes, such as transposition of two letters or an extra character. \
2000 			  Since this usually generates just a few possibilities, it is  \
			  relatively quick. \
			</dd> \
2003 			</dl> \
2004 " }, \
2005 { "search_results_contenttype", "text/html",  \
2006 	"string", "htsearch", "", "all", "Presentation:Files", "search_results_contenttype: text/xml", " \
2007 	This specifies a Content-type to be output as an HTTP header \
2008 	at the start of search results. If set to an empty string, \
2009 	the Content-type header will be omitted altogether. \
2010 " },
2011 { "search_results_footer", "${common_dir}/footer.html",  \
2012 	"string", "htsearch", "", "all", "Presentation:Files", "search_results_footer: /usr/local/etc/ht/end-stuff.html", " \
2013 			This specifies a filename to be output at the end of \
2014 			search results. While outputting the footer, some \
2015 			variables will be expanded. Variables use the same \
2016 			syntax as the Bourne shell. If there is a variable VAR, \
2017 			the following will all be recognized: \
2018 			<ul> \
2019 			  <li> \
2020 				$VAR \
2021 			  </li> \
2022 			  <li> \
2023 				$(VAR) \
2024 			  </li> \
2025 			  <li> \
2026 				${VAR} \
2027 			  </li> \
2028 			</ul> \
	The following variables are available.  See \
	<a href=\"hts_templates.html\">hts_templates.html</a> for a complete \
2031 	list. \
2032 			<dl> \
2033 			  <dt> \
2034 				MATCHES \
2035 			  </dt> \
2036 			  <dd> \
2037 				The number of documents that were matched. \
2038 			  </dd> \
2039 			  <dt> \
2040 				PLURAL_MATCHES \
2041 			  </dt> \
2042 			  <dd> \
2043 				If MATCHES is not 1, this will be the string \"s\", \
2044 				else it is an empty string. This can be used to say \
2045 				something like \"$(MATCHES) \
2046 				document$(PLURAL_MATCHES) were found\" \
2047 			  </dd> \
2048 			  <dt> \
2049 				MAX_STARS \
2050 			  </dt> \
2051 			  <dd> \
2052 				The value of the <a href=\"#max_stars\">max_stars</a> \
2053 				attribute. \
2054 			  </dd> \
2055 			  <dt> \
2056 				LOGICAL_WORDS \
2057 			  </dt> \
2058 			  <dd> \
2059 				A string of the search words with either \"and\" or \
2060 				\"or\" between the words, depending on the type of \
2061 				search. \
2062 			  </dd> \
2063 			  <dt> \
2064 				WORDS \
2065 			  </dt> \
2066 			  <dd> \
2067 				A string of the search words with spaces in \
2068 				between. \
2069 			  </dd> \
2070 			  <dt> \
2071 				PAGEHEADER \
2072 			  </dt> \
2073 			  <dd> \
2074 				This expands to either the value of the \
2075 				<a href=\"#page_list_header\">page_list_header</a> or \
2076 				<a href=\"#no_page_list_header\">no_page_list_header</a> \
2077 				attribute depending on how many pages there are. \
2078 			  </dd> \
2079 			</dl> \
2080 			Note that this file will <strong>NOT</strong> be output \
2081 			if no matches were found. In this case the \
2082 			<a href=\"#nothing_found_file\">nothing_found_file</a> \
2083 			attribute is used instead. \
2084 			Also, this file will not be output if it is \
2085 			overridden by defining the \
2086 			<a href=\"#search_results_wrapper\">search_results_wrapper</a> \
2087 			attribute. \
2088 " }, \
2089 { "search_results_header", "${common_dir}/header.html",  \
2090 	"string", "htsearch", "", "all", "Presentation:Files", "search_results_header: /usr/local/etc/ht/start-stuff.html", " \
2091 			This specifies a filename to be output at the start of \
2092 			search results. While outputting the header, some \
2093 			variables will be expanded. Variables use the same \
2094 			syntax as the Bourne shell. If there is a variable VAR, \
2095 			the following will all be recognized: \
2096 			<ul> \
2097 			  <li> \
2098 				$VAR \
2099 			  </li> \
2100 			  <li> \
2101 				$(VAR) \
2102 			  </li> \
2103 			  <li> \
2104 				${VAR} \
2105 			  </li> \
2106 			</ul> \
	The following variables are available.  See \
	<a href=\"hts_templates.html\">hts_templates.html</a> for a complete \
2109 	list. \
2110 	<!-- Do these need to be listed for both _footer and _header? --> \
2111 			<dl> \
2112 			  <dt> \
2113 				MATCHES \
2114 			  </dt> \
2115 			  <dd> \
2116 				The number of documents that were matched. \
2117 			  </dd> \
2118 			  <dt> \
2119 				PLURAL_MATCHES \
2120 			  </dt> \
2121 			  <dd> \
2122 				If MATCHES is not 1, this will be the string \"s\", \
2123 				else it is an empty string. This can be used to say \
2124 				something like \"$(MATCHES) \
2125 				document$(PLURAL_MATCHES) were found\" \
2126 			  </dd> \
2127 			  <dt> \
2128 				MAX_STARS \
2129 			  </dt> \
2130 			  <dd> \
2131 				The value of the <a href=\"#max_stars\">max_stars</a> \
2132 				attribute. \
2133 			  </dd> \
2134 			  <dt> \
2135 				LOGICAL_WORDS \
2136 			  </dt> \
2137 			  <dd> \
2138 				A string of the search words with either \"and\" or \
2139 				\"or\" between the words, depending on the type of \
2140 				search. \
2141 			  </dd> \
2142 			  <dt> \
2143 				WORDS \
2144 			  </dt> \
2145 			  <dd> \
2146 				A string of the search words with spaces in \
2147 				between. \
2148 			  </dd> \
2149 			</dl> \
2150 			Note that this file will <strong>NOT</strong> be output \
2151 			if no matches were found. In this case the \
2152 			<a href=\"#nothing_found_file\">nothing_found_file</a> \
2153 			attribute is used instead. \
2154 			Also, this file will not be output if it is \
2155 			overridden by defining the \
2156 			<a href=\"#search_results_wrapper\">search_results_wrapper</a> \
2157 			attribute. \
2158 " }, \
2159 { "search_results_order", "", \
2160 	"string list", "htsearch", "", "3.2.0b2", "Searching:Ranking", "search_results_order:  \
2161 	 /docs/|faq.html * /maillist/ /testresults/", " \
2162 	This specifies a list of patterns for URLs in \
2163 	search results.  Results will be displayed in the \
2164 	specified order, with the search algorithm result \
2165 	as the second order.  Remaining areas, that do not \
2166 	match any of the specified patterns, can be placed \
2167 	by using * as the pattern.  If no * is specified, \
2168 	one will be implicitly placed at the end of the \
2169 	list.<br> \
2170 	See also <a href=\"#url_seed_score\">url_seed_score</a>. \
2171 " }, \
2172 { "search_results_wrapper", "",  \
2173 	"string", "htsearch", "", "3.1.0", "Presentation:Files", "search_results_wrapper: ${common_dir}/wrapper.html", " \
2174 	This specifies a filename to be output at the start and \
2175 	end of search results. This file replaces the \
2176 	<a href=\"#search_results_header\">search_results_header</a> and \
2177 	<a href=\"#search_results_footer\">search_results_footer</a> \
2178 	files, with the contents of both in one file, and uses the \
2179 	pseudo-variable <strong>$(HTSEARCH_RESULTS)</strong> as a \
2180 	separator for the header and footer sections. \
2181 	If the filename is not specified, the file is unreadable, \
2182 	or the pseudo-variable above is not found, htsearch reverts \
2183 	to the separate header and footer files instead. \
2184 	While outputting the wrapper, \
2185 	some variables will be expanded, just as for the \
2186 	<a href=\"#search_results_header\">search_results_header</a> and \
2187 	<a href=\"#search_results_footer\">search_results_footer</a> \
2188 	files.<br> \
2189 	Note that this file will <strong>NOT</strong> be output \
2190 	if no matches were found. In this case the \
2191 	<a href=\"#nothing_found_file\">nothing_found_file</a> \
2192 	attribute is used instead. \
2193 " }, \
2194 { "search_rewrite_rules", "",
2195 	"string list", "htsearch", "", "3.1.6", "URLs", "search_rewrite_rules: http://(.*)\\\\.mydomain\\\\.org/([^/]*)  http://\\\\2.\\\\1.com \\<br> \
2196 	       http://www\\\\.myschool\\\\.edu/myorgs/([^/]*)  http://\\\\1.org", " \
2197 	This is a list of pairs, <em>regex</em> <em>replacement</em>, used \
2198 	to rewrite URLs in the search results. The left hand string is a \
2199 	regular expression; the right hand string is a literal string with \
2200 	embedded placeholders for fragments that matched inside brackets in \
2201 	the regular expression. \\0 is the whole matched string, \\1 to \\9 \
	are bracketed substrings. The backslash must be doubled-up in the \
2203 	attribute setting to get past the variable expansion parsing. Rewrite \
2204 	rules are applied sequentially to each URL before it is displayed \
2205 	or checked against the <a href=\"#restrict\">restrict</a> or \
2206 	<a href=\"#exclude\">exclude</a> lists. Rewriting does not stop once a \
2207 	match has been made, so multiple rules may affect a given URL. See \
2208 	also <a href=\"#url_part_aliases\">url_part_aliases</a> which allows \
2209 	URLs to be of one form during indexing and translated for results, \
2210 	and <a href=\"#url_rewrite_rules\">url_rewrite_rules</a> which allows \
2211 	URLs to be rewritten while indexing. \
2212 " },
2213 { "server_aliases", "",  \
2214 	"string list", "htdig", "", "3.1.0b2", "Indexing:Where", "server_aliases: \
2215 				  foo.mydomain.com:80=www.mydomain.com:80 \\<br> \
2216 				  bar.mydomain.com:80=www.mydomain.com:80 \
2217 ", " \
2218 	This attribute tells the indexer that servers have several \
2219 	DNS aliases, which all point to the same machine and are NOT \
2220 	virtual hosts. This allows you to ensure pages are indexed \
2221 	only once on a given machine, despite the alias used in a URL. \
2222 	As shown in the example, the mapping goes from left to right, \
2223 	so the server name on the right hand side is the one that is \
2224 	used. As of version 3.1.3, the port number is optional, and is \
2225 	assumed to be 80 if omitted. There is no easy way to map all \
2226 	ports from one alias to another without listing them all. \
2227 " }, \
2228 { "server_max_docs", "-1",  \
2229 	"integer", "htdig", "Server", "3.1.0b3", "Indexing:Where", "server_max_docs: 50", " \
2230 	This attribute tells htdig to limit the dig to retrieve a maximum \
2231 	number of documents from each server. This can cause \
2232 	unusual behavior on update digs since the old URLs are \
2233 	stored alphabetically. Therefore, update digs will add \
2234 	additional URLs in pseudo-alphabetical order, up to the \
2235 	limit of the attribute. However, it is most useful to \
2236 	partially index a server as the URLs of additional \
2237 	documents are entered into the database, marked as never \
2238 	retrieved.<br> \
2239 	A value of -1 specifies no limit. \
2240 " }, \
2241 { "server_wait_time", "0",  \
2242 	"integer", "htdig", "Server", "3.1.0b3", "Indexing:Connection", "server_wait_time: 20", " \
2243 	This attribute tells htdig to ensure a server has had a \
2244 	delay (in seconds) from the beginning of the last \
2245 	connection. This can be used to prevent \"server abuse\" \
2246 	by digging without delay. It's recommended to set this \
2247 	to 10-30 (seconds) when indexing servers that you don't \
2248 	monitor yourself. Additionally, this attribute can slow \
2249 	down local indexing if set, which may or may not be what \
2250 	you intended. \
2251 " }, \
2252 { "sort", "score",  \
2253 	"string", "htsearch", "", "3.1.0", "Presentation:How", "sort: revtime", " \
2254 	This is the default sorting method that htsearch \
2255 	uses to determine the order in which matches are displayed. \
2256 	The valid choices are: \
2257 	<table border=\"0\"> \
2258 	<tr> \
2259 	<td> \
2260 	<ul> \
2261 	     <li> score </li> \
2262 	     <li> time </li> \
2263 	     <li> title </li> \
2264 	</ul> \
2265 	</td> \
2266 	<td> \
2267 	<ul> \
2268 	     <li> revscore </li> \
2269 	     <li> revtime </li> \
2270 	     <li> revtitle </li> \
2271 	</ul> \
2272 	</td> \
2273 	</tr> \
2274 	</table> \
2275 	This attribute will only be used if the HTML form that \
2276 	calls htsearch didn't have the <strong>sort</strong> \
2277 	value set. The words date and revdate can be used instead \
2278 	of time and revtime, as both will sort by the time that \
2279 	the document was last modified, if this information is \
2280 	given by the server. The default is to sort by the score, \
2281 	which ranks documents by best match. The sort methods that \
2282 	begin with \"rev\" simply reverse the order of the \
2283 	sort. Note that setting this to something other than \
2284 	\"score\" will incur a slowdown in searches. \
2285 " }, \
2286 { "sort_names", "score Score time Time title Title revscore 'Reverse Score' revtime 'Reverse Time' revtitle 'Reverse Title'",  \
2287 	"quoted string list", "htsearch", "", "3.1.0", "Searching:UI", "sort_names: \
2288 				  score 'Best Match' time Newest title A-Z \\<br> \
2289 				  revscore 'Worst Match' revtime Oldest revtitle Z-A \
2290 ", " \
2291 	These values are used to create the <strong> \
2292 	sort</strong> menu. It consists of pairs. The first \
2293 	element of each pair is one of the known sort methods, the \
2294 	second element is the text that will be shown in the \
2295 	menu for that sort method. This text needs to be quoted if \
2296 	it contains spaces. \
2297 	See the <a href=\"hts_selectors.html\">select list documentation</a> \
2298 	for more information on how this attribute is used. \
2299 " }, \
2300 { "soundex_db", "${database_base}.soundex.db",  \
2301 	"string", "htfuzzy htsearch", "", "all", "File Layout", "soundex_db: ${database_base}.snd.db", " \
2302 	The database file used for the fuzzy \"soundex\" search \
2303 	algorithm. This database is created by \
2304 	<a href=\"htfuzzy.html\">htfuzzy</a> and used by \
2305 	<a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \
2306 " }, \
2307 { "star_blank", "${image_url_prefix}/star_blank.gif",  \
2308 	"string", "htsearch", "", "all", "Presentation:Text", "star_blank: http://www.somewhere.org/icons/noelephant.gif", " \
2309 	This specifies the URL to use to display a blank of the \
2310 	same size as the star defined in the \
2311 	<a href=\"#star_image\">star_image</a> attribute or in the \
2312 	<a href=\"#star_patterns\">star_patterns</a> attribute. \
2313 " }, \
2314 { "star_image", "${image_url_prefix}/star.gif",  \
2315 	"string", "htsearch", "", "all", "Presentation:Text", "star_image: http://www.somewhere.org/icons/elephant.gif", " \
2316 	This specifies the URL to use to display a star. This \
2317 	allows you to use some other icon instead of a star. \
2318 	(We like the star...)<br> \
2319 	The display of stars can be turned on or off with the \
2320 	<em><a href=\"#use_star_image\">use_star_image</a></em> \
2321 	attribute and the maximum number of stars that can be \
2322 	displayed is determined by the \
2323 	<em><a href=\"#max_stars\">max_stars</a></em> attribute.<br> \
2324 	Even though the image can be changed, the ALT value \
2325 	for the image will always be a '*'. \
2326 " }, \
2327 { "star_patterns", "",  \
2328 	"string list", "htsearch", "", "3.0", "Presentation:How", "star_patterns: \
2329 				  http://www.sdsu.edu /sdsu.gif \\<br> \
2330 				  http://www.ucsd.edu /ucsd.gif \
2331 ", " \
2332 	This attribute allows the star image to be changed \
2333 	depending on the URL or the match it is used for. This \
2334 	is mainly to make a visual distinction between matches \
2335 	on different web sites. The star image could be \
2336 	replaced with the logo of the company the match refers \
2337 	to.<br> \
2338 	It is advisable to keep all the images the same size \
2339 	in order to line things up properly in a short result \
2340 	listing.<br> \
2341 	The format is simple. It is a list of pairs. The first \
2342 	element of each pair is a pattern, the second element \
2343 	is a URL to the image for that pattern. \
2344 " }, \
2345 { "startday", "",  \
2346 	"integer", "htsearch", "", "3.1.6", "Searching:Method", "startday: 1", " \
2347 	Day component of first date allowed as last-modified date \
	of returned documents. \
	This is most usefully specified as a \
	<a href=\"hts_form.html#startyear\">CGI argument</a>. \
2351 	See also <a href=\"#startyear\">startyear</a>. \
2352 " }, \
2353 { "start_ellipses", "<strong><code>... </code></strong>",  \
2354 	"string", "htsearch", "", "all", "Presentation:Text", "start_ellipses: ...", " \
2355 	When excerpts are displayed in the search output, this \
2356 	string will be prepended to the excerpt if there is \
2357 	text before the text displayed. This is just a visual \
2358 	reminder to the user that the excerpt is only part of \
2359 	the complete document. \
2360 " }, \
2361 { "start_highlight", "<strong>",  \
2362 	"string", "htsearch", "", "3.1.4", "Presentation:Text", "start_highlight: &lt;font color=\"#FF0000\"&gt;", " \
2363 	When excerpts are displayed in the search output, matched \
2364 	words will be highlighted using this string and \
2365 	<a href=\"#end_highlight\"> end_highlight</a>. \
2366 	You should ensure that highlighting tags are balanced, \
2367 	that is, any formatting tags that this string \
2368 	opens should be closed by end_highlight. \
2369 " }, \
2370 { "startmonth", "",  \
2371 	"integer", "htsearch", "", "3.1.6", "Searching:Method", "startmonth: 1", " \
2372 	Month component of first date allowed as last-modified date \
	of returned documents. \
	This is most usefully specified as a \
	<a href=\"hts_form.html#startyear\">CGI argument</a>. \
2376 	See also <a href=\"#startyear\">startyear</a>. \
2377 " }, \
2378 { "start_url", "http://www.htdig.org/",  \
2379 	"string list", "htdig", "", "all", "Indexing:Where", "start_url: http://www.somewhere.org/alldata/index.html", " \
2380 	This is the list of URLs that will be used to start a \
2381 	dig when there was no existing database. Note that \
2382 	multiple URLs can be given here. \
2383 	<br>Note also that the value of <em>start_url</em> \
2384 	will be the default value for \
2385 	<a href=\"#limit_urls_to\">limit_urls_to</a>, so if \
2386 	you set start_url to the URLs for specific files, \
2387 	rather than a site or subdirectory URL, you may need \
2388 	to set limit_urls_to to something less restrictive \
2389 	so htdig doesn't reject links in the documents. \
2390 " }, \
2391 { "startyear", "",  \
2392 	"integer", "htsearch", "", "3.1.6", "Searching:Method", "startyear: 2001", " \
2393 	This specifies the year of the cutoff start date for \
2394 	search results. If the start or end date are specified, \
2395 	only results with a last modified date within this \
2396 	range are shown. If a start or end date is specified, but startyear \
2397 	is not, then it defaults to 1970. \
2398 	See also <a href=\"#startday\">startday</a>, \
2399 	<a href=\"#startmonth\">startmonth</a>, \
2400 	<a href=\"#endday\">endday</a>, \
2401 	<a href=\"#endmonth\">endmonth</a>, \
2402 	<a href=\"#endyear\">endyear</a>. \
	These are most usefully specified as \
	<a href=\"hts_form.html#startyear\">CGI arguments</a>.<br> \
2405 	For each component, if a negative number is given, \
2406 	it is taken as relative to the current date. \
2407 	Relative days can span several months or even years if desired, \
2408 	and relative months can span several years. A startday of \
2409 	-90 will select matching documents modified within \
2410 	the last 90 days. \
2411 " }, \
2412 { "store_phrases", "true",  \
2413 	"boolean", "htdig", "", "3.2.0b5", "Indexing:How", "startyear: false", " \
2414 	Causes htdig to record all occurrences of each word in a document, \
2415 	to allow accurate phrase searches.  If this is false, only the first \
2416 	occurrence of each word will be stored, causing many phrases to be \
2417 	missed. Setting this false increases indexing speed by about 20%, \
2418 	and reduces disk requirements by about 60%.\
2419 " }, \
2420 { "substring_max_words", "25",  \
2421 	"integer", "htsearch", "", "3.0.8b1", "Searching:Method", "substring_max_words: 100", " \
2422 	The Substring <a href=\"#search_algorithm\">fuzzy algorithm</a> \
2423 	could potentially match a \
2424 	very large number of words. This value limits the \
2425 	number of words each substring pattern can match. Note \
2426 	that this does not limit the number of documents that \
2427 	are matched in any way. \
2428 " }, \
2429 { "synonym_db", "${common_dir}/synonyms.db",  \
2430 	"string", "htsearch htfuzzy", "", "3.0", "File Layout", "synonym_db: ${database_base}.syn.db", " \
2431 	Points to the database that <a href=\"htfuzzy.html\"> \
2432 	htfuzzy</a> creates when the <strong>synonyms</strong> \
2433 	algorithm is used.<br> \
2434 	<a href=\"htsearch.html\" target=\"_top\">htsearch</a> \
2435 	uses this to perform synonym dictionary lookups. \
2436 " }, \
2437 { "synonym_dictionary", "${common_dir}/synonyms",  \
2438 	"string", "htfuzzy", "", "3.0", "File Layout", "synonym_dictionary: /usr/dict/synonyms", " \
2439 	This points to a text file containing the synonym \
2440 	dictionary used for the synonyms search algorithm.<br> \
2441 	Each line of this file has at least two words. The \
2442 	first word is the word to replace, the rest of the \
2443 	words are synonyms for that word. \
2444 " }, \
2445 { "syntax_error_file", "${common_dir}/syntax.html",  \
2446 	"string", "htsearch", "", "all", "Presentation:Files", "syntax_error_file: ${common_dir}/synerror.html", " \
2447 	This points to the file which will be displayed if a \
2448 	boolean expression syntax error was found. \
2449 " }, \
2450 { "tcp_max_retries", "1",  \
2451 	"integer", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "tcp_max_retries: 6", " \
	 This option sets the maximum number of attempts when a connection \
	 <A href=\"#timeout\">timeout</A>s. \
	 After all these retries, the connection attempt results in &lt;timed out&gt;. \
2455 " }, \
2456 { "tcp_wait_time", "5",  \
2457 	"integer", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "tcp_wait_time: 10", " \
2458 	 This attribute sets the wait time (in seconds) after a connection \
2459 	 fails and the <A href=\"#timeout\">timeout</A> is raised. \
2460 " }, \
2461 { "template_map", "Long builtin-long builtin-long Short builtin-short builtin-short",  \
2462 	"quoted string list", "htsearch", "", "3.0", "Presentation:Files,Searching:UI", "template_map: \
2463 				  Short short ${common_dir}/short.html \\<br> \
2464 				  Normal normal builtin-long \\<br> \
2465 				  Detailed detail ${common_dir}/detail.html \
2466 ", " \
2467 	This maps match template names to internal names and \
2468 	template file names. It is a list of triplets. The \
2469 	first element in each triplet is the name that will be \
2470 	displayed in the FORMAT menu. The second element is the \
2471 	name used internally and the third element is a \
2472 	filename of the template to use.<br> \
2473 	There are two predefined templates, namely <strong> \
2474 	builtin-long</strong> and <strong> \
2475 	builtin-short</strong>. If the filename is one of \
2476 	those, they will be used instead.<br> \
2477 	More information about templates can be found in the \
2478 	<a href=\"htsearch.html\" target=\"_top\">htsearch</a> \
	documentation.  The particular template is selected by the \
	<a href=\"hts_form.html#format\">format</a> CGI argument, and the \
2481 	default is given by <a href=\"#template_name\">template_name</a> in \
2482 	the config file. \
2483 " }, \
2484 { "template_name", "builtin-long",  \
2485 	"string", "htsearch", "", "3.0", "Searching:UI,Presentation:How", "template_name: long", " \
2486 	Specifies the default template if no \
2487 	<a href=\"hts_form.html#format\">format</a> field is given by the \
2488 	search form. This needs to map to the \
2489 	<a href=\"#template_map\">template_map</a>. \
2490 " }, \
2491 { "template_patterns", "",  \
2492 	"string list", "htsearch", "", "3.1.4", "Presentation:How", "template_patterns: \
2493 				  http://www.sdsu.edu ${common_dir}/sdsu.html \\<br> \
2494 				  http://www.ucsd.edu ${common_dir}/ucsd.html \
2495 ", " \
2496 	This attribute allows the results template to be changed \
2497 	depending on the URL of the match it is used for. This \
2498 	is mainly to make a visual distinction between matches \
2499 	on different web sites. The results for each site could \
2500 	thus be shown in a style matching that site.<br> \
2501 	The format is simply a list of pairs. The first \
2502 	element of each pair is a pattern, the second element \
2503 	is the name of the template file for that pattern.<br> \
2504 	More information about templates can be found in the \
2505 	<a href=\"htsearch.html\" target=\"_top\">htsearch</a> \
2506 	documentation.<br> \
2507 	Normally, when using this template selection method, you \
2508 	would disable user selection of templates via the <strong>format</strong> \
2509 	input parameter in search forms, as the two methods were not \
2510 	really designed to interact. Templates selected by URL patterns \
2511 	would override any user selection made in the form. If you want \
2512 	to use the two methods together, see the notes on \
2513 	<a href=\"hts_selectors.html#template_patterns\">combining</a> \
2514 	them for an example of how to do this. \
2515 " }, \
2516 { "text_factor", "1",  \
2517 	"number", "htsearch", "", "3.0", "Searching:Ranking", "text_factor: 0", " \
2518 	This is a factor which will be used to multiply the \
2519 	weight of words that are not in any special part of a \
2520 	document. Setting a factor to 0 will cause normal words \
2521 	to be ignored. The number may be a floating point \
2522 	number. See also the <a href=\"#heading_factor\"> heading_factor</a> \
2523 	attribute. \
2524 " }, \
2525 { "timeout", "30",  \
2526 	"integer", "htdig", "Server", "all", "Indexing:Connection", "timeout: 42", " \
2527 	Specifies the time the digger will wait to complete a \
2528 	network read. This is just a safeguard against \
2529 	unforeseen things like the all too common \
2530 	transformation from a network to a notwork.<br> \
2531 	The timeout is specified in seconds. \
2532 " }, \
2533 { "title_factor", "100",  \
2534 	"number", "htsearch", "", "all", "Searching:Ranking", "title_factor: 12", " \
2535 	This is a factor which will be used to multiply the \
2536 	weight of words in the title of a document. Setting a \
2537 	factor to 0 will cause words in the title to be \
2538 	ignored. The number may be a floating point number. See \
2539 	also the <a href=\"#heading_factor\"> \
2540 	heading_factor</a> attribute. \
2541 " }, \
2542 { "translate_latin1", "true",  \
2543 	"boolean", "htdig htsearch", "", "3.2.0b5", "Indexing:What", "translate_latin1: false", " \
2544 	If set to false, the SGML entities for ISO-8859-1 (or \
2545 	Latin 1) characters above &amp;nbsp; (or &amp;#160;) \
2546 	will not be translated into their 8-bit equivalents. \
2547 	This attribute should be set to false when using a \
2548 	<a href=\"#locale\">locale</a> that doesn't use the \
2549 	ISO-8859-1 character set, to avoid these entities \
2550 	being mapped to inappropriate 8-bit characters, or \
2551 	perhaps more importantly to avoid 8-bit characters from \
2552 	your locale being mapped back to Latin 1 SGML entities \
2553 	in search results. \
2554 " }, \
2555 { "url_list", "${database_base}.urls",  \
2556 	"string", "htdig", "", "all", "Extra Output", "url_list: /tmp/urls", " \
2557 	This file is only created if \
2558 	<em><a href=\"#create_url_list\">create_url_list</a></em> is set to \
2559 	true. It will contain a list of all URLs that were \
2560 	seen. \
2561 " }, \
2562 { "url_log", "${database_base}.log",  \
2563 	"string", "htdig", "", "3.1.0", "Extra Output", "url_log: /tmp/htdig.progress", " \
2564 	If <a href=\"htdig.html\">htdig</a> is \
2565 	interrupted, it will write out its progress to this \
2566 	file. Note that if it has a large number of URLs to write, \
2567 	it may take some time to exit. This can especially happen \
2568 	when running update digs and the run is interrupted soon \
2569 	after beginning. \
2570 " }, \
2571 { "url_part_aliases", "",  \
2572 	"string list", "all", "", "3.1.0", "URLs", "url_part_aliases: \
2573 				   http://search.example.com/~htdig *site \\<br> \
2574 				   http://www.htdig.org/this/ *1 \\<br> \
2575 				   .html *2 \
2576 url_part_aliases: \
2577 				   http://www.htdig.org/ *site \\<br> \
2578 				   http://www.htdig.org/that/ *1 \\<br> \
2579 				   .htm *2 \
2580 ", " \
2581 	A list of translation pairs <em>from</em> and \
2582 	<em>to</em>, used when accessing the database. \
2583 	If a part of a URL matches with the \
2584 	<em>from</em>-string of each pair, it will be \
2585 	translated into the <em>to</em>-string just before \
2586 	writing the URL to the database, and translated \
2587 	back just after reading it from the database.<br> \
2588 	This is primarily used to provide an easy way to \
2589 	rename parts of URLs for e.g. changing \
2590 	www.example.com/~htdig to www.htdig.org.  Two \
2591 	different configuration files for digging and \
2592 	searching are then used, with url_part_aliases \
2593 	having different <em>from</em> strings, but \
2594 	identical <em>to</em>-strings.<br> \
2595 	See also <a \
2596 	href=\"#common_url_parts\">common_url_parts</a>.<br> \
2597 	Strings that are normally incorrect in URLs or \
2598 	very seldom used, should be used as \
2599 	<em>to</em>-strings, since extra storage will be \
2600 	used each time one is found as normal part of a \
2601 	URL.  Translations will be performed with priority \
2602 	for the leftmost longest match.	 Each \
2603 	<em>to</em>-string must be unique and not be a \
2604 	part of any other <em>to</em>-string.  It also helps \
2605 	to keep the <em>to</em>-strings short to save space \
2606 	in the database. Other than that, the choice of \
2607 	<em>to</em>-strings is pretty arbitrary, as they \
2608 	just provide a temporary, internal encoding in the \
2609 	databases, and none of the characters in these \
2610 	strings have any special meaning.<br> \
2611 	Note that when this attribute is changed, the \
2612 	database should be rebuilt, unless the effect of \
2613 	\"moving\" the affected URLs in the database is \
2614 	wanted, as described above.<br> \
2615 	<strong>Please note:</strong> Don't just copy the \
2616 	example below into a single configuration file. \
2617 	There are two separate settings of \
2618 	<em>url_part_aliases</em> below; the first one is \
2619 	for the configuration file to be used by htdig, \
2620 	htmerge, and htnotify, and the second one is for the \
2621 	configuration file to be used by htsearch. \
2622 	In this example, htdig will encode the URL \
2623 	\"http://search.example.com/~htdig/contrib/stuff.html\" \
2624 	as \"*sitecontrib/stuff*2\" in the databases, and \
2625 	htsearch will decode it as \
2626 	\"http://www.htdig.org/contrib/stuff.htm\".<br> \
2627 	As of version 3.1.6, you can also do more complex \
2628 	rewriting of URLs using \
2629 	<a href=\"#url_rewrite_rules\">url_rewrite_rules</a> and \
2630 	<a href=\"#search_rewrite_rules\">search_rewrite_rules</a>. \
2631 " }, \
2632 { "url_rewrite_rules", "", \
2633     "string list", "htdig", "", "3.2.0b3", "URLs", "url_rewrite_rules:	(.*)\\\\?JServSessionIdroot=.*		\\\\1 \\<br> \
2634 			(.*)\\\\&amp;JServSessionIdroot=.*		\\\\1 \\<br> \
2635 			(.*)&amp;context=.*				\\\\1<br>", " \
2636 	This is a list of pairs, <em>regex</em> <em>replacement</em> used to \
2637 	permanently rewrite URLs as they are indexed. The left hand string is \
2638 	a regular expression; the right hand string is  a literal string with \
2639 	embedded placeholders for fragments that matched  inside brackets in \
2640 	the regex. \\0 is the whole matched string, \\1 to \\9 are  bracketed \
2641 	substrings. Note that the <strong>entire</strong> URL is replaced by \
2642 	the right hand string (not just the portion which matches the left hand\
2643 	string).  Thus, a leading and trailing (.*) should be included in the \
2644 	pattern, with matching placeholders in the replacement string.<br> \
2645 	Rewrite rules are applied sequentially to each  \
2646 	incoming URL before normalization occurs. Rewriting does not stop \
2647 	once a match has been made, so multiple rules may affect a given URL. \
2648 	See also <a href=\"#url_part_aliases\">url_part_aliases</a> which \
2649 	allows URLs to be of one  \
2650 form during indexing and translated for results. \
2651 "}, \
2652 { "url_seed_score", "", \
2653     "string list", "htsearch", "", "3.2.0b2", "Searching:Ranking", "url_seed_score:  \
2654 	      /mailinglist/ *.5-1e6 <br> \
2655 	      /docs/|/news/ *1.5 <br> \
2656 	      /testresults/ &quot;*.7 -200&quot; <br> \
2657 	      /faq-area/ *2+10000", " \
2658 	This is a list of pairs, <em>pattern</em> \
2659 	<em>formula</em>, used to weigh the score of \
2660 	hits, depending on the URL of the document.<br> \
2661 	The <em>pattern</em> part is a substring to match \
2662 	against the URL.  Pipe ('|') characters can be \
2663 	used in the pattern to concatenate substrings for \
2664 	web-areas that have the same formula.<br> \
2665 	The formula describes a <em>factor</em> and a \
2666 	<em>constant</em>, by which the hit score is \
2667 	weighed.  The <em>factor</em> part is multiplied \
2668 	to the original score, then the <em>constant</em> \
2669 	part is added.<br> \
2670 	The format of the formula is the factor part: \
2671 	&quot;*<em>N</em>&quot; optionally followed by comma and \
2672 	spaces, followed by the constant part : \
2673 	&quot;+<em>M</em>&quot;, where the plus sign may be omitted \
2674 	for negative numbers.  Either part is optional, \
2675 	but must come in this order.<br> \
2676 	The numbers <em>N</em> and <em>M</em> are floating \
2677 	point constants.<br> \
2678 	More straightforward is to think of the format as \
2679 	&quot;newscore = oldscore*<em>N</em>+<em>M</em>&quot;, \
2680 	but with the &quot;newscore = oldscore&quot; part left out. \
2681 " }, \
2682 { "url_text_factor", "1",  \
2683 	"number", "htsearch", "", "??", "Searching:Ranking", "url_text_factor: 1", " \
2684 	TO BE COMPLETED<br> \
2685 	See also <a href=\"#heading_factor\">heading_factor</a>. \
2686 " }, \
2687 { "use_doc_date", "false",  \
2688 	"boolean", "htdig", "", "3.2.0b1", "Indexing:How", "use_doc_date: true", " \
2689 	If set to true, htdig will use META date tags in documents, \
2690 	overriding the modification date returned by the server. \
2691 	Any documents that do not have META date tags will retain \
2692 	the last modified date returned by the server or found on \
2693 	the local file system. \
2694 	As of version 3.1.6, in addition to META date tags, htdig will also \
2695 	recognize dc.date, dc.date.created and dc.date.modified. \
2696 " }, \
2697 { "use_meta_description", "false",  \
2698 	"boolean", "htsearch", "", "3.1.0b1", "Presentation:How", "use_meta_description: true", " \
2699 	If set to true, any META description tags will be used as \
2700 	excerpts by htsearch. Any documents that do not have META \
2701 	descriptions will retain their normal excerpts. \
2702 " }, \
2703 { "use_star_image", "true",  \
2704 	"boolean", "htsearch", "", "all", "Presentation:How", "use_star_image: no", " \
2705 	If set to true, the <em><a href=\"#star_image\"> \
2706 	star_image</a></em> attribute is used to display up to \
2707 	<em><a href=\"#max_stars\">max_stars</a></em> images for \
2708 	each match. \
2709 " }, \
2710 { "user_agent", "htdig",  \
2711 	"string", "htdig", "Server", "3.1.0b2", "Indexing:Out", "user_agent: htdig-digger", " \
2712 	This allows customization of the user_agent: field sent when \
2713 	the digger requests a file from a server. \
2714 " }, \
2715 { "valid_extensions", "",  \
2716 	"string list", "htdig", "URL", "3.1.4", "Indexing:Where", "valid_extensions: .html .htm .shtml", " \
2717 	This is a list of extensions on URLs which are \
2718 	the only ones considered acceptable. This list is used to \
2719 	supplement the MIME-types that the HTTP server provides \
2720 	with documents. Some HTTP servers do not have a correct \
2721 	list of MIME-types and so can advertise certain \
2722 	documents as text while they are some binary format. \
2723 	If the list is empty, then all extensions are acceptable, \
2724 	provided they pass other criteria for acceptance or rejection. \
2725 	If the list is not empty, only documents with one of the \
2726 	extensions in the list are parsed. \
2727 	See also <a href=\"#bad_extensions\">bad_extensions</a>. \
2728 " }, \
2729 { "valid_punctuation", ".-_/!#\\$%^&'",  \
2730 	"string", "htdig htsearch", "", "all", "Indexing:What", "valid_punctuation: -'", " \
2731 	This is the set of characters which may be deleted \
2732 	from the document before determining what a word is. \
2733 	This means that if a document contains something like \
2734 	<code>half-hearted</code> the digger will see this as the three \
2735 	words <code> half</code>, <code>hearted</code> and \
2736 	<code>halfhearted</code>.<br> \
2737 	These characters are also removed before keywords are passed to the \
2738 	search engine, so a search for \"half-hearted\" works as expected.<br> \
2739 	Note that the dollar sign ($) and backslash (\\) must be escaped by a \
2740 	backslash in both valid_punctuation and extra_word_characters. \
2741 	Moreover, the backslash should not be the last character on the line. \
2742 	There is currently no way to include a back-quote (`) in \
2743 	extra_word_characters or valid_punctuation.<br> \
2744 	See also the \
2745 	<a href=\"#extra_word_characters\">extra_word_characters</a> \
2746 	and <a href=\"#allow_numbers\">allow_numbers</a> \
2747 	attributes.  \
2748 " }, \
2749 { "version", VERSION,  \
2750 	"string", "htsearch", "", "all", "Presentation:Text", "version: 3.2.0", " \
2751 	This specifies the value of the VERSION \
2752 	variable which can be used in search templates. \
2753 	The default value of this attribute is determined \
2754 	at compile time, and will not normally be set \
2755 	in configuration files. \
2756 " }, \
2757 { "word_db", "${database_base}.words.db",  \
2758 	"string", "all", "", "all", "File Layout", "word_db: ${database_base}.allwords.db", " \
2759 	This is the main word database. It is an index of all \
2760 	the words to a list of documents that contain the \
2761 	words. This database can grow large pretty quickly. \
2762 " }, \
2763 { "word_dump", "${database_base}.worddump",  \
2764 	"string", "htdig htdump htload", "", "3.2.0b1", "File Layout", "word_dump: /tmp/words.txt", " \
2765 	This file is basically a text version of the file \
2766 	specified in <em><a href=\"#word_db\">word_db</a></em>. Its \
2767 	only use is to have a human readable database of all \
2768 	words. The file is easy to parse with tools like \
2769 	perl or tcl. \
2770 " }, \
2771 { "wordlist_cache_inserts", "false",  \
2772 	"boolean", "???", "", "???", "Indexing:How", "wordlist_cache_inserts: true", " \
2773 	 If true, create a cache of size  wordlist_cache_size/2  for class \
2774 	 WordListOne. <em>I don't know what this is for.  Does anyone?</em> \
2775 " }, \
2776 { "wordlist_cache_size", "10000000",  \
2777 	"integer", "all", "", "3.2.0b1", "Indexing:How", "wordlist_cache_size: 40000000", " \
2778 	Size (in bytes) of memory cache used by Berkeley DB (DB used by the indexer). \
2779 	IMPORTANT: It  makes a <strong>huge</strong> difference. The rule  \
2780 	is that the cache size should be at least 2% of the expected index size. The \
2781 	Berkeley DB file has 1% of internal pages that <em>must</em> be cached for good \
2782 	performance. Giving an additional 1% leaves room for caching leaf pages. \
2783 " }, \
2784 { "wordlist_compress", "true",  \
2785 	"boolean", "all", "", "3.2.0b1", "Indexing:How", "wordlist_compress: false", " \
2786 	Enables or disables the default compression system for the indexer. \
2787 	This currently attempts to compress the index by a factor of 8. If the \
2788 	Zlib library is not found on the system, the default is false. \
2789 " }, \
2790 { "wordlist_compress_zlib", "true",  \
2791 	"boolean", "all", "", "3.2.0b4", "Indexing:How", "wordlist_compress_zlib: false", " \
2792 	Enables or disables the zlib compression system for the indexer. \
2793 	Both <a href=\"#wordlist_compress\">wordlist_compress</a> and \
2794 	<a href=\"#compression_level\">compression_level</a> must be true \
2795 	(non-zero) to use this option!\
2796 " }, \
2797 { "wordlist_monitor", "false", \
2798 	"boolean", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor: true", " \
2799 	This enables monitoring of what's happening in the indexer. \
2800 	It can help to detect performance/configuration problems. \
2801 " }, \
2802 { "wordlist_monitor_period","0", \
2803 	"number", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor_period: .1", " \
2804 	Sets the number of seconds between each monitor output. \
2805 " }, \
2806 { "wordlist_monitor_output","", \
2807 	"string", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor_output: myfile", " \
2808 	Print monitoring output on file instead of the default stderr. \
2809 " },
2810 { "wordlist_page_size", "0",  \
2811 	"integer", "all", "", "3.2.0b1", "Indexing:How", "wordlist_page_size: 8192", " \
2812 	Size (in bytes) of pages used by Berkeley DB (DB used by the indexer). \
2813 	Must be a power of two. \
2814 " }, \
2815 { "wordlist_verbose", "",  \
2816 	"integer", "", "", "", "", "wordlist_verbose: true", " \
2817 	wordlist_verbose 1 walk logic<br>    \
2818 	wordlist_verbose 2 walk logic details<br>    \
2819 	wordlist_verbose 3 walk logic lots of details<br>    \
2820 " }, \
2821 { "wordlist_wordkey_description", "Word/DocID 32/Flags 8/Location 16", \
2822 	"string", "all", "", "3.2.0b1", "Indexing:How", "**this should not be configured by user**", " \
2823 	Internal key description: *not user configurable* \
2824 " }, \
2825 { "wordlist_wordrecord_description", "DATA", \
2826 	"string", "all", "", "3.2.0b1", "Indexing:How", "**this should not be configured by user**", " \
2827 	Internal data description: *not user configurable* \
2828 " }, \
2829 {0, 0, 0, 0, 0, 0, 0, 0, 0}
2830 };
2831 
2832 HtConfiguration	config;	// Global HtConfiguration object defined alongside the defaults[] table (presumably seeded from it by the ht programs -- confirm at call sites)
2833