1 // 2 // defaults.cc 3 // 4 // defaults: default values for the ht programs through the 5 // HtConfiguration class 6 // 7 // Part of the ht://Dig package <http://www.htdig.org/> 8 // Copyright (c) 1995-2004 The ht://Dig Group 9 // For copyright details, see the file COPYING in your distribution 10 // or the GNU Library General Public License (LGPL) version 2 or later 11 // <http://www.gnu.org/copyleft/lgpl.html> 12 // 13 // $Id: defaults.cc,v 1.112 2004/06/12 13:39:12 lha Exp $ 14 // 15 16 #ifdef HAVE_CONFIG_H 17 #include "htconfig.h" 18 #endif /* HAVE_CONFIG_H */ 19 20 #include "HtConfiguration.h" 21 22 // Fields and their values: 23 // Attribute name 24 // Default value ("" becomes "no default" in .html docs) 25 // Type (boolean, number, integer, string, string list, quoted string list, 26 // pattern list) 27 // Commands using attribute (all, htdig, htsearch, htfuzzy, 28 // htdump, htload, htnotify, htpurge) 29 // Block (Global, Server, URL) 30 // Versions for which attribute is present 31 // Class (Extra Output, External:Parsers, External:Protocols, 32 // File Layout, 33 // Indexing:Connection, Indexing:Out, Indexing:What,Indexing:Where, 34 // Presentation:Files, Presentation:How, Presentation:Text, 35 // Searching:Method, Searching:Ranking, Searching:UI, 36 // URLs) 37 // Example 38 // Description 39 40 ConfigDefaults defaults[] = 41 { 42 43 { "accents_db", "${database_base}.accents.db", \ 44 "string", "htfuzzy htsearch", "", "all", "File Layout", "accents_db: ${database_base}.uml.db", " \ 45 The database file used for the fuzzy \"accents\" search \ 46 algorithm. This database is created by \ 47 <a href=\"htfuzzy.html\">htfuzzy</a> and used by \ 48 <a href=\"htsearch.html\" target=\"_top\">htsearch</a>. 
\ 49 " }, \ 50 { "accept_language", "", \ 51 "string list", "htdig", "Server", "3.2.0b4", "Indexing:Out", "accept_language: en-us en it", " \ 52 This attribute allows you to restrict the set of natural languages \ 53 that are preferred as a response to an HTTP request performed by the \ 54 digger. This can be done by putting one or more language tags \ 55 (as defined by RFC 1766) in the preferred order, separated by spaces. \ 56 By doing this, when the server performs a content negotiation based \ 57 on the 'accept-language' given by the HTTP user agent, a different \ 58 content can be shown depending on the value of this attribute. If \ 59 set to an empty list, no language will be sent and the server default \ 60 will be returned. \ 61 " }, \ 62 { "add_anchors_to_excerpt", "true", \ 63 "boolean", "htsearch", "", "3.1.0", "Presentation:How", "add_anchors_to_excerpt: no", " \ 64 If set to true, the first occurrence of each matched \ 65 word in the excerpt will be linked to the closest \ 66 anchor in the document. This only has effect if the \ 67 <strong>EXCERPT</strong> variable is used in the output \ 68 template and the excerpt is actually going to be displayed. \ 69 " }, \ 70 { "allow_double_slash", "false", \ 71 "boolean", "htdig", "", "3.2.0b4", "Indexing:Out", "allow_double_slash: true", " \ 72 If set to true, strings of multiple slashes ('/') in URL paths \ 73 will be left intact, rather than being collapsed. This is necessary \ 74 for some search engine URLs which use slashes to separate fields rather \ 75 than to separate directory components. However, it can lead to multiple database \ 76 entries refering to the same file, and it causes '/foo//../' to \ 77 be equivalent to '/foo/', rather than to '/'. 
\ 78 " }, \ 79 { "allow_in_form", "", \ 80 "string list", "htsearch", "", "3.1.0", "Searching:UI", "allow_in_form: search_algorithm search_results_header", " \ 81 Allows the specified config file attributes to be specified \ 82 in search forms as separate fields. This could be used to \ 83 allow form writers to design their own headers and footers \ 84 and specify them in the search form. Another example would \ 85 be to offer a menu of search_algorithms in the form. \ 86 <table> \ 87 <tr> \ 88 <td nowrap> \ 89 <code> \ 90 <SELECT NAME=\"search_algorithm\"><br> \ 91 <OPTION VALUE=\"exact:1 prefix:0.6 synonyms:0.5 endings:0.1\" SELECTED>fuzzy<br> \ 92 <OPTION VALUE=\"exact:1\">exact<br> \ 93 </SELECT> \ 94 </code></td> \ 95 </tr> \ 96 </table> \ 97 The general idea behind this is to make an input parameter out \ 98 of any configuration attribute that's not already automatically \ 99 handled by an input parameter. You can even make up your own \ 100 configuration attribute names, for purposes of passing data from \ 101 the search form to the results output. You're not restricted to \ 102 the existing attribute names. The attributes listed in the \ 103 allow_in_form list will be settable in the search form using \ 104 input parameters of the same name, and will be propagated to \ 105 the follow-up search form in the results template using template \ 106 variables of the same name in upper-case. \ 107 You can also make select lists out of any of these input \ 108 parameters, in the follow-up search form, using the \ 109 <a href=\"#build_select_lists\">build_select_lists</a> \ 110 configuration attribute. \ 111 <br>WARNING: Extreme care are should be taken with this option, as \ 112 allowing CGI scripts to set file names can open security holes.\ 113 " }, \ 114 { "allow_numbers", "false", \ 115 "boolean", "htdig htsearch", "", "all", "Indexing:What", "allow_numbers: true", " \ 116 If set to true, numbers are considered words. 
This \ 117 means that searches can be done on strings of digits as well as \ 118 regular words. All the same rules apply to numbers as \ 119 to words. This does not cause numbers containing a decimal point or \ 120 commas to be treated as a single entity. \ 121 When allow_numbers is false, words are stil \ 122 allowed to contain digits, but they must also contain at \ 123 least one alphabetic character or \ 124 <a href=\"#extra_word_characters\">extra word</a> character. \ 125 To disallow digits in words, add the digits to \ 126 <a href=\"#valid_punctuation\">valid_punctuation</a>. \ 127 " }, \ 128 { "allow_space_in_url", "false", \ 129 "boolean", "htdig", "", "3.2.0b6", "Indexing:Where", "allow_space_in_url: true", " \ 130 If set to true, htdig will handle URLs that contain \ 131 embedded spaces. Technically, this is a violation of \ 132 RFC 2396, which says spaces should be stripped out \ 133 (as htdig does by default). However, many web browsers \ 134 and HTML code generators violate this standard already, \ 135 so enabling this attribute allows htdig to handle these \ 136 non-compliant URLs. Even with this attribute set, htdig \ 137 still strips out all white space (leading, trailing and \ 138 embedded), except that space characters embedded within \ 139 the URL will be encoded as %20. \ 140 " }, \ 141 { "allow_virtual_hosts", "true", \ 142 "boolean", "htdig", "", "3.0.8b2", "Indexing:Where", "allow_virtual_hosts: false", " \ 143 If set to true, htdig will index virtual web sites as \ 144 expected. If false, all URL host names will be \ 145 normalized into whatever the DNS server claims the IP \ 146 address to map to. If this option is set to false, \ 147 there is no way to index either \"soft\" or \"hard\" \ 148 virtual web sites. 
\ 149 " }, \ 150 { "anchor_target", "", \ 151 "string", "htsearch", "", "3.1.6", "Presentation:How", "anchor_target: body", " \ 152 When the first matched word in the excerpt is linked \ 153 to the closest anchor in the document, this string \ 154 can be set to specify a target in the link so the \ 155 resulting page is displayed in the desired frame. \ 156 This value will only be used if the \ 157 <a href=\"#add_anchors_to_excerpt\">add_anchors_to_excerpt</a> \ 158 attribute is set to true, the <strong>EXCERPT</strong> \ 159 variable is used in the output template and the \ 160 excerpt is actually displayed with a link. \ 161 " }, \ 162 { "any_keywords", "false", \ 163 "boolean", "htsearch", "", "3.2.0b2", "Searching:Method", "any_keywords: yes", " \ 164 If set to true, the words in the <strong>keywords</strong> \ 165 input parameter in the search form will be joined with logical \ 166 ORs rather than ANDs, so that any of the words provided will do. \ 167 Note that this has nothing to do with limiting the search to \ 168 words in META keywords tags. See the <a href=\"hts_form.html\"> \ 169 search form</a> documentation for details on this. \ 170 " }, \ 171 { "author_factor", "1", \ 172 "number", "htsearch", "", "3.2.0b4", "Searching:Ranking", "author_factor: 1", " \ 173 Weighting applied to words in a <meta name=\"author\" ... > \ 174 tag.<br> \ 175 See also <a href=\"#heading_factor\">heading_factor</a>. \ 176 " }, \ 177 { "authorization", "", \ 178 "string", "htdig", "URL", "3.1.4", "Indexing:Out", "authorization: myusername:mypassword", " \ 179 This tells htdig to send the supplied \ 180 <em>username</em><strong>:</strong><em>password</em> with each HTTP request. \ 181 The credentials will be encoded using the \"Basic\" authentication \ 182 scheme. 
There <em>must</em> be a colon (:) between the username and \ 183 password.<br> \ 184 This attribute can also be specified on htdig's command line using \ 185 the -u option, and will be blotted out so it won't show up in a \ 186 process listing. If you use it directly in a configuration file, \ 187 be sure to protect it so it is readable only by you, and do not \ 188 use that same configuration file for htsearch. \ 189 " }, \ 190 { "backlink_factor", "0.1", \ 191 "number", "htsearch", "", "3.1.0", "Searching:Ranking", "backlink_factor: 501.1", " \ 192 This is a weight of \"how important\" a page is, based on \ 193 the number of URLs pointing to it. It's actually \ 194 multiplied by the ratio of the incoming URLs (backlinks) \ 195 and outgoing URLs (links on the page), to balance out pages \ 196 with lots of links to pages that link back to them. The ratio \ 197 gives lower weight to \"link farms\", which often have many \ 198 links to them. This factor can \ 199 be changed without changing the database in any way. \ 200 However, setting this value to something other than 0 \ 201 incurs a slowdown on search results. \ 202 " }, \ 203 { "bad_extensions", ".wav .gz .z .sit .au .zip .tar .hqx .exe .com .gif .jpg .jpeg .aiff .class .map .ram .tgz .bin .rpm .mpg .mov .avi .css", \ 204 "string list", "htdig", "URL", "all", "Indexing:Where", "bad_extensions: .foo .bar .bad", " \ 205 This is a list of extensions on URLs which are \ 206 considered non-parsable. This list is used mainly to \ 207 supplement the MIME-types that the HTTP server provides \ 208 with documents. Some HTTP servers do not have a correct \ 209 list of MIME-types and so can advertise certain \ 210 documents as text while they are some binary format. \ 211 If the list is empty, then all extensions are acceptable, \ 212 provided they pass other criteria for acceptance or rejection. \ 213 See also <a href=\"#valid_extensions\">valid_extensions</a>. 
\ 214 " }, \ 215 { "bad_local_extensions", ".php .shtml .cgi", \ 216 "string list", "htdig", "URL", "all", "Indexing:Where", "bad_extensions: .foo .bar .bad", " \ 217 This is a list of extensions on URLs which must be retrieved \ 218 using the URL's true transport mechanism (such as HTTP). \ 219 If <a href=\"#local_urls\">local_urls</a> is specified, URLs not \ 220 ending with these extensions may instead be retrieved through \ 221 the local filesystem for efficiency. \ 222 " }, 223 { "bad_querystr", "", \ 224 "pattern list", "htdig", "URL", "3.1.0", "Indexing:Where", "bad_querystr: forum=private section=topsecret&passwd=required", " \ 225 This is a list of CGI query strings to be excluded from \ 226 indexing. This can be used in conjunction with CGI-generated \ 227 portions of a website to control which pages are \ 228 indexed. \ 229 " }, \ 230 { "bad_word_list", "${common_dir}/bad_words", \ 231 "string", "htdig htsearch", "", "all", "Indexing:What,Searching:Method", "bad_word_list: ${common_dir}/badwords.txt", " \ 232 This specifies a file which contains words which should \ 233 be excluded when digging or searching. This list should \ 234 include the most common words or other words that you \ 235 don't want to be able to search on (things like <em> \ 236 sex</em> or <em>smut</em> are examples of these.)<br> \ 237 The file should contain one word per line. A sample \ 238 bad words file is located in the <code>contrib/examples</code> \ 239 directory. \ 240 " }, \ 241 { "bin_dir", BIN_DIR, \ 242 "string", "all", "", "all", "File Layout", "bin_dir: /usr/local/bin", " \ 243 This is the directory in which the executables \ 244 related to ht://Dig are installed. It is never used \ 245 directly by any of the programs, but other attributes \ 246 can be defined in terms of this one. \ 247 <p> \ 248 The default value of this attribute is determined at \ 249 compile time. 
\ 250 </p> \ 251 " }, \ 252 { "boolean_keywords", "and or not", \ 253 "string list", "htsearch", "", "3.1.6", "Presentation:How", "boolean_keywords: et ou non", " \ 254 These three strings are used as the keywords used in \ 255 constructing the \ 256 <a href=\"hts_templates.html#LOGICAL_WORDS\">LOGICAL_WORDS</a> \ 257 template variable, \ 258 and in parsing the <a href=\"hts_form.html#words\">words</a> input \ 259 parameter when the <a href=\"hts_form.html#method\">method</a> \ 260 parameter or <a href=\"#match_method\">match_method</a> attribute \ 261 is set to <code>boolean</code>. \ 262 See also the \ 263 <a href=\"#boolean_syntax_errors\">boolean_syntax_errors</a> attribute. \ 264 " }, 265 { "boolean_syntax_errors", "Expected \ 266 'a search word, a quoted phrase or a boolean expression between ()' \ 267 'at the end' 'instead of' 'end of expression' quotes", \ 268 "quoted string list", "htsearch", "", "3.1.6", "Presentation:How", 269 "boolean_syntax_errors: Attendait \"un mot\" \"à la fin\" \ 270 \"au lieu de\" \"fin d'expression\" \"guillemet\"", " \ 271 These six strings are used as the keywords used to \ 272 construct various syntax error messages for errors encountered in \ 273 parsing the <a href=\"hts_form.html#words\">words</a> input \ 274 parameter when the <a href=\"hts_form.html#method\">method</a> parameter \ 275 or <a href=\"#match_method\">match_method</a> attribute \ 276 is set to <code>boolean</code>. \ 277 They are used in conjunction with the \ 278 <a href=\"#boolean_keywords\">boolean_keywords</a> attribute, and \ 279 comprise all \ 280 English-specific parts of these error messages. The order in which \ 281 the strings are put together may not be ideal, or even gramatically \ 282 correct, for all languages, but they can be used to make fairly \ 283 intelligible messages in many languages. 
\ 284 " }, 285 { "build_select_lists", "", \ 286 "quoted string list", "htsearch", "", "3.2.0b1", "Searching:UI", "build_select_lists: \ 287 MATCH_LIST matchesperpage matches_per_page_list \\<br> \ 288 1 1 1 matches_per_page \"Previous Amount\" \\<br> \ 289 RESTRICT_LIST,multiple restrict restrict_names 2 1 2 restrict \"\" \\<br> \ 290 FORMAT_LIST,radio format template_map 3 2 1 template_name \"\"", " \ 291 This list allows you to define any htsearch input parameter as \ 292 a select list for use in templates, provided you also define \ 293 the corresponding name list attribute which enumerates all the \ 294 choices to put in the list. It can be used for existing input \ 295 parameters, as well as any you define using the \ 296 <a href=\"#allow_in_form\">allow_in_form</a> \ 297 attribute. The entries in this list each consist of an octuple, \ 298 a set of eight strings defining the variables and how they are to \ 299 be used to build a select list. The attribute can contain many \ 300 of these octuples. The strings in the string list are merely \ 301 taken eight at a time. 
For each octuple of strings specified in \ 302 build_select_lists, the elements have the following meaning: \ 303 <ol> \ 304 <li>the name of the template variable to be defined as a list, \ 305 optionally followed by a comma and the type of list, and \ 306 optional formatting codes \ 307 <li>the input parameter name that the select list will set \ 308 <li>the name of the user-defined attribute containing the \ 309 name list \ 310 <li>the tuple size used in the name list above \ 311 <li>the index into a name list tuple for the value \ 312 <li>the index for the corresponding label on the selector \ 313 <li>the configuration attribute where the default value for \ 314 this input parameter is defined \ 315 <li>the default label, if not an empty string, which will be \ 316 used as the label for an additional list item for the current \ 317 input parameter value if it doesn't match any value in the \ 318 given list \ 319 </ol> \ 320 See the <a href=\"hts_selectors.html\">select list documentation</a> \ 321 for more information on this attribute. \ 322 " }, \ 323 { "caps_factor", "1", \ 324 "number", "htsearch", "", "??", "Searching:Ranking", "caps_factor: 1", " \ 325 TO BE COMPLETED<br> \ 326 See also <a href=\"#heading_factor\">heading_factor</a>. \ 327 " }, \ 328 { "case_sensitive", "true", \ 329 "boolean", "htdig", "", "3.1.0b2", "Indexing:Where", "case_sensitive: false", " \ 330 This specifies whether ht://Dig should consider URLs \ 331 case-sensitive or not. If your server is case-insensitive, \ 332 you should probably set this to false. <br> \ 333 Even if this is false, \ 334 <a href=\"#common_url_parts\">common_url_parts</a>, \ 335 <a href=\"#url_part_aliases\">url_part_aliases</a> and \ 336 <a href=\"#url_rewrite_rules\">url_rewrite_rules</a> \ 337 are all still case sensitive, and \ 338 <a href=\"#server_aliases\">server_aliases</a> \ 339 is still case insensitive. 
\ 340 " }, \ 341 { "check_unique_date", "false", \ 342 "boolean", "htdig", "Global", "3.2.0b3", "", "check_unique_date: false", " \ 343 Include the modification date of the page in the MD5 hash, to reduce the \ 344 problem with identical but physically separate pages in different parts of the tree pointing to \ 345 different pages. \ 346 " }, \ 347 { "check_unique_md5", "false", \ 348 "boolean", "htdig", "Global", "3.2.0b3", "", "check_unique_md5: false", " \ 349 Uses the MD5 hash of pages to reject aliases, prevents multiple entries \ 350 in the index caused by such things as symbolic links \ 351 Note: May not do the right thing for incremental update \ 352 " }, \ 353 { "collection_names", "", \ 354 "string list", "htsearch", "", "3.2.0b2", "", "collection_names: htdig_docs htdig_bugs", " \ 355 This is a list of config file names that are used for searching multiple databases. \ 356 Simply put, htsearch will loop through the databases specified by each of these config \ 357 files and present the result of the search on all of the databases. \ 358 The corresponding config files are looked up in the <a href=\"#config_dir\">config_dir</a> directory. \ 359 Each listed config file <strong>must</strong> exist, as well as the corresponding databases. \ 360 " }, \ 361 { "common_dir", COMMON_DIR, \ 362 "string", "all", "", "all", "File Layout", "common_dir: /tmp", " \ 363 Specifies the directory for files that will or can be \ 364 shared among different search databases. The default \ 365 value for this attribute is defined at compile time. \ 366 " }, \ 367 { "common_url_parts", "http:// http://www. ftp:// ftp://ftp. /pub/ .html .htm .shtml /index.html /index.htm .com/ .com mailto:", \ 368 "string list", "all", "", "3.1.0", "URLs", "common_url_parts: http://www.htdig.org/ml/ \\<br> \ 369 .html \\<br> \ 370 http://dev.htdig.org/ \\<br> \ 371 http://www.htdig.org/", " \ 372 Sub-strings often found in URLs stored in the \ 373 database. 
These are replaced in the database by an \ 374 internal space-saving encoding. If a string \ 375 specified in <a href=\"#url_part_aliases\">url_part_aliases</a>, \ 376 overlaps any string in common_url_parts, the \ 377 common_url_parts string is ignored.<br> \ 378 Note that when this attribute is changed, the \ 379 database should be rebuilt, unless the effect of \ 380 \"changing\" the affected URLs in the database is \ 381 wanted.<br> \ 382 " }, \ 383 { "compression_level", "6", \ 384 "integer", "htdig", "", "3.1.0", "Indexing:How", "compression_level: 0", " \ 385 If non-zero and the \ 386 <a href=\"http://www.cdrom.com/pub/infozip/zlib/\">zlib</a> \ 387 compression library was available when compiled, \ 388 this attribute controls the amount of compression used in the \ 389 <a href=\"#doc_excerpt\">doc_excerpt</a> file. \ 390 <br/>This must be in the range 0-9, and must be non-zero when \ 391 <a href=\"#wordlist_compress_zlib\">wordlist_compress_zlib</a> \ 392 is used. \ 393 " }, \ 394 { "config", "", \ 395 "string", "all", "", "??", "File Layout", "", " \ 396 Name of configuration file to load. \ 397 For security reasons, restrictions are placed on the values which \ 398 can be specified on the command line to \ 399 <a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \ 400 The default value of this attribute is determined at \ 401 compile time. \ 402 " }, \ 403 { "config_dir", CONFIG_DIR, \ 404 "string", "all", "", "all", "File Layout", "config_dir: /var/htdig/conf", " \ 405 This is the directory which contains all configuration \ 406 files related to ht://Dig. It is never used \ 407 directly by any of the programs, but other attributes \ 408 or the <a href=\"#include\">include</a> directive \ 409 can be defined in terms of this one. \ 410 <p> \ 411 The default value of this attribute is determined at \ 412 compile time. 
\ 413 </p> \ 414 " }, 415 { "content_classifier", "${bin_dir}/HtFileType", \ 416 "string", "htdig", "", "3.2.0b4", "Indexing:What", "content_classifier: file -i -b", " \ 417 When ht://Dig can't determine the type of a <code>file://</code> \ 418 URL from its extension, this program is used to determine the type. \ 419 The program is called with one argument, the name of (possibly a \ 420 temporary copy of) the file. \ 421 <p> \ 422 See also <a href=\"#mime_types\">mime_types</a>.\ 423 </p> \ 424 " }, \ 425 { "cookies_input_file", "", \ 426 "string", "htdig", "", "3.2.0b4", "Indexing:Connection", "cookies_input_file: ${common_dir}/cookies.txt", " \ 427 Specifies the location of the file used for importing cookies \ 428 for the crawl. These cookies will be preloaded into htdig's \ 429 in-memory cookie jar, but aren't written back to the file. \ 430 Cookies are specified according to Netscape's format \ 431 (tab-separated fields). If this attribute is left blank, \ 432 no cookie file will be read. \ 433 For more information, see the sample cookies.txt file in the \ 434 ht://Dig source distribution. \ 435 " }, \ 436 { "create_image_list", "false", \ 437 "boolean", "htdig", "", "all", "Extra Output", "create_image_list: yes", " \ 438 If set to true, a file with all the image URLs that \ 439 were seen will be created, one URL per line. This list \ 440 will not be in any order and there will be lots of \ 441 duplicates, so after htdig has completed, it should be \ 442 piped through <code>sort -u</code> to get a unique list. \ 443 " }, \ 444 { "create_url_list", "false", \ 445 "boolean", "htdig", "", "all", "Extra Output", "create_url_list: yes", " \ 446 If set to true, a file with all the URLs that were seen \ 447 will be created, one URL per line. This list will not \ 448 be in any order and there will be lots of duplicates, \ 449 so after htdig has completed, it should be piped \ 450 through <code>sort -u</code> to get a unique list. 
\ 451 " }, \ 452 { "database_base", "${database_dir}/db", \ 453 "string", "all", "", "all", "File Layout", "database_base: ${database_dir}/sales", " \ 454 This is the common prefix for files that are specific \ 455 to a search database. Many different attributes use \ 456 this prefix to specify filenames. Several search \ 457 databases can share the same directory by just changing \ 458 this value for each of the databases. \ 459 " }, \ 460 { "database_dir", DATABASE_DIR, \ 461 "string", "all", "", "all", "File Layout", "database_dir: /var/htdig", " \ 462 This is the directory which contains all database and \ 463 other files related to ht://Dig. It is never used \ 464 directly by any of the programs, but other attributes \ 465 are defined in terms of this one. \ 466 <p> \ 467 The default value of this attribute is determined at \ 468 compile time. \ 469 </p> \ 470 " }, \ 471 { "date_factor", "0", \ 472 "number", "htsearch", "", "3.1.0", "Searching:Ranking", "date_factor: 0.35", " \ 473 This factor, gives higher \ 474 rankings to newer documents and lower rankings to older \ 475 documents. Before setting this factor, it's advised to \ 476 make sure your servers are returning accurate dates \ 477 (check the dates returned in the long format). \ 478 Additionally, setting this to a nonzero value incurs a \ 479 small performance hit on searching. \ 480 " }, \ 481 { "date_format", "", \ 482 "string", "htsearch", "", "3.1.2", "Presentation:How", "date_format: %Y-%m-%d", " \ 483 This format string determines the output format for \ 484 modification dates of documents in the search results. \ 485 It is interpreted by your system's <em>strftime</em> \ 486 function. Please refer to your system's manual page \ 487 for this function, for a description of available \ 488 format codes. If this format string is empty, as it \ 489 is by default, \ 490 <a href=\"htsearch.html\" target=\"_top\">htsearch</a> \ 491 will pick a format itself. 
In this case, the <a \ 492 href=\"#iso_8601\">iso_8601</a> attribute can be used \ 493 to modify the appearance of the date. \ 494 " }, \ 495 { "description_factor", "150", \ 496 "number", "htsearch", "", "3.1.0b3", "Searching:Ranking", "description_factor: 350", " \ 497 Plain old \"descriptions\" are the text of a link pointing \ 498 to a document. This factor gives weight to the words of \ 499 these descriptions of the document. Not surprisingly, \ 500 these can be pretty accurate summaries of a document's \ 501 content. See also <a href=\"#heading_factor\">heading_factor</a> \ 502 and <a href=\"#meta_description_factor\">meta_description_factor</a>. \ 503 " }, \ 504 { "description_meta_tag_names", "description", \ 505 "string list", "htdig", "", "3.1.6", "Searching:Ranking", "description_meta_tag_names: \"description htdig-description\"", " \ 506 The words in this list are used to search for descriptions in HTML \ 507 <em>META</em> tags. This list can contain any number of strings \ 508 that each will be seen as the name for whatever description \ 509 convention is used. While words in any of the specified \ 510 description contents will be indexed, only the last meta tag \ 511 containing a description will be kept for the \ 512 <a href=\"hts_templates.html#METADESCRIPTION\"METADESCRIPTION</a> \ 513 variable in search results. The order in \ 514 which the names are specified in this configuration attribute \ 515 is irrelevant, as it is the order in which the tags appear in \ 516 the documents that matters.<br> The <em>META</em> tags have the \ 517 following format:<br> \ 518 <tt> <META name=\"<em>somename</em>\" \ 519 content=\"<em>somevalue</em>\"> </tt><br> \ 520 See also <a href=\"#meta_description_factor\">meta_description_factor</a>. \ 521 " }, \ 522 { "disable_cookies", "true", \ 523 "boolean", "htdig", "Server", "3.2.0b4", "Indexing:Connection", "disable_cookies: true", " \ 524 This option, if set to true, will disable HTTP cookies. 
\ 525 " }, \ 526 { "doc_db", "${database_base}.docdb", \ 527 "string", "all", "", "all", "File Layout", "doc_db: ${database_base}documents.db", " \ 528 This file will contain a Berkeley database of documents \ 529 indexed by document number. It contains all the information \ 530 gathered for each document, except the document excerpts \ 531 which are stored in the <a href=\"#doc_excerpt\"><em> \ 532 doc_excerpt</em></a> file. \ 533 " }, \ 534 { "doc_excerpt", "${database_base}.excerpts", \ 535 "string", "all", "", "3.2.0b1", "File Layout", "doc_excerpt: ${database_base}excerpts.db", " \ 536 This file will contain a Berkeley database of document excerpts \ 537 indexed by document number. It contains all the text \ 538 gathered for each document, so this file can become \ 539 rather large if <a href=\"#max_head_length\"><em> \ 540 max_head_length</em></a> is set to a large value. \ 541 The size can be reduced by setting the \ 542 <a href=\"#compression_level\"><em>compression_level</em></a>, \ 543 if supported on your system. \ 544 " }, \ 545 { "doc_index", "${database_base}.docs.index", \ 546 "string", "htdig", "", "all", "File Layout", "doc_index: documents.index.db", " \ 547 This file contains a mapping of document numbers to URLs and is \ 548 used by htdig during indexing. It is used on updates if it exists. \ 549 " }, \ 550 { "doc_list", "${database_base}.docs", \ 551 "string", "htdig htdump htload", "", "all", "File Layout", "doc_list: /tmp/documents.text", " \ 552 This file is basically a text version of the file \ 553 specified in <em><a href=\"#doc_db\">doc_db</a></em>. Its \ 554 only use is to have a human readable database of all \ 555 documents. The file is easy to parse with tools like \ 556 perl or tcl. \ 557 " }, \ 558 { "endday", "", \ 559 "integer", "htsearch", "", "3.1.6", "Searching:Method", "endday: 31", " \ 560 Day component of last date allowed as last-modified date \ 561 of returned docutments. 
\ 562 This is most usefully specified as a \ 563 <a href=\"hts_form.html#startyear\">GCI argument</a>. \ 564 See also <a href=\"#startyear\">startyear</a>. \ 565 " }, \ 566 { "end_ellipses", "<strong><code> ...</code></strong>", \ 567 "string", "htsearch", "", "all", "Presentation:Text", "end_ellipses: ...", " \ 568 When excerpts are displayed in the search output, this \ 569 string will be appended to the excerpt if there is text \ 570 following the text displayed. This is just a visual \ 571 reminder to the user that the excerpt is only part of \ 572 the complete document. \ 573 " }, \ 574 { "end_highlight", "</strong>", \ 575 "string", "htsearch", "", "3.1.4", "Presentation:Text", "end_highlight: </font>", " \ 576 When excerpts are displayed in the search output, matched \ 577 words will be highlighted using <a href=\"#start_highlight\"> \ 578 start_highlight</a> and this string. \ 579 You should ensure that highlighting tags are balanced, \ 580 that is, this string should close any formatting \ 581 tag opened by start_highlight. \ 582 " }, \ 583 { "endings_affix_file", "${common_dir}/english.aff", \ 584 "string", "htfuzzy", "", "all", "File Layout", "endings_affix_file: /var/htdig/affix_rules", " \ 585 Specifies the location of the file which contains the \ 586 affix rules used to create the endings search algorithm \ 587 databases. Consult the documentation on \ 588 <a href=\"htfuzzy.html\">htfuzzy</a> for more information on the \ 589 format of this file. \ 590 " }, \ 591 { "endings_dictionary", "${common_dir}/english.0", \ 592 "string", "htfuzzy", "", "all", "File Layout", "endings_dictionary: /var/htdig/dictionary", " \ 593 Specifies the location of the file which contains the \ 594 dictionary used to create the endings search algorithm \ 595 databases. Consult the documentation on \ 596 <a href=\"htfuzzy.html\">htfuzzy</a> for more information on the \ 597 format of this file. 
\ 598 " }, \ 599 { "endings_root2word_db", "${common_dir}/root2word.db", \ 600 "string", "htfuzzy htsearch", "", "all", "File Layout", "endings_root2word_db: /var/htdig/r2w.db", " \ 601 This attributes specifies the database filename to be \ 602 used in the 'endings' fuzzy search algorithm. The \ 603 database maps word roots to all legal words with that \ 604 root. For more information about this and other fuzzy \ 605 search algorithms, consult the \ 606 <a href=\"htfuzzy.html\">htfuzzy</a> documentation.<br> \ 607 Note that the default value uses the \ 608 <a href=\"#common_dir\">common_dir</a> attribute instead of the \ 609 <a href=\"#database_dir\">database_dir</a> attribute. \ 610 This is because this database can be shared with \ 611 different search databases. \ 612 " }, \ 613 { "endings_word2root_db", "${common_dir}/word2root.db", \ 614 "string", "htfuzzy htsearch", "", "all", "File Layout", "endings_word2root_db: /var/htdig/w2r.bm", " \ 615 This attributes specifies the database filename to be \ 616 used in the 'endings' fuzzy search algorithm. The \ 617 database maps words to their root. For more information \ 618 about this and other fuzzy search algorithms, consult \ 619 the <a href=\"htfuzzy.html\">htfuzzy</a> \ 620 documentation.<br> \ 621 Note that the default value uses the \ 622 <a href=\"#common_dir\">common_dir</a> attribute instead of the \ 623 <a href=\"#database_dir\">database_dir</a> attribute. \ 624 This is because this database can be shared with \ 625 different search databases. \ 626 " }, \ 627 { "endmonth", "", \ 628 "integer", "htsearch", "", "3.1.6", "Searching:Method", "endmonth: 12", " \ 629 Month component of last date allowed as last-modified date \ 630 of returned docutments. \ 631 This is most usefully specified as a \ 632 <a href=\"hts_form.html#startyear\">GCI argument</a>. \ 633 See also <a href=\"#startyear\">startyear</a>. 
\ 634 " }, \ 635 { "endyear", "", \ 636 "integer", "htsearch", "", "3.1.6", "Searching:Method", "endyear: 2002", " \ 637 Year component of last date allowed as last-modified date \ 638 of returned docutments. \ 639 This is most usefully specified as a \ 640 <a href=\"hts_form.html#startyear\">GCI argument</a>. \ 641 See also <a href=\"#startyear\">startyear</a>. \ 642 " }, \ 643 { "excerpt_length", "300", \ 644 "integer", "htsearch", "", "all", "Presentation:How", "excerpt_length: 500", " \ 645 This is the maximum number of characters the displayed \ 646 excerpt will be limited to. The first matched word will \ 647 be highlighted in the middle of the excerpt so that there is \ 648 some surrounding context.<br> \ 649 The <em><a href=\"#start_ellipses\"> \ 650 start_ellipses</a></em> and \ 651 <em><a href=\"#end_ellipses\">end_ellipses</a></em> are used to \ 652 indicate that the document contains text before and \ 653 after the displayed excerpt respectively. \ 654 The <em><a href=\"#start_highlight\">start_highlight</a></em> and \ 655 <em><a href=\"#end_highlight\">end_highlight</a></em> are used to \ 656 specify what formatting tags are used to highlight matched words. \ 657 " }, \ 658 { "excerpt_show_top", "false", \ 659 "boolean", "htsearch", "", "all", "Presentation:How", "excerpt_show_top: yes", " \ 660 If set to true, the excerpt of a match will always show \ 661 the top of the matching document. If it is false (the \ 662 default), the excerpt will attempt to show the part of \ 663 the document that actually contains one of the words. \ 664 " }, \ 665 { "exclude", "", \ 666 "pattern list", "htsearch", "", "3.2.0b4", "Searching:Method", "exclude: myhost.com/mailarchive/", " \ 667 If a URL contains any of the space separated patterns, it will be \ 668 discarded in the searching phase. This is used to exclude certain \ 669 URLs from search results. 
The list can be specified from within \ 670 the configuration file, and can be overridden with the \"exclude\" \ 671 input parameter in the search form. \ 672 " }, \ 673 { "exclude_urls", "/cgi-bin/ .cgi", \ 674 "pattern list", "htdig", "URL", "all", "Indexing:Where", "exclude_urls: students.html cgi-bin", " \ 675 If a URL contains any of the space separated patterns, \ 676 it will be rejected. This is used to exclude such \ 677 common things such as an infinite virtual web-tree \ 678 which start with cgi-bin. \ 679 " }, \ 680 { "external_parsers", "", \ 681 "quoted string list", "htdig", "", "3.0.7", "External:Parsers", "external_parsers: text/html /usr/local/bin/htmlparser \\<br> \ 682 application/pdf /usr/local/bin/parse_doc.pl \\<br> \ 683 application/msword->text/plain \"/usr/local/bin/mswordtotxt -w\" \\<br> \ 684 application/x-gunzip->user-defined /usr/local/bin/ungzipper", " \ 685 This attribute is used to specify a list of \ 686 content-type/parsers that are to be used to parse \ 687 documents that cannot by parsed by any of the internal \ 688 parsers. The list of external parsers is examined \ 689 before the builtin parsers are checked, so this can be \ 690 used to override the internal behavior without \ 691 recompiling htdig.<br> \ 692 The external parsers are specified as pairs of \ 693 strings. The first string of each pair is the \ 694 content-type that the parser can handle while the \ 695 second string of each pair is the path to the external \ 696 parsing program. If quoted, it may contain parameters, \ 697 separated by spaces.<br> \ 698 External parsing can also be done with external \ 699 converters, which convert one content-type to \ 700 another. To do this, instead of just specifying \ 701 a single content-type as the first string \ 702 of a pair, you specify two types, in the form \ 703 <em>type1</em><strong>-></strong><em>type2</em>, \ 704 as a single string with no spaces. 
The second \ 705 string will define an external converter \ 706 rather than an external parser, to convert \ 707 the first type to the second. If the second \ 708 type is <strong>user-defined</strong>, then \ 709 it's up to the converter script to put out a \ 710 \"Content-Type: <em>type</em>\" header followed \ 711 by a blank line, to indicate to htdig what type it \ 712 should expect for the output, much like what a CGI \ 713 script would do. The resulting content-type must \ 714 be one that htdig can parse, either internally, \ 715 or with another external parser or converter.<br> \ 716 Only one external parser or converter can be \ 717 specified for any given content-type. However, \ 718 an external converter for one content-type can be \ 719 chained to the internal parser for the same type, \ 720 by appending <strong>-internal</strong> to the \ 721 second type string (e.g. text/html->text/html-internal) \ 722 to perform external preprocessing on documents of \ 723 this type before internal parsing. \ 724 There are two internal parsers, for text/html and \ 725 text/plain.<p> \ 726 The parser program takes four command-line \ 727 parameters, not counting any parameters already \ 728 given in the command string:<br> \ 729 <em>infile content-type URL configuration-file</em><br> \ 730 <table border=\"1\"> \ 731 <tr> \ 732 <th> Parameter </th> \ 733 <th> Description </th> \ 734 <th> Example </th> \ 735 </tr> \ 736 <tr> \ 737 <td valign=\"top\"> infile </td> \ 738 <td> A temporary file with the contents to be parsed. </td> \ 739 <td> /var/tmp/htdext.14242 </td> \ 740 </tr> \ 741 <tr> \ 742 <td valign=\"top\"> content-type </td> \ 743 <td> The MIME-type of the contents. </td> \ 744 <td> text/html </td> \ 745 </tr> \ 746 <tr> \ 747 <td valign=\"top\"> URL </td> \ 748 <td> The URL of the contents. </td> \ 749 <td> http://www.htdig.org/attrs.html </td> \ 750 </tr> \ 751 <tr> \ 752 <td valign=\"top\"> configuration-file </td> \ 753 <td> The configuration-file in effect. 
</td> \ 754 <td> /etc/htdig/htdig.conf </td> \ 755 </tr> \ 756 </table><p> \ 757 The external parser is to write information for \ 758 htdig on its standard output. Unless it is an \ 759 external converter, which will output a document \ 760 of a different content-type, then its output must \ 761 follow the format described here.<br> \ 762 The output consists of records, each record terminated \ 763 with a newline. Each record is a series of (unless \ 764 expressively allowed to be empty) non-empty tab-separated \ 765 fields. The first field is a single character \ 766 that specifies the record type. The rest of the fields \ 767 are determined by the record type. \ 768 <table border=\"1\"> \ 769 <tr> \ 770 <th> Record type </th> \ 771 <th> Fields </th> \ 772 <th> Description </th> \ 773 </tr> \ 774 <tr> \ 775 <th rowspan=\"3\" valign=\"top\"> w </th> \ 776 <td valign=\"top\"> word </td> \ 777 <td> A word that was found in the document. </td> \ 778 </tr> \ 779 <tr> \ 780 <td valign=\"top\"> location </td> \ 781 <td> \ 782 A number indicating the normalized location of \ 783 the word within the document. The number has to \ 784 fall in the range 0-1000 where 0 means the top of \ 785 the document. \ 786 </td> \ 787 </tr> \ 788 <tr> \ 789 <td valign=\"top\"> heading level </td> \ 790 <td> \ 791 A heading level that is used to compute the \ 792 weight of the word depending on its context in \ 793 the document itself. 
The level is in the range of \ 794 0-11 and are defined as follows: \ 795 <dl compact> \ 796 <dt> 0 </dt> <dd> Normal text </dd> \ 797 <dt> 1 </dt> <dd> Title text </dd> \ 798 <dt> 2 </dt> <dd> Heading 1 text </dd> \ 799 <dt> 3 </dt> <dd> Heading 2 text </dd> \ 800 <dt> 4 </dt> <dd> Heading 3 text </dd> \ 801 <dt> 5 </dt> <dd> Heading 4 text </dd> \ 802 <dt> 6 </dt> <dd> Heading 5 text </dd> \ 803 <dt> 7 </dt> <dd> Heading 6 text </dd> \ 804 <dt> 8 </dt> <dd> text alternative to images </dd> \ 805 <dt> 9 </dt> <dd> Keywords </dd> \ 806 <dt> 10 </dt> <dd> Meta-description </dd> \ 807 <dt> 11 </dt> <dd> Author </dd> \ 808 </dl> \ 809 </td> \ 810 </tr> \ 811 <tr> \ 812 <th rowspan=\"2\" valign=\"top\"> u </th> \ 813 <td valign=\"top\"> document URL </td> \ 814 <td> \ 815 A hyperlink to another document that is \ 816 referenced by the current document. It must be \ 817 complete and non-relative, using the URL parameter to \ 818 resolve any relative references found in the document. \ 819 </td> \ 820 </tr> \ 821 <tr> \ 822 <td valign=\"top\"> hyperlink description </td> \ 823 <td> \ 824 For HTML documents, this would be the text \ 825 between the <a href...> and </a> \ 826 tags. \ 827 </td> \ 828 </tr> \ 829 <tr> \ 830 <th valign=\"top\"> t </th> \ 831 <td valign=\"top\"> title </td> \ 832 <td> The title of the document </td> \ 833 </tr> \ 834 <tr> \ 835 <th valign=\"top\"> h </th> \ 836 <td valign=\"top\"> head </td> \ 837 <td> \ 838 The top of the document itself. This is used to \ 839 build the excerpt. This should only contain \ 840 normal ASCII text \ 841 </td> \ 842 </tr> \ 843 <tr> \ 844 <th valign=\"top\"> a </th> \ 845 <td valign=\"top\"> anchor </td> \ 846 <td> \ 847 The label that identifies an anchor that can be \ 848 used as a target in an URL. This really only \ 849 makes sense for HTML documents. 
\ 850 </td> \ 851 </tr> \ 852 <tr> \ 853 <th valign=\"top\"> i </th> \ 854 <td valign=\"top\"> image URL </td> \ 855 <td> \ 856 An URL that points at an image that is part of \ 857 the document. \ 858 </td> \ 859 </tr> \ 860 <tr> \ 861 <th rowspan=\"3\" valign=\"top\"> m </th> \ 862 <td valign=\"top\"> http-equiv </td> \ 863 <td> \ 864 The HTTP-EQUIV attribute of a \ 865 <a href=\"meta.html\"><em>META</em> tag</a>. \ 866 May be empty. \ 867 </td> \ 868 </tr> \ 869 <tr> \ 870 <td valign=\"top\"> name </td> \ 871 <td> \ 872 The NAME attribute of this \ 873 <a href=\"meta.html\"><em>META</em> tag</a>. \ 874 May be empty. \ 875 </td> \ 876 </tr> \ 877 <tr> \ 878 <td valign=\"top\"> contents </td> \ 879 <td> \ 880 The CONTENTS attribute of this \ 881 <a href=\"meta.html\"><em>META</em> tag</a>. \ 882 May be empty. \ 883 </td> \ 884 </tr> \ 885 </table> \ 886 <p><em>See also FAQ questions <a href=\"FAQ.html#q4.8\">4.8</a> and \ 887 <a href=\"FAQ.html#q4.9\">4.9</a> for more examples.</em></p> \ 888 " }, \ 889 { "external_protocols", "", \ 890 "quoted string list", "htdig", "", "3.2.0b1", "External:Protocols", "external_protocols: https /usr/local/bin/handler.pl \\<br> \ 891 ftp /usr/local/bin/ftp-handler.pl", " \ 892 This attribute is a bit like \ 893 <a href=\"#external_parsers\">external_parsers</a> since it specifies \ 894 a list of protocols/handlers that are used to download documents \ 895 that cannot be retrieved using the internal methods. This enables \ 896 htdig to index documents with URL schemes it does not understand, \ 897 or to use more advanced authentication for the documents it is \ 898 retrieving. This list is checked before HTTP or other methods, \ 899 so this can override the internal behavior without writing additional \ 900 code for htdig.<br> \ 901 The external protocols are specified as pairs of strings, the first \ 902 being the URL scheme that the script can handle while the second \ 903 is the path to the script itself. 
If the second is \ 904 quoted, then additional command-line arguments may be given.<br> \ 905 If the external protocol does not contain a colon (:), it is assumed \ 906 to have the standard format \ 907 \"protocol://[usr[:password]@]address[:port]/path\". \ 908 If it ends with a colon, then it is assumed to have the simpler format \ 909 \"protocol:path\". If it ends with \"://\" then the standard form is \ 910 again assumed. <br> \ 911 The program takes three command-line parameters, not counting any \ 912 parameters already given in the command string:<br> \ 913 <em>protocol URL configuration-file</em><br> \ 914 <table border=\"1\"> \ 915 <tr> \ 916 <th> Parameter </th> \ 917 <th> Description </th> \ 918 <th> Example </th> \ 919 </tr> \ 920 <tr> \ 921 <td valign=\"top\"> protocol </td> \ 922 <td> The URL scheme to be used. </td> \ 923 <td> https </td> \ 924 </tr> \ 925 <tr> \ 926 <td valign=\"top\"> URL </td> \ 927 <td> The URL to be retrieved. </td> \ 928 <td> https://www.htdig.org:8008/attrs.html </td> \ 929 </tr> \ 930 <tr> \ 931 <td valign=\"top\"> configuration-file </td> \ 932 <td> The configuration-file in effect. </td> \ 933 <td> /etc/htdig/htdig.conf </td> \ 934 </tr> \ 935 </table><p> \ 936 The external protocol script is to write information for htdig on the \ 937 standard output. The output must follow the form described here. The \ 938 output consists of a header followed by a blank line, followed by \ 939 the contents of the document. Each record in the header is terminated \ 940 with a newline. Each record is a series of (unless expressively \ 941 allowed to be empty) non-empty tab-separated fields. The first field \ 942 is a single character that specifies the record type. The rest of \ 943 the fields are determined by the record type. 
\ 944 <table border=\"1\"> \ 945 <tr> \ 946 <th> Record type </th> \ 947 <th> Fields </th> \ 948 <th> Description </th> \ 949 </tr> \ 950 <tr> \ 951 <th valign=\"top\"> s </th> \ 952 <td valign=\"top\"> status code </td> \ 953 <td> \ 954 An HTTP-style status code, e.g. 200, 404. Typical codes include: \ 955 <dl compact> \ 956 <dt> 200 </dt> \ 957 <dd> Successful retrieval </dd> \ 958 <dt> 304 </dt> \ 959 <dd> \ 960 Not modified (for example, if the document hasn\'t \ 961 changed since the last dig) \ 962 </dd> \ 963 <dt> 301 </dt> \ 964 <dd> Redirect (to another URL) </dd> \ 965 <dt> 401 </dt> \ 966 <dd> Not authorized </dd> \ 967 <dt> 404 </dt> \ 968 <dd> Not found </dd> \ 969 </dl> \ 970 </td> \ 971 </tr> \ 972 <tr> \ 973 <th valign=\"top\"> r </th> \ 974 <td valign=\"top\"> reason </td> \ 975 <td> \ 976 A text string describing the status code, \ 977 e.g \"Redirect\" or \"Not Found.\" \ 978 </td> \ 979 </tr> \ 980 <tr> \ 981 <th valign=\"top\"> m </th> \ 982 <td valign=\"top\"> status code </td> \ 983 <td> \ 984 The modification time of this document. While the code is \ 985 fairly flexible about the time/date formats it accepts, it \ 986 is recommended to use something standard, like \ 987 RFC1123: Sun, 06 Nov 1994 08:49:37 GMT, or \ 988 ISO-8601: 1994-11-06 08:49:37 GMT. \ 989 </td> \ 990 </tr> \ 991 <tr> \ 992 <th valign=\"top\"> t </th> \ 993 <td valign=\"top\"> content-type </td> \ 994 <td> \ 995 A valid MIME type for the document, like text/html or text/plain. \ 996 </td> \ 997 </tr> \ 998 <tr> \ 999 <th valign=\"top\"> l </th> \ 1000 <td valign=\"top\"> content-length </td> \ 1001 <td> \ 1002 The length of the document on the server, which may not \ 1003 necessarily be the length of the buffer returned. \ 1004 </td> \ 1005 </tr> \ 1006 <tr> \ 1007 <th valign=\"top\"> u </th> \ 1008 <td valign=\"top\"> url </td> \ 1009 <td> \ 1010 The URL of the document, or in the case of a redirect, the \ 1011 URL that should be indexed as a result of the redirect. 
\ 1012 </td> \ 1013 </tr> \ 1014 </table> \ 1015 " }, \ 1016 { "extra_word_characters", "", \ 1017 "string", "htdig htsearch", "", "3.1.2", "Indexing:What", "extra_word_characters: _", " \ 1018 These characters are considered part of a word. \ 1019 In contrast to the characters in the \ 1020 <a href=\"#valid_punctuation\">valid_punctuation</a> \ 1021 attribute, they are treated just like letter \ 1022 characters. See also the <a href=\"#allow_numbers\">allow_numbers</a>\ 1023 attribute.<br> \ 1024 Note that the <a href=\"#locale\">locale</a> attribute \ 1025 is normally used to configure which characters \ 1026 constitute letter characters.<br> \ 1027 Note also that it is an error to have characters in both \ 1028 extra_word_characters and \ 1029 <a href=\"#valid_punctuation\">valid_punctuation</a>. \ 1030 To add one of the characters in the default valid_punctuation to \ 1031 extra_word_characters, an explicit valid_punctuation entry must be \ 1032 added to the configuration file.<br> \ 1033 See also the comments about special characters at \ 1034 <a href=\"#valid_punctuation\">valid_punctuation</a>. \ 1035 " }, \ 1036 { "head_before_get", "true", \ 1037 "boolean", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "head_before_get: false", " \ 1038 If set to true, an HTTP/1.1 <em>HEAD</em> \ 1039 call is made in order to retrieve header information about a document. \ 1040 If the status code and the content-type returned show that the \ 1041 document is parsable, then a subsequent 'GET' call is made. In \ 1042 general, it is recommended that this attribute be set to 'true', \ 1043 as it can really improve performance (especially when used with \ 1044 persistent connections). This is particularly so during an \ 1045 incremental dig, since in this case 'htdig' can ask the server if the \ 1046 document has been modified since last dig. 
However there are a few \ 1047 cases when it is better to switch it off: \ 1048 <ul> \ 1049 <li>the majority of documents are parsable (HTML or a type for which \ 1050 an external parser has been provided) and must be retrieved anyway \ 1051 (initial dig);</li> \ 1052 <li>the server does not support the HEAD method or it is \ 1053 disabled;</li> \ 1054 <li>in some cases <a href=\"#persistent_connections\">persistent_connections</a> may \ 1055 not work properly and either the 'head_before_get' attribute or the \ 1056 'persistent_connections' attribute must be turned off.</li> \ 1057 </ul> \ 1058 " }, \ 1059 { "heading_factor", "5", \ 1060 "number", "htsearch", "", "3.2.0b1", "Searching:Ranking", "heading_factor: 20", " \ 1061 This is a factor which will be used to multiply the \ 1062 weight of words between <h1> and </h1> \ 1063 tags, as well as headings of levels <h2> through \ 1064 <h6>. It is used to assign the level of importance \ 1065 to headings. Setting a factor to 0 will cause words \ 1066 in these headings to be ignored. The number may be a \ 1067 floating point number. See also \ 1068 <a href=\"#author_factor\">author_factor</a> \ 1069 <a href=\"#backlink_factor\">backlink_factor</a> \ 1070 <a href=\"#caps_factor\">caps_factor</a> \ 1071 <a href=\"#date_factor\">date_factor</a> \ 1072 <a href=\"#description_factor\">description_factor</a> \ 1073 <a href=\"#keywords_factor\">keywords_factor</a> \ 1074 <a href=\"#meta_description_factor\">meta_description_factor</a> \ 1075 <a href=\"#text_factor\">text_factor</a> \ 1076 <a href=\"#title_factor\">title_factor</a> \ 1077 <a href=\"#url_text_factor\">url_text_factor</a> \ 1078 " }, \ 1079 { "htnotify_prefix_file", "", \ 1080 "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_prefix_file: ${common_dir}/notify_prefix.txt", " \ 1081 Specifies the file containing text to be inserted in each mail \ 1082 message sent by htnotify before the list of expired webpages. If omitted, \ 1083 nothing is inserted. 
\ 1084 " }, \ 1085 { "htnotify_replyto", "", \ 1086 "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_replyto: design-group@foo.com", " \ 1087 This specifies the email address that htnotify email messages \ 1088 include in the Reply-to: field. \ 1089 " }, \ 1090 { "htnotify_sender", "webmaster@www", \ 1091 "string", "htnotify", "", "all", "Extra Output", "htnotify_sender: bigboss@yourcompany.com", " \ 1092 This specifies the email address that htnotify email \ 1093 messages get sent out from. The address is forged using \ 1094 /usr/lib/sendmail. Check htnotify/htnotify.cc for \ 1095 detail on how this is done. \ 1096 " }, \ 1097 { "htnotify_suffix_file", "", \ 1098 "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_suffix_file: ${common_dir}/notify_suffix.txt", " \ 1099 Specifies the file containing text to be inserted in each mail message \ 1100 sent by htnotify after the list of expired webpages. If omitted, htnotify \ 1101 will insert a standard message. \ 1102 " }, \ 1103 { "htnotify_webmaster", "ht://Dig Notification Service", \ 1104 "string", "htnotify", "", "3.2.0b3", "Extra Output", "htnotify_webmaster: Notification Service", " \ 1105 This provides a name for the From field, in addition to the email \ 1106 address for the email messages sent out by htnotify. \ 1107 " }, \ 1108 { "http_proxy", "", \ 1109 "string", "htdig", "URL", "3.0", "Indexing:Connection", "http_proxy: http://proxy.bigbucks.com:3128", " \ 1110 When this attribute is set, all HTTP document \ 1111 retrievals will be done using the HTTP-PROXY protocol. 
\ 1112 The URL specified in this attribute points to the host \ 1113 and port where the proxy server resides.<br> \ 1114 Later, this should be able to be overridden by the \ 1115 <code>http_proxy</code> environement variable, but it currently cannot.\ 1116 The use of a proxy server greatly improves performance \ 1117 of the indexing process.<br> \ 1118 See also \ 1119 <a href=\"#http_proxy_authorization\">http_proxy_authorization</a> and \ 1120 <a href=\"#http_proxy_exclude\">#http_proxy_exclude</a>. \ 1121 " }, \ 1122 { "http_proxy_authorization", "", \ 1123 "string", "htdig", "URL", "3.2.0b4", "Indexing:Connection", "http_proxy_authorization: myusername:mypassword", " \ 1124 This tells htdig to send the supplied \ 1125 <em>username</em><strong>:</strong><em>password</em> with each HTTP request, \ 1126 when using a proxy with authorization requested. \ 1127 The credentials will be encoded using the \"Basic\" authentication \ 1128 scheme. There <em>must</em> be a colon (:) between the username and \ 1129 password.<br> \ 1130 If you use this option, be sure to protect the configuration file \ 1131 so it is readable only by you, and do not \ 1132 use that same configuration file for htsearch. \ 1133 " }, \ 1134 { "http_proxy_exclude", "", \ 1135 "pattern list", "htdig", "", "3.1.0b3", "Indexing:Connection", "http_proxy_exclude: http://intranet.foo.com/", " \ 1136 When this is set, URLs matching this will not use the \ 1137 proxy. This is useful when you have a mixture of sites \ 1138 near to the digging server and far away. \ 1139 " }, \ 1140 { "ignore_alt_text", "false", \ 1141 "boolean", "htdig", "", "3.1.6", "Indexing:What", "ignore_alt_text: true", " \ 1142 If set, this causes the text of the ALT field in an <IMG...> tag \ 1143 not to be indexed as part of the text of the document, nor included in \ 1144 excerpts. 
\ 1145 " }, \ 1146 { "ignore_dead_servers", "true", \ 1147 "boolean", "htdig", "", "3.1.6", "Indexing:Connection", "ignore_dead_servers: false", " \ 1148 Determines whether htdig will continue to index URLs from a \ 1149 server after an attempted connection to the server fails as \ 1150 "no host found" or "host not found (port)." If \ 1151 set to false, htdig will try <em>every</em> URL from that server. \ 1152 " }, \ 1153 { "image_list", "${database_base}.images", \ 1154 "string", "htdig", "", "all", "Extra Output", "image_list: allimages", " \ 1155 This is the file that a list of image URLs gets written \ 1156 to by <a href=\"htdig.html\">htdig</a> when the \ 1157 <a href=\"#create_image_list\">create_image_list</a> is set to \ 1158 true. As image URLs are seen, they are just appended to \ 1159 this file, so after htdig finishes it is probably a \ 1160 good idea to run <code>sort -u</code> on the file to \ 1161 eliminate duplicates from the file. \ 1162 " }, \ 1163 { "image_url_prefix", IMAGE_URL_PREFIX, \ 1164 "string", "htsearch", "", "all", "Presentation:Text", "image_url_prefix: /images/htdig", " \ 1165 This specifies the directory portion of the URL used \ 1166 to display star images. This attribute isn't directly \ 1167 used by htsearch, but is used in the default URL for \ 1168 the <a href=\"#star_image\">star_image</a> and \ 1169 <a href=\"#star_blank\">star_blank</a> attributes, and \ 1170 other attributes may be defined in terms of this one. \ 1171 <p> \ 1172 The default value of this attribute is determined at \ 1173 compile time. \ 1174 </p> \ 1175 " }, \ 1176 { "include", "", \ 1177 "string", "all", "", "3.1.0", "", "include: ${config_dir}/htdig.conf", " \ 1178 This is not quite a configuration attribute, but \ 1179 rather a directive. It can be used within one \ 1180 configuration file to include the definitions of \ 1181 another file. 
The last definition of an attribute \ 1182 is the one that applies, so after including a file, \ 1183 any of its definitions can be overridden with \ 1184 subsequent definitions. This can be useful when \ 1185 setting up many configurations that are mostly the \ 1186 same, so all the common attributes can be maintained \ 1187 in a single configuration file. The include directives \ 1188 can be nested, but watch out for nesting loops. \ 1189 " }, \ 1190 { "iso_8601", "false", \ 1191 "boolean", "htsearch htnotify", "", "3.1.0b2", "Presentation:How,Extra Output", "iso_8601: true", " \ 1192 This sets whether dates should be output in ISO 8601 \ 1193 format. For example, this was written on: 1998-10-31 11:28:13 EST. \ 1194 See also the <a \ 1195 href=\"#date_format\">date_format</a> attribute, which \ 1196 can override any date format that \ 1197 <a href=\"htsearch.html\" target=\"_top\">htsearch</a> \ 1198 picks by default.<br> \ 1199 This attribute also affects the format of the date \ 1200 <a href=\"htnotify.html\">htnotify</a> expects to find \ 1201 in a <strong>htdig-notification-date</strong> field. \ 1202 " }, \ 1203 { "keywords", "", \ 1204 "string list", "htsearch", "", "??", "Searching:Method", "keywords: documentation", " \ 1205 Keywords which <strong>must</strong> be found on all pages returned, \ 1206 even if the \"or\" (\"Any\") <a href=\"#method\">method</a> is \ 1207 selected. \ 1208 " }, \ 1209 { "keywords_factor", "100", \ 1210 "number", "htsearch", "", "all", "Searching:Ranking", "keywords_factor: 12", " \ 1211 This is a factor which will be used to multiply the \ 1212 weight of words in the list of \ 1213 <a href=\"#keywords_meta_tag_names\">meta keywords</a> of a document. \ 1214 The number may be a floating point number. See also the \ 1215 <a href=\"#heading_factor\">heading_factor</a> attribute. 
\ 1216 " }, \ 1217 { "keywords_meta_tag_names", "keywords htdig-keywords", \ 1218 "string list", "htdig", "", "3.0.6", "Indexing:What", "keywords_meta_tag_names: keywords description", " \ 1219 The words in this list are used to search for keywords \ 1220 in HTML <em>META</em> tags. This list can contain any \ 1221 number of strings that each will be seen as the name \ 1222 for whatever keyword convention is used.<br> \ 1223 The <em>META</em> tags have the following format:<br> \ 1224 <code> \ 1225 <META name=\"<em>somename</em>\" content=\"<em>somevalue</em>\"> \ 1226 </code> \ 1227 " }, \ 1228 { "limit_normalized", "", \ 1229 "pattern list", "htdig", "", "3.1.0b2", "Indexing:Where", "limit_normalized: http://www.mydomain.com", " \ 1230 This specifies a set of patterns that all URLs have to \ 1231 match against in order for them to be included in the \ 1232 search. Unlike the limit_urls_to attribute, this is done \ 1233 <strong>after</strong> the URL is normalized and the \ 1234 <a href=\"#server_aliases\">server_aliases</a> \ 1235 attribute is applied. This allows filtering after any \ 1236 hostnames and DNS aliases are resolved. Otherwise, this \ 1237 attribute is the same as the <a \ 1238 href=\"#limit_urls_to\">limit_urls_to</a> attribute. \ 1239 " }, \ 1240 { "limit_urls_to", "${start_url}", \ 1241 "pattern list", "htdig", "", "all", "Indexing:Where", "limit_urls_to: .sdsu.edu kpbs [.*\\.html]", " \ 1242 This specifies a set of patterns that all URLs have to \ 1243 match against in order for them to be included in the \ 1244 search. Any number of strings can be specified, \ 1245 separated by spaces. If multiple patterns are given, at \ 1246 least one of the patterns has to match the URL.<br> \ 1247 Matching, by default, is a case-sensitive string match on the URL \ 1248 to be used, unless the <a href=\"#case_sensitive\">case_sensitive</a> \ 1249 attribute is false. 
The match will be performed <em>after</em> \
	the relative references have been converted to a valid \
	URL. This means that the URL will <em>always</em> start \
	with a transport specifier (<code>http://</code> if none is \
	specified).<br> \
	Granted, this is not the perfect way of doing this, \
	but it is simple enough and it covers most cases.<br> \
	To limit URLs in htsearch, use \
	<a href=\"#restrict\">restrict</a>. \
" }, \
{ "local_default_doc", "index.html", \
	"string list", "htdig", "Server", "3.0.8b2", "Indexing:Where", "local_default_doc: default.html default.htm index.html index.htm", " \
	Set this to the default documents in a directory used by the \
	server. This is used for local filesystem access, \
	using <a href=\"#local_urls\">local_urls</a>, to \
	translate URLs like http://foo.com/ into something like \
	/home/foo.com/index.html \
	(see also <a href=\"#remove_default_doc\">remove_default_doc</a>). \
	<br>The list should only contain names that the local server \
	recognizes as default documents for directory URLs, as defined \
	by the DirectoryIndex setting in Apache's srm.conf, for example. \
	As of version 3.1.5, this can be a string list rather than a single \
	name, and htdig will use the first name that works. Since this \
	requires a loop, setting the most common name first will improve \
	performance. Special characters can be embedded in these names \
	using %xx hex encoding. \
" }, \
{ "local_urls", "", \
	"string list", "htdig", "", "3.0.8b2", "Indexing:Where", "local_urls: http://www.foo.com/=/usr/www/htdocs/", " \
	Set this to tell ht://Dig to access certain URLs through \
	local filesystems. At first ht://Dig will try to access \
	pages with URLs matching the patterns through the \
	filesystems specified.
If it cannot find the file, or \ 1282 if it doesn't recognize the file name extension, it will \ 1283 try the URL through HTTP instead. Note the example--the \ 1284 equal sign and the final slashes in both the URL and the \ 1285 directory path are critical. \ 1286 <br>The fallback to HTTP can be disabled by setting the \ 1287 <a href=\"#local_urls_only\">local_urls_only</a> attribute to true. \ 1288 To access user directory URLs through the local filesystem, \ 1289 set <a href=\"#local_user_urls\">local_user_urls</a>. \ 1290 File types which need processing by the HTTP server may be \ 1291 specified by the \ 1292 <a href=\"#bad_local_extensions\">bad_local_extensions</a> \ 1293 attribute. \ 1294 As of version 3.1.5, you can provide multiple mappings of a given \ 1295 URL to different directories, and htdig will use the first \ 1296 mapping that works. \ 1297 Special characters can be embedded in these names using %xx hex encoding. \ 1298 For example, you can use %3D to embed an \"=\" sign in an URL pattern. \ 1299 <br> \ 1300 See also <a href=\"#local_default_doc\">local_default_doc</a>. \ 1301 " }, \ 1302 { "local_urls_only", "false", \ 1303 "boolean", "htdig", "", "3.1.4", "Indexing:Where", "local_urls_only: true", " \ 1304 Set this to tell ht://Dig to access files only through the \ 1305 local filesystem, for URLs matching the patterns in the \ 1306 <a href=\"#local_urls\">local_urls</a> or \ 1307 <a href=\"#local_user_urls\">local_user_urls</a> attribute. If it \ 1308 cannot find the file, it will give up rather than trying HTTP or \ 1309 another protocol. With this option, even <code>file://</code> urls \ 1310 are not retrieved, except throught the local_urls mechanism.\ 1311 " }, \ 1312 { "local_user_urls", "", \ 1313 "string list", "htdig", "", "3.0.8b2", "Indexing:Where", "local_user_urls: http://www.my.org/=/home/,/www/", " \ 1314 Set this to access user directory URLs through the local \ 1315 filesystem. 
If you leave the \"path\" portion out, it will \ 1316 look up the user's home directory in /etc/password (or NIS \ 1317 or whatever). As with <a href=\"#local_urls\">local_urls</a>, \ 1318 if the files are not found, ht://Dig will try with HTTP or the \ 1319 appropriate protocol. Again, note the \ 1320 example's format. To map http://www.my.org/~joe/foo/bar.html \ 1321 to /home/joe/www/foo/bar.html, try the example below. \ 1322 <br>The fallback to HTTP can be disabled by setting the \ 1323 <a href=\"#local_urls_only\">local_urls_only</a> attribute to true. \ 1324 As of version 3.1.5, you can provide multiple mappings of a given \ 1325 URL to different directories, and htdig will use the first \ 1326 mapping that works. \ 1327 Special characters can be embedded in these names using %xx hex encoding. \ 1328 For example, you can use %3D to embed an \"=\" sign in an URL pattern. \ 1329 " }, \ 1330 { "locale", "C", \ 1331 "string", "htdig", "", "3.0", "Indexing:What,Presentation:How", "locale: en_US", " \ 1332 Set this to whatever locale you want your search \ 1333 database cover. It affects the way international \ 1334 characters are dealt with. On most systems a list of \ 1335 legal locales can be found in /usr/lib/locale. Also \ 1336 check the <strong>setlocale(3C)</strong> man page. \ 1337 Note that depending the locale you choose, and whether \ 1338 your system's locale implementation affects floating \ 1339 point input, you may need to specify the decimal point \ 1340 as a comma rather than a period. This will affect \ 1341 settings of <a href=\"#search_algorithm\">search_algorithm</a> \ 1342 and any of the scoring factors. \ 1343 " }, \ 1344 { "logging", "false", \ 1345 "boolean", "htsearch", "", "3.1.0b2", "Extra Output", "logging: true", " \ 1346 This sets whether htsearch should use the syslog() to log \ 1347 search requests. If set, this will log requests with a \ 1348 default level of LOG_INFO and a facility of LOG_LOCAL5. 
For \ 1349 details on redirecting the log into a separate file or other \ 1350 actions, see the <strong>syslog.conf(5)</strong> man \ 1351 page. To set the level and facility used in logging, change \ 1352 LOG_LEVEL and LOG_FACILITY in the include/htconfig.h file \ 1353 before compiling. \ 1354 <dl> \ 1355 <dt> \ 1356 Each line logged by htsearch contains the following: \ 1357 </dt> \ 1358 <dd> \ 1359 REMOTE_ADDR [config] (match_method) [words] \ 1360 [logicalWords] (matches/matches_per_page) - \ 1361 page, HTTP_REFERER \ 1362 </dd> \ 1363 </dl> \ 1364 where any of the above are null or empty, it \ 1365 either puts in '-' or 'default' (for config). \ 1366 " }, \ 1367 { "maintainer", "bogus@unconfigured.htdig.user", \ 1368 "string", "htdig", "Server", "all", "Indexing:Out", "maintainer: ben.dover@uptight.com", " \ 1369 This should be the email address of the person in \ 1370 charge of the digging operation. This string is added \ 1371 to the user-agent: field when the digger sends a \ 1372 request to a server. \ 1373 " }, \ 1374 { "match_method", "and", \ 1375 "string", "htsearch", "", "3.0", "Searching:Method", "match_method: boolean", " \ 1376 This is the default method for matching that htsearch \ 1377 uses. The valid choices are: \ 1378 <ul> \ 1379 <li> or </li> \ 1380 <li> and </li> \ 1381 <li> boolean </li> \ 1382 </ul> \ 1383 This attribute will only be used if the HTML form that \ 1384 calls htsearch didn't have the \ 1385 <a href=\"hts_form.html#method\">method</a> value set. \ 1386 " }, \ 1387 { "matches_per_page", "10", \ 1388 "integer", "htsearch", "", "3.0", "Searching:Method", "matches_per_page: 999", " \ 1389 If this is set to a relatively small number, the \ 1390 matches will be shown in pages instead of all at once. \ 1391 This attribute will only be used if the HTML form that \ 1392 calls htsearch didn't have the \ 1393 <a href=\"hts_form.html#matchesperpage\">matchesperpage</a> value set. 
\ 1394 " }, \ 1395 { "max_connection_requests", "-1", \ 1396 "integer", "htdig", "", "3.2.0b1", "Indexing:Connection", "max_connection_requests: 100", " \ 1397 This attribute tells htdig to limit the number of requests it will \ 1398 send to a server using a single, persistent HTTP connection. This \ 1399 only applies when the \ 1400 <a href=\"#persistent_connections\">persistent_connections</a> \ 1401 attribute is set. You may set the limit as high as you want, \ 1402 but it must be at least 1. A value of -1 specifies no limit. \ 1403 Requests in the queue for a server will be combined until either \ 1404 the limit is reached, or the queue is empty. \ 1405 " }, \ 1406 { "max_description_length", "60", \ 1407 "integer", "htdig", "", "all", "Indexing:What", "max_description_length: 40", " \ 1408 While gathering descriptions of URLs, \ 1409 <a href=\"htdig.html\">htdig</a> will only record \ 1410 up to this many bytes of hyperlink descriptions for use in the \ 1411 <a href=\"hts_templates.html#DESCRIPTION\">DESCRIPTION</a> template \ 1412 variable. This is used mostly to deal with broken HTML. (If a \ 1413 hyperlink is not terminated with a </a> the \ 1414 description will go on until the end of the document.) \ 1415 " }, \ 1416 { "max_descriptions", "5", \ 1417 "integer", "htdig", "", "all", "Indexing:What", "max_descriptions: 1", " \ 1418 While gathering <a href=\"#description_factor\">descriptions</a> of \ 1419 URLs for the \ 1420 <a href=\"hts_templates.html#DESCRIPTIONS\">DESCRIPTIONS</a> template \ 1421 variable, <a href=\"htdig.html\">htdig</a> will only record up to this \ 1422 number of descriptions, in the order in which it encounters \ 1423 them. This is used to prevent the database entry for a document \ 1424 from growing out of control if the document has a huge number \ 1425 of links to it. <br> \ 1426 Note that all descriptions are used for indexing. 
\ 1427 " }, \ 1428 { "max_doc_size", "100000", \ 1429 "integer", "htdig", "URL", "3.0", "Indexing:What", "max_doc_size: 5000000", " \ 1430 This is the upper limit to the amount of data retrieved \ 1431 for documents (in bytes). This is mainly used to prevent \ 1432 unreasonable memory consumption since each document \ 1433 will be read into memory by <a href=\"htdig.html\"> \ 1434 htdig</a>. \ 1435 " }, \ 1436 { "max_excerpts", "1", \ 1437 "integer", "htsearch", "URL", "3.1.6", "Presentation:How", "max_excerpts: 10", " \ 1438 This value determines the maximum number of excerpts \ 1439 that can be displayed for one matching document in the \ 1440 search results. \ 1441 " }, \ 1442 { "max_head_length", "512", \ 1443 "integer", "htdig", "", "all", "Indexing:How", "max_head_length: 50000", " \ 1444 For each document retrieved, the top of the document is \ 1445 stored. This attribute determines the size of this \ 1446 block (in bytes). The text that will be stored is only the text; \ 1447 no markup is stored.<br> \ 1448 We found that storing 50,000 bytes will store about \ 1449 95% of all the documents completely. This really \ 1450 depends on how much storage is available and how much \ 1451 you want to show. Currently, this is must not be 0. \ 1452 " }, \ 1453 { "max_hop_count", "999999", \ 1454 "integer", "htdig", "", "all", "Indexing:Where", "max_hop_count: 4", " \ 1455 Instead of limiting the indexing process by URL \ 1456 pattern, it can also be limited by the number of hops \ 1457 or clicks a document is removed from the starting URL. \ 1458 <br> \ 1459 The starting page or pages will have hop count 0. \ 1460 " }, \ 1461 { "max_keywords", "-1", \ 1462 "integer", "htdig", "", "3.2.0b1", "Indexing:What", "max_keywords: 10", " \ 1463 This attribute can be used to limit the number of keywords \ 1464 per document that htdig will accept from meta keywords tags. \ 1465 A value of -1 or less means no limit. 
This can help combat meta \ 1466 keyword spamming, by limiting the amount of keywords that will be \ 1467 indexed, but it will not completely prevent irrelevant matches \ 1468 in a search if the first few keywords in an offending document \ 1469 are not relevant to its contents. \ 1470 " }, \ 1471 { "max_meta_description_length", "512", \ 1472 "integer", "htdig", "", "3.1.0b1", "Indexing:How", "max_meta_description_length: 1000", " \ 1473 While gathering descriptions from meta description tags, \ 1474 <a href=\"htdig.html\">htdig</a> will only store up to \ 1475 this much of the text (in bytes) for each document to fill the \ 1476 <a href=\"hts_templates.html#METADESCRIPTION\">METADESCRIPTION</a> \ 1477 template variable. All words in the meta description are still \ 1478 used for indexing. \ 1479 " }, \ 1480 { "max_prefix_matches", "1000", \ 1481 "integer", "htsearch", "", "3.1.0b1", "Searching:Method", "max_prefix_matches: 100", " \ 1482 The Prefix <a href=\"#search_algorithm\">fuzzy algorithm</a> \ 1483 could potentially match a \ 1484 very large number of words. This value limits the \ 1485 number of words each prefix can match. Note \ 1486 that this does not limit the number of documents that \ 1487 are matched in any way. \ 1488 " }, \ 1489 { "max_retries", "3", \ 1490 "integer", "htdig", "", "3.2.0b1", "Indexing:Connection", "max_retries: 6", " \ 1491 This option set the maximum number of retries when retrieving a document \ 1492 fails (mainly for reasons of connection). \ 1493 " }, \ 1494 { "max_stars", "4", \ 1495 "integer", "htsearch", "", "all", "Presentation:How", "max_stars: 6", " \ 1496 When stars are used to display the score of a match, \ 1497 this value determines the maximum number of stars that \ 1498 can be displayed. 
\ 1499 " }, \ 1500 { "maximum_page_buttons", "${maximum_pages}", \ 1501 "integer", "htsearch", "", "3.2.0b3", "Presentation:How", "maximum_page_buttons: 20", " \ 1502 This value limits the number of page links that will be \ 1503 included in the page list at the bottom of the search \ 1504 results page. By default, it takes on the value of the \ 1505 <a href=\"#maximum_pages\">maximum_pages</a> \ 1506 attribute, but you can set it to something lower to allow \ 1507 more pages than buttons. In this case, pages above this \ 1508 number will have no corresponding button. \ 1509 " }, \ 1510 { "maximum_pages", "10", \ 1511 "integer", "htsearch", "", "all", "Presentation:How", "maximum_pages: 20", " \ 1512 This value limits the number of page links that will be \ 1513 included in the page list at the bottom of the search \ 1514 results page. As of version 3.1.4, this will limit the \ 1515 total number of matching documents that are shown. \ 1516 You can make the number of page buttons smaller than the \ 1517 number of allowed pages by setting the \ 1518 <a href=\"#maximum_page_buttons\">maximum_page_buttons</a> \ 1519 attribute. \ 1520 " }, \ 1521 { "maximum_word_length", "32", \ 1522 "integer", "htdig htsearch htfuzzy", "", "3.1.3", "Indexing:What", "maximum_word_length: 15", " \ 1523 This sets the maximum length of words that will be \ 1524 indexed. Words longer than this value will be silently \ 1525 truncated when put into the index, or searched in the \ 1526 index. \ 1527 " }, \ 1528 { "md5_db", "${database_base}.md5hash.db", \ 1529 "string", "htdig", "", "3.2.0b3", "File Layout", "md5_db: ${database_base}.md5.db", " \ 1530 This file holds a database of md5 and date hashes of pages to \ 1531 catch and eliminate duplicates of pages. See also the \ 1532 <a href=\"#check_unique_md5\">check_unique_md5</a> and \ 1533 <a href=\"#check_unique_date\">check_unique_date</a> attributes. 
\ 1534 " }, \ 1535 { "meta_description_factor", "50", \ 1536 "number", "htsearch", "", "3.1.0b1", "Searching:Ranking", "meta_description_factor: 20", " \ 1537 This is a factor which will be used to multiply the \ 1538 weight of words in any META description tags in a document. \ 1539 The number may be a floating point number. See also the \ 1540 <a href=\"#heading_factor\">heading_factor</a> attribute and the \ 1541 <a href=\"#description_factor\">description_factor</a> attribute. \ 1542 " }, \ 1543 { "metaphone_db", "${database_base}.metaphone.db", \ 1544 "string", "htfuzzy htsearch", "", "all", "File Layout", "metaphone_db: ${database_base}.mp.db", " \ 1545 The database file used for the fuzzy \"metaphone\" search \ 1546 algorithm. This database is created by \ 1547 <a href=\"htfuzzy.html\">htfuzzy</a> and used by \ 1548 <a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \ 1549 " }, \ 1550 { "method_names", "and All or Any boolean Boolean", \ 1551 "quoted string list", "htsearch", "", "all", "Searching:UI", "method_names: or Or and And", " \ 1552 These values are used to create the <strong> \ 1553 method</strong> menu. It consists of pairs. The first \ 1554 element of each pair is one of the known methods, the \ 1555 second element is the text that will be shown in the \ 1556 menu for that method. This text needs to be quoted if \ 1557 it contains spaces. \ 1558 See the <a href=\"hts_selectors.html\">select list documentation</a> \ 1559 for more information on how this attribute is used. \ 1560 " }, \ 1561 { "mime_types", "${config_dir}/mime.types", \ 1562 "string", "htdig", "", "3.2.0b1", "Indexing:Where", "mime_types: /etc/mime.types", " \ 1563 This file is used by htdig for local file access and resolving \ 1564 file:// URLs to ensure the files are parsable. If you are running \ 1565 a webserver with its own MIME file, you should set this attribute \ 1566 to point to that file. 
\ 1567 <p> \ 1568 See also <a href=\"#content_classifier\">content_classifier</a>.\ 1569 "}, \ 1570 { "minimum_prefix_length", "1", \ 1571 "integer", "htsearch", "", "3.1.0b1", "Searching:Method", "minimum_prefix_length: 2", " \ 1572 This sets the minimum length of prefix matches used by the \ 1573 \"prefix\" fuzzy matching algorithm. Words shorter than this \ 1574 will not be used in prefix matching. \ 1575 " }, \ 1576 { "minimum_speling_length", "5", \ 1577 "integer", "htsearch", "", "3.2.0b1", "Searching:Method", "minimum_speling_length: 3", " \ 1578 This sets the minimum length of words used by the \ 1579 \"speling\" fuzzy matching algorithm. Words shorter than this \ 1580 will not be used in this fuzzy matching. \ 1581 " }, \ 1582 { "minimum_word_length", "3", \ 1583 "integer", "htdig htsearch", "", "all", "Indexing:What", "minimum_word_length: 2", " \ 1584 This sets the minimum length of words that will be \ 1585 indexed. Words shorter than this value will be silently \ 1586 ignored but still put into the excerpt.<br> \ 1587 Note that by making this value less than 3, a lot more \ 1588 words that are very frequent will be indexed. It might \ 1589 be advisable to add some of these to the \ 1590 <a href=\"#bad_word_list\">bad_words list</a>. \ 1591 " }, \ 1592 { "multimatch_factor", "1", \ 1593 "number", "htsearch", "", "3.1.6", "Searching:Ranking", "multimatch_factor: 1000", " \ 1594 This factor gives higher rankings to documents that have more than \ 1595 one matching search word when the <strong>or</strong> \ 1596 <a href=\"#match_method\">match_method</a> is used. \ 1597 In version 3.1.6, the matching words' combined scores were multiplied \ 1598 by this factor for each additional matching word. Currently, this \ 1599 multiplier is applied at most once. 
\ 1600 " }, 1601 { "next_page_text", "[next]", \ 1602 "string", "htsearch", "", "3.1.0", "Presentation:Text", "next_page_text: <img src=\"/htdig/buttonr.gif\">", " \ 1603 The text displayed in the hyperlink to go to the next \ 1604 page of matches. \ 1605 " }, \ 1606 { "no_excerpt_show_top", "false", \ 1607 "boolean", "htsearch", "", "3.1.0b3", "Presentation:How", "no_excerpt_show_top: yes", " \ 1608 If no excerpt is available, this option will act the \ 1609 same as <a \ 1610 href=\"#excerpt_show_top\">excerpt_show_top</a>, that is, \ 1611 it will show the top of the document. \ 1612 " }, \ 1613 { "no_excerpt_text", "<em>(None of the search words were found in the top of this document.)</em>", \ 1614 "string", "htsearch", "", "3.0", "Presentation:Text", "no_excerpt_text:", " \ 1615 This text will be displayed in place of the excerpt if \ 1616 there is no excerpt available. If this attribute is set \ 1617 to nothing (blank), the excerpt label will not be \ 1618 displayed in this case. \ 1619 " }, \ 1620 { "no_next_page_text", "${next_page_text}", \ 1621 "string", "htsearch", "", "3.0", "Presentation:Text", "no_next_page_text:", " \ 1622 The text displayed where there would normally be a \ 1623 hyperlink to go to the next page of matches. \ 1624 " }, \ 1625 { "no_page_list_header", "", \ 1626 "string", "htsearch", "", "3.0", "Presentation:Text", "no_page_list_header: <hr noshade size=2>All results on this page.<br>", " \ 1627 This text will be used as the value of the PAGEHEADER \ 1628 variable, for use in templates or the \ 1629 <a href=\"#search_results_footer\">search_results_footer</a> \ 1630 file, when all search results fit on a single page. 
\ 1631 " }, \ 1632 { "no_page_number_text", "", \ 1633 "quoted string list", "htsearch", "", "3.0", "Presentation:Text", "no_page_number_text: \ 1634 <strong>1</strong> <strong>2</strong> \\<br> \ 1635 <strong>3</strong> <strong>4</strong> \\<br> \ 1636 <strong>5</strong> <strong>6</strong> \\<br> \ 1637 <strong>7</strong> <strong>8</strong> \\<br> \ 1638 <strong>9</strong> <strong>10</strong> \ 1639 ", " \ 1640 The text strings in this list will be used when putting \ 1641 together the PAGELIST variable, for use in templates or \ 1642 the <a href=\"#search_results_footer\">search_results_footer</a> \ 1643 file, when search results fit on more than page. The PAGELIST \ 1644 is the list of links at the bottom of the search results page. \ 1645 There should be as many strings in the list as there are \ 1646 pages allowed by the <a href=\"#maximum_page_buttons\">maximum_page_buttons</a> \ 1647 attribute. If there are not enough, or the list is empty, \ 1648 the page numbers alone will be used as the text for the links. \ 1649 An entry from this list is used for the current page, as the \ 1650 current page is shown in the page list without a hypertext link, \ 1651 while entries from the <a href=\"#page_number_text\"> \ 1652 page_number_text</a> list are used for the links to other pages. \ 1653 The text strings can contain HTML tags to highlight page numbers \ 1654 or embed images. The strings need to be quoted if they contain \ 1655 spaces. \ 1656 " }, \ 1657 { "no_prev_page_text", "${prev_page_text}", \ 1658 "string", "htsearch", "", "3.0", "Presentation:Text", "no_prev_page_text:", " \ 1659 The text displayed where there would normally be a \ 1660 hyperlink to go to the previous page of matches. \ 1661 " }, \ 1662 { "no_title_text", "filename", \ 1663 "string", "htsearch", "", "3.1.0", "Presentation:Text", "no_title_text: \"No Title Found\"", " \ 1664 This specifies the text to use in search results when no \ 1665 title is found in the document itself. 
If it is set to \ 1666 filename, htsearch will use the name of the file itself, \ 1667 enclosed in brackets (e.g. [index.html]). \ 1668 " }, \ 1669 { "noindex_end", "<!--/htdig_noindex--> </SCRIPT>", \ 1670 "quoted string list", "htdig", "", "3.1.0", "Indexing:What", "noindex_end: </SCRIPT>", " \ 1671 This string marks the end of a section of an HTML file that should be \ 1672 completely ignored when indexing. Note that text between noindex_start\ 1673 and noindex_end isn't even counted as white space; the text \ 1674 \"<code>foo<!--htdig_noindex-->something<!--/htdig_noindex-->bar</code>\" \ 1675 matches the word \"foobar\", not the phrase \"foo bar\". White space \ 1676 following noindex_end <em>is</em> counted as white space. See also \ 1677 <a href=\"#noindex_start\">noindex_start</a>. \ 1678 " }, \ 1679 { "noindex_start", "<!--htdig_noindex--> <SCRIPT", \ 1680 "quoted string list", "htdig", "", "3.1.0", "Indexing:What", "noindex_start: <SCRIPT", " \ 1681 These strings mark the start of a section of an HTML file that should \ 1682 be completely ignored when indexing. They work together with \ 1683 <a href=\"#noindex_end\">noindex_end</a>. Once a string in \ 1684 noindex_start is found, text is ignored until the string at the \ 1685 <em>same position</em> within <a href=\"#noindex_end\">noindex_end</a> \ 1686 is encountered. The sections marked off this way cannot overlap. \ 1687 As in the first default pattern, this can be SGML comment \ 1688 declarations that can be inserted anywhere in the documents to exclude \ 1689 different sections from being indexed. However, existing tags can also \ 1690 be used; this is especially useful to exclude some sections from being \ 1691 indexed where the files to be indexed can not be edited. 
The second \ 1692 default pattern shows how SCRIPT sections in 'uneditable' documents \ 1693 can be skipped; note how noindex_start does not contain an ending \ 1694 >: this allows for all SCRIPT tags to be matched regardless of \ 1695 attributes defined (different types or languages). \ 1696 Note that the match for this string is case insensitive. \ 1697 " }, \ 1698 { "nothing_found_file", "${common_dir}/nomatch.html", \ 1699 "string", "htsearch", "", "all", "Presentation:Files", "nothing_found_file: /www/searching/nothing.html", " \ 1700 This specifies the file which contains the <code> \ 1701 HTML</code> text to display when no matches were found. \ 1702 The file should contain a complete <code>HTML</code> \ 1703 document.<br> \ 1704 Note that this attribute could also be defined in \ 1705 terms of <a href=\"#database_base\">database_base</a> to \ 1706 make is specific to the current search database. \ 1707 " }, \ 1708 { "nph", "false", \ 1709 "boolean", "htsearch", "", "3.2.0b2", "Presentation:How", "nph: true", " \ 1710 This attribute determines whether htsearch sends out full HTTP \ 1711 headers as required for an NPH (non-parsed header) CGI. Some \ 1712 servers assume CGIs will act in this fashion, for example MS \ 1713 IIS. If your server does not send out full HTTP headers, you \ 1714 should set this to true. \ 1715 " }, \ 1716 { "page_list_header", "<hr noshade size=2>Pages:<br>", \ 1717 "string", "htsearch", "", "3.0", "Presentation:Text", "page_list_header:", " \ 1718 This text will be used as the value of the PAGEHEADER \ 1719 variable, for use in templates or the \ 1720 <a href=\"#search_results_footer\">search_results_footer</a> \ 1721 file, when all search results fit on more than one page. 
\ 1722 " }, \ 1723 { "page_number_separator", "\" \"", \ 1724 "quoted string list", "htsearch", "", "3.1.4", "Presentation:Text", "page_number_separator: \"</td> <td>\"", " \ 1725 The text strings in this list will be used when putting \ 1726 together the PAGELIST variable, for use in templates or \ 1727 the <a href=\"#search_results_footer\">search_results_footer</a> \ 1728 file, when search results fit on more than page. The PAGELIST \ 1729 is the list of links at the bottom of the search results page. \ 1730 The strings in the list will be used in rotation, and will \ 1731 separate individual entries taken from \ 1732 <a href=\"#page_number_text\">page_number_text</a> and \ 1733 <a href=\"#no_page_number_text\">no_page_number_text</a>. \ 1734 There can be as many or as few strings in the list as you like. \ 1735 If there are not enough for the number of pages listed, it goes \ 1736 back to the start of the list. If the list is empty, a space is \ 1737 used. The text strings can contain HTML tags. The strings need \ 1738 to be quoted if they contain spaces, or to specify an empty string. \ 1739 " }, \ 1740 { "page_number_text", "", \ 1741 "quoted string list", "htsearch", "", "3.0", "Presentation:Text", "page_number_text: \ 1742 <em>1</em> <em>2</em> \\<br> \ 1743 <em>3</em> <em>4</em> \\<br> \ 1744 <em>5</em> <em>6</em> \\<br> \ 1745 <em>7</em> <em>8</em> \\<br> \ 1746 <em>9</em> <em>10</em> \ 1747 ", " \ 1748 The text strings in this list will be used when putting \ 1749 together the PAGELIST variable, for use in templates or \ 1750 the <a href=\"#search_results_footer\">search_results_footer</a> \ 1751 file, when search results fit on more than page. The PAGELIST \ 1752 is the list of links at the bottom of the search results page. \ 1753 There should be as many strings in the list as there are \ 1754 pages allowed by the <a href=\"#maximum_page_buttons\">maximum_page_buttons</a> \ 1755 attribute. 
If there are not enough, or the list is empty, \ 1756 the page numbers alone will be used as the text for the links. \ 1757 Entries from this list are used for the links to other pages, \ 1758 while an entry from the <a href=\"#no_page_number_text\"> \ 1759 no_page_number_text</a> list is used for the current page, as the \ 1760 current page is shown in the page list without a hypertext link. \ 1761 The text strings can contain HTML tags to highlight page numbers \ 1762 or embed images. The strings need to be quoted if they contain \ 1763 spaces. \ 1764 " }, \ 1765 { "persistent_connections", "true", \ 1766 "boolean", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "persistent_connections: false", " \ 1767 If set to true, when servers make it possible, htdig can take advantage \ 1768 of persistent connections, as defined by HTTP/1.1 (<em>RFC2616</em>). This permits \ 1769 to reduce the number of open/close operations of connections, when retrieving \ 1770 a document with HTTP. \ 1771 " }, \ 1772 { "plural_suffix", "s", \ 1773 "string", "htsearch", "", "3.2.0b2", "Presentation: Text", "plural_suffix: en", " \ 1774 Specifies the value of the PLURAL_MATCHES template \ 1775 variable used in the header, footer and template files. \ 1776 This can be used for localization for non-English languages \ 1777 where 's' is not the appropriate suffix. \ 1778 " }, \ 1779 { "prefix_match_character", "*", \ 1780 "string", "htsearch", "", "3.1.0b1", "Searching:Method", "prefix_match_character: ing", " \ 1781 A null prefix character means that prefix matching should be \ 1782 applied to every search word. Otherwise prefix matching is \ 1783 done on any search word ending with the characters specified \ 1784 in this string, with the string being stripped off before \ 1785 looking for matches. The \"prefix\" algorithm must be enabled \ 1786 in <a href=\"#search_algorithm\">search_algorithm</a> \ 1787 for this to work. 
You may also want to set the <a \ 1788 href=\"#max_prefix_matches\">max_prefix_matches</a> and <a \ 1789 href=\"#minimum_prefix_length\">minimum_prefix_length</a> attributes \ 1790 to get it working as you want.<br> As a special case, in version \ 1791 3.1.6 and later, if this string is non-null and is entered alone \ 1792 as a search word, it is taken as a wildcard that matches all \ 1793 documents in the database. If this string is null, the wildcard \ 1794 for this special case will be <strong>*</strong>. This wildcard \ 1795 doesn't require the prefix algorithm to be enabled. \ 1796 " }, \ 1797 { "prev_page_text", "[prev]", \ 1798 "string", "htsearch", "", "3.0", "Presentation:Text", "prev_page_text: <img src=\"/htdig/buttonl.gif\">", " \ 1799 The text displayed in the hyperlink to go to the \ 1800 previous page of matches. \ 1801 " }, \ 1802 { "regex_max_words", "25", \ 1803 "integer", "htsearch", "", "3.2.0b1", "Searching:Method", "regex_max_words: 10", " \ 1804 The \"regex\" <a href=\"#search_algorithm\">fuzzy algorithm</a> \ 1805 could potentially match a \ 1806 very large number of words. This value limits the \ 1807 number of words each regular expression can match. Note \ 1808 that this does not limit the number of documents that \ 1809 are matched in any way. \ 1810 " }, \ 1811 { "remove_bad_urls", "true", \ 1812 "boolean", "htpurge", "Server", "all", "Indexing:How", "remove_bad_urls: true", " \ 1813 If TRUE, htpurge will remove any URLs which were marked \ 1814 as unreachable by htdig from the database. If FALSE, it \ 1815 will not do this. When htdig is run in initial mode, \ 1816 documents which were referred to but could not be \ 1817 accessed should probably be removed, and hence this \ 1818 option should then be set to TRUE, however, if htdig is \ 1819 run to update the database, this may cause documents on \ 1820 a server which is temporarily unavailable to be \ 1821 removed. 
This is probably NOT what was intended, so \ 1822 hence this option should be set to FALSE in that case. \ 1823 " }, \ 1824 { "remove_default_doc", "index.html", \ 1825 "string list", "htdig", "", "3.1.0", "Indexing:How", "remove_default_doc: default.html default.htm index.html index.htm", " \ 1826 Set this to the default documents in a directory used by the \ 1827 servers you are indexing. These document names will be stripped \ 1828 off of URLs when they are normalized, if one of these names appears \ 1829 after the final slash, to translate URLs like \ 1830 http://foo.com/index.html into http://foo.com/<br> \ 1831 Note that you can disable stripping of these names during \ 1832 normalization by setting the list to an empty string. \ 1833 The list should only contain names that all servers you index \ 1834 recognize as default documents for directory URLs, as defined \ 1835 by the DirectoryIndex setting in Apache's srm.conf, for example. \ 1836 This does not apply to file:/// or ftp:// URLS. \ 1837 <br>See also <a href=\"#local_default_doc\">local_default_doc</a>. \ 1838 " }, \ 1839 { "remove_unretrieved_urls", "false", \ 1840 "boolean", "htpurge", "Server", "3.2.0b1", "Indexing:How", "remove_unretrieved_urls: true", " \ 1841 If TRUE, htpurge will remove any URLs which were discovered \ 1842 and included as stubs in the database but not yet retrieved. If FALSE, it \ 1843 will not do this. When htdig is run in initial mode with no restrictions \ 1844 on hopcount or maximum documents, these should probably be removed and set \ 1845 to true. However, if you are hoping to index a small set of documents and \ 1846 eventually get to the rest, you should probably leave this as false. 
\ 1847 " }, \ 1848 { "restrict", "", \ 1849 "pattern list", "htsearch", "", "3.2.0b4", "Searching:Method", "restrict: http://www.acme.com/widgets/", " \ 1850 This specifies a set of patterns that all URLs have to \ 1851 match against in order for them to be included in the search \ 1852 results. Any number of strings can be specified, separated by \ 1853 spaces. If multiple patterns are given, at least one of the \ 1854 patterns has to match the URL. The list can be specified \ 1855 from within the configuration file, and can be overridden \ 1856 with the \"restrict\" input parameter in the search form. Note \ 1857 that the restrict list does not take precedence over the \ 1858 <a href=\"#exclude\">exclude</a> list - if a URL matches patterns \ 1859 in both lists it is still excluded from the search results. \ 1860 <br>To restrict URLs in htdig, use \ 1861 <a href=\"#limit_urls_to\">limit_urls_to</a>. \ 1862 " }, \ 1863 { "robotstxt_name", "htdig", \ 1864 "string", "htdig", "Server", "3.0.7", "Indexing:Out", "robotstxt_name: myhtdig", " \ 1865 Sets the name that htdig will look for when parsing \ 1866 robots.txt files. This can be used to make htdig appear \ 1867 as a different spider than ht://Dig. Useful to \ 1868 distinguish between a private and a global index. \ 1869 " }, \ 1870 { "script_name", "", \ 1871 "string", "htsearch", "", "3.1.4", "Presentation:Text", "script_name: /search/results.shtml", " \ 1872 Overrides the value of the SCRIPT_NAME \ 1873 environment attribute. This is useful if \ 1874 htsearch is not being called directly as a CGI \ 1875 program, but indirectly from within a dynamic \ 1876 .shtml page using SSI directives. Previously, \ 1877 you needed a wrapper script to do this, but \ 1878 this configuration attribute makes wrapper \ 1879 scripts obsolete for SSI and possibly for \ 1880 other server scripting languages, as \ 1881 well. 
(You still need a wrapper script when \ 1882 using PHP, though.)<br> \ 1883 Check out the <code>contrib/scriptname</code> \ 1884 directory for a small example. Note that this \ 1885 attribute also affects the value of the <a \ 1886 href=\"hts_templates.html#CGI\">CGI</a> variable \ 1887 used in htsearch templates. \ 1888 " }, \ 1889 { "search_algorithm", "exact:1", \ 1890 "string list", "htsearch", "", "all", "Searching:Method", "search_algorithm: exact:1 soundex:0.3", " \ 1891 Specifies the search algorithms and their weight to use \ 1892 when searching. Each entry in the list consists of the \ 1893 algorithm name, followed by a colon (:) followed by a \ 1894 weight multiplier. The multiplier is a floating point \ 1895 number between 0 and 1. Note that depending on your \ 1896 <a href=\"#locale\">locale</a> setting, and whether your \ 1897 system's locale implementation affects floating point \ 1898 input, you may need to specify the decimal point as a \ 1899 comma rather than a period.<br> \ 1900 <strong>Note:</strong>If the exact \ 1901 method is not listed, the search may not work since the \ 1902 original terms will not be used.<br> \ 1903 Current algorithms supported are: \ 1904 <dl> \ 1905 <dt> \ 1906 exact \ 1907 </dt> \ 1908 <dd> \ 1909 The default exact word matching algorithm. This \ 1910 will find only exactly matched words. \ 1911 </dd> \ 1912 <dt> \ 1913 soundex \ 1914 </dt> \ 1915 <dd> \ 1916 Uses a slightly modified <a href=\"http://www.sog.org.uk/cig/vol6/605tdrake.pdf\">soundex</a> algorithm to match \ 1917 words. This requires that the soundex database be \ 1918 present. It is generated with the \ 1919 <a href=\"htfuzzy.html\">htfuzzy</a> program. \ 1920 </dd> \ 1921 <dt> \ 1922 metaphone \ 1923 </dt> \ 1924 <dd> \ 1925 Uses the metaphone algorithm for matching words. \ 1926 This algorithm is more specific to the english \ 1927 language than soundex. 
It requires the metaphone \ 1928 database, which is generated with the <a \ 1929 href=\"htfuzzy.html\">htfuzzy</a> program. \ 1930 </dd> \ 1931 <dt> \ 1932 accents \ 1933 </dt> \ 1934 <dd> \ 1935 Uses the accents algorithm for matching words. \ 1936 This algorithm will treat all accented letters \ 1937 as equivalent to their unaccented counterparts. \ 1938 It requires the accents database, which is \ 1939 generated with the <a \ 1940 href=\"htfuzzy.html\">htfuzzy</a> program. \ 1941 </dd> \ 1942 <dt> \ 1943 endings \ 1944 </dt> \ 1945 <dd> \ 1946 This algorithm uses language specific word endings \ 1947 to find matches. Each word is first reduced to its \ 1948 word root and then all known legal endings are used \ 1949 for the matching. This algorithm uses two databases \ 1950 which are generated with <a href=\"htfuzzy.html\"> \ 1951 htfuzzy</a>. \ 1952 </dd> \ 1953 <dt> \ 1954 synonyms \ 1955 </dt> \ 1956 <dd> \ 1957 Performs a dictionary lookup on all the words. This \ 1958 algorithm uses a database generated with the <a \ 1959 href=\"htfuzzy.html\">htfuzzy</a> program. \ 1960 </dd> \ 1961 <dt> \ 1962 substring \ 1963 </dt> \ 1964 <dd> \ 1965 Matches all words containing the queries as \ 1966 substrings. Since this requires checking every word in \ 1967 the database, this can really slow down searches \ 1968 considerably. \ 1969 <dd> \ 1970 <dt> \ 1971 prefix \ 1972 </dt> \ 1973 <dd> \ 1974 Matches all words beginning with the query \ 1975 strings. Uses the option <a \ 1976 href=\"#prefix_match_character\">prefix_match_character</a> \ 1977 to decide whether a query requires prefix \ 1978 matching. For example \"abc*\" would perform prefix \ 1979 matching on \"abc\" since * is the default \ 1980 prefix_match_character. \ 1981 </dd> \ 1982 <dt> \ 1983 regex \ 1984 </dt> \ 1985 <dd> \ 1986 Matches all words that match the patterns given as regular \ 1987 expressions. 
Since this requires checking every word in \ 1988 the database, this can really slow down searches \ 1989 considerably. The config file used for searching \ 1990 must have the regex meta-characters (^$\\[-]|.*) \ 1991 included in <a href=\"#extra_word_characters\">extra_word_characters</a>, \ 1992 while the config file used for digging should not.\ 1993 </dd> \ 1994 <dt> \ 1995 speling \ 1996 </dt> \ 1997 <dd> \ 1998 A simple fuzzy algorithm that tries to find one-off spelling \ 1999 mistakes, such as transposition of two letters or an extra character. \ 2000 Since this usually generates just a few possibilities, it is \ 2001 relatively quick. \ 2002 </dd> \ 2003 </dl> \ 2004 " }, \ 2005 { "search_results_contenttype", "text/html", \ 2006 "string", "htsearch", "", "all", "Presentation:Files", "search_results_contenttype: text/xml", " \ 2007 This specifies a Content-type to be output as an HTTP header \ 2008 at the start of search results. If set to an empty string, \ 2009 the Content-type header will be omitted altogether. \ 2010 " }, 2011 { "search_results_footer", "${common_dir}/footer.html", \ 2012 "string", "htsearch", "", "all", "Presentation:Files", "search_results_footer: /usr/local/etc/ht/end-stuff.html", " \ 2013 This specifies a filename to be output at the end of \ 2014 search results. While outputting the footer, some \ 2015 variables will be expanded. Variables use the same \ 2016 syntax as the Bourne shell. If there is a variable VAR, \ 2017 the following will all be recognized: \ 2018 <ul> \ 2019 <li> \ 2020 $VAR \ 2021 </li> \ 2022 <li> \ 2023 $(VAR) \ 2024 </li> \ 2025 <li> \ 2026 ${VAR} \ 2027 </li> \ 2028 </ul> \ 2029 The following variables are available. See \ 2030 <a href=\"hts_template.html\">hts_template.html</a> for a complete \ 2031 list. \ 2032 <dl> \ 2033 <dt> \ 2034 MATCHES \ 2035 </dt> \ 2036 <dd> \ 2037 The number of documents that were matched. 
\ 2038 </dd> \ 2039 <dt> \ 2040 PLURAL_MATCHES \ 2041 </dt> \ 2042 <dd> \ 2043 If MATCHES is not 1, this will be the string \"s\", \ 2044 else it is an empty string. This can be used to say \ 2045 something like \"$(MATCHES) \ 2046 document$(PLURAL_MATCHES) were found\" \ 2047 </dd> \ 2048 <dt> \ 2049 MAX_STARS \ 2050 </dt> \ 2051 <dd> \ 2052 The value of the <a href=\"#max_stars\">max_stars</a> \ 2053 attribute. \ 2054 </dd> \ 2055 <dt> \ 2056 LOGICAL_WORDS \ 2057 </dt> \ 2058 <dd> \ 2059 A string of the search words with either \"and\" or \ 2060 \"or\" between the words, depending on the type of \ 2061 search. \ 2062 </dd> \ 2063 <dt> \ 2064 WORDS \ 2065 </dt> \ 2066 <dd> \ 2067 A string of the search words with spaces in \ 2068 between. \ 2069 </dd> \ 2070 <dt> \ 2071 PAGEHEADER \ 2072 </dt> \ 2073 <dd> \ 2074 This expands to either the value of the \ 2075 <a href=\"#page_list_header\">page_list_header</a> or \ 2076 <a href=\"#no_page_list_header\">no_page_list_header</a> \ 2077 attribute depending on how many pages there are. \ 2078 </dd> \ 2079 </dl> \ 2080 Note that this file will <strong>NOT</strong> be output \ 2081 if no matches were found. In this case the \ 2082 <a href=\"#nothing_found_file\">nothing_found_file</a> \ 2083 attribute is used instead. \ 2084 Also, this file will not be output if it is \ 2085 overridden by defining the \ 2086 <a href=\"#search_results_wrapper\">search_results_wrapper</a> \ 2087 attribute. \ 2088 " }, \ 2089 { "search_results_header", "${common_dir}/header.html", \ 2090 "string", "htsearch", "", "all", "Presentation:Files", "search_results_header: /usr/local/etc/ht/start-stuff.html", " \ 2091 This specifies a filename to be output at the start of \ 2092 search results. While outputting the header, some \ 2093 variables will be expanded. Variables use the same \ 2094 syntax as the Bourne shell. 
If there is a variable VAR, \ 2095 the following will all be recognized: \ 2096 <ul> \ 2097 <li> \ 2098 $VAR \ 2099 </li> \ 2100 <li> \ 2101 $(VAR) \ 2102 </li> \ 2103 <li> \ 2104 ${VAR} \ 2105 </li> \ 2106 </ul> \ 2107 The following variables are available. See \ 2108 <a href=\"hts_template.html\">hts_template.html</a> for a complete \ 2109 list. \ 2110 <!-- Do these need to be listed for both _footer and _header? --> \ 2111 <dl> \ 2112 <dt> \ 2113 MATCHES \ 2114 </dt> \ 2115 <dd> \ 2116 The number of documents that were matched. \ 2117 </dd> \ 2118 <dt> \ 2119 PLURAL_MATCHES \ 2120 </dt> \ 2121 <dd> \ 2122 If MATCHES is not 1, this will be the string \"s\", \ 2123 else it is an empty string. This can be used to say \ 2124 something like \"$(MATCHES) \ 2125 document$(PLURAL_MATCHES) were found\" \ 2126 </dd> \ 2127 <dt> \ 2128 MAX_STARS \ 2129 </dt> \ 2130 <dd> \ 2131 The value of the <a href=\"#max_stars\">max_stars</a> \ 2132 attribute. \ 2133 </dd> \ 2134 <dt> \ 2135 LOGICAL_WORDS \ 2136 </dt> \ 2137 <dd> \ 2138 A string of the search words with either \"and\" or \ 2139 \"or\" between the words, depending on the type of \ 2140 search. \ 2141 </dd> \ 2142 <dt> \ 2143 WORDS \ 2144 </dt> \ 2145 <dd> \ 2146 A string of the search words with spaces in \ 2147 between. \ 2148 </dd> \ 2149 </dl> \ 2150 Note that this file will <strong>NOT</strong> be output \ 2151 if no matches were found. In this case the \ 2152 <a href=\"#nothing_found_file\">nothing_found_file</a> \ 2153 attribute is used instead. \ 2154 Also, this file will not be output if it is \ 2155 overridden by defining the \ 2156 <a href=\"#search_results_wrapper\">search_results_wrapper</a> \ 2157 attribute. \ 2158 " }, \ 2159 { "search_results_order", "", \ 2160 "string list", "htsearch", "", "3.2.0b2", "Searching:Ranking", "search_results_order: \ 2161 /docs/|faq.html * /maillist/ /testresults/", " \ 2162 This specifies a list of patterns for URLs in \ 2163 search results. 
Results will be displayed in the \ 2164 specified order, with the search algorithm result \ 2165 as the second order. Remaining areas, that do not \ 2166 match any of the specified patterns, can be placed \ 2167 by using * as the pattern. If no * is specified, \ 2168 one will be implicitly placed at the end of the \ 2169 list.<br> \ 2170 See also <a href=\"#url_seed_score\">url_seed_score</a>. \ 2171 " }, \ 2172 { "search_results_wrapper", "", \ 2173 "string", "htsearch", "", "3.1.0", "Presentation:Files", "search_results_wrapper: ${common_dir}/wrapper.html", " \ 2174 This specifies a filename to be output at the start and \ 2175 end of search results. This file replaces the \ 2176 <a href=\"#search_results_header\">search_results_header</a> and \ 2177 <a href=\"#search_results_footer\">search_results_footer</a> \ 2178 files, with the contents of both in one file, and uses the \ 2179 pseudo-variable <strong>$(HTSEARCH_RESULTS)</strong> as a \ 2180 separator for the header and footer sections. \ 2181 If the filename is not specified, the file is unreadable, \ 2182 or the pseudo-variable above is not found, htsearch reverts \ 2183 to the separate header and footer files instead. \ 2184 While outputting the wrapper, \ 2185 some variables will be expanded, just as for the \ 2186 <a href=\"#search_results_header\">search_results_header</a> and \ 2187 <a href=\"#search_results_footer\">search_results_footer</a> \ 2188 files.<br> \ 2189 Note that this file will <strong>NOT</strong> be output \ 2190 if no matches were found. In this case the \ 2191 <a href=\"#nothing_found_file\">nothing_found_file</a> \ 2192 attribute is used instead. 
\ 2193 " }, \ 2194 { "search_rewrite_rules", "", 2195 "string list", "htsearch", "", "3.1.6", "URLs", "search_rewrite_rules: http://(.*)\\\\.mydomain\\\\.org/([^/]*) http://\\\\2.\\\\1.com \\<br> \ 2196 http://www\\\\.myschool\\\\.edu/myorgs/([^/]*) http://\\\\1.org", " \ 2197 This is a list of pairs, <em>regex</em> <em>replacement</em>, used \ 2198 to rewrite URLs in the search results. The left hand string is a \ 2199 regular expression; the right hand string is a literal string with \ 2200 embedded placeholders for fragments that matched inside brackets in \ 2201 the regular expression. \\0 is the whole matched string, \\1 to \\9 \ 2202 are bracketted substrings. The backslash must be doubled-up in the \ 2203 attribute setting to get past the variable expansion parsing. Rewrite \ 2204 rules are applied sequentially to each URL before it is displayed \ 2205 or checked against the <a href=\"#restrict\">restrict</a> or \ 2206 <a href=\"#exclude\">exclude</a> lists. Rewriting does not stop once a \ 2207 match has been made, so multiple rules may affect a given URL. See \ 2208 also <a href=\"#url_part_aliases\">url_part_aliases</a> which allows \ 2209 URLs to be of one form during indexing and translated for results, \ 2210 and <a href=\"#url_rewrite_rules\">url_rewrite_rules</a> which allows \ 2211 URLs to be rewritten while indexing. \ 2212 " }, 2213 { "server_aliases", "", \ 2214 "string list", "htdig", "", "3.1.0b2", "Indexing:Where", "server_aliases: \ 2215 foo.mydomain.com:80=www.mydomain.com:80 \\<br> \ 2216 bar.mydomain.com:80=www.mydomain.com:80 \ 2217 ", " \ 2218 This attribute tells the indexer that servers have several \ 2219 DNS aliases, which all point to the same machine and are NOT \ 2220 virtual hosts. This allows you to ensure pages are indexed \ 2221 only once on a given machine, despite the alias used in a URL. 
\ 2222 As shown in the example, the mapping goes from left to right, \ 2223 so the server name on the right hand side is the one that is \ 2224 used. As of version 3.1.3, the port number is optional, and is \ 2225 assumed to be 80 if omitted. There is no easy way to map all \ 2226 ports from one alias to another without listing them all. \ 2227 " }, \ 2228 { "server_max_docs", "-1", \ 2229 "integer", "htdig", "Server", "3.1.0b3", "Indexing:Where", "server_max_docs: 50", " \ 2230 This attribute tells htdig to limit the dig to retrieve a maximum \ 2231 number of documents from each server. This can cause \ 2232 unusual behavior on update digs since the old URLs are \ 2233 stored alphabetically. Therefore, update digs will add \ 2234 additional URLs in pseudo-alphabetical order, up to the \ 2235 limit of the attribute. However, it is most useful to \ 2236 partially index a server as the URLs of additional \ 2237 documents are entered into the database, marked as never \ 2238 retrieved.<br> \ 2239 A value of -1 specifies no limit. \ 2240 " }, \ 2241 { "server_wait_time", "0", \ 2242 "integer", "htdig", "Server", "3.1.0b3", "Indexing:Connection", "server_wait_time: 20", " \ 2243 This attribute tells htdig to ensure a server has had a \ 2244 delay (in seconds) from the beginning of the last \ 2245 connection. This can be used to prevent \"server abuse\" \ 2246 by digging without delay. It's recommended to set this \ 2247 to 10-30 (seconds) when indexing servers that you don't \ 2248 monitor yourself. Additionally, this attribute can slow \ 2249 down local indexing if set, which may or may not be what \ 2250 you intended. \ 2251 " }, \ 2252 { "sort", "score", \ 2253 "string", "htsearch", "", "3.1.0", "Presentation:How", "sort: revtime", " \ 2254 This is the default sorting method that htsearch \ 2255 uses to determine the order in which matches are displayed. 
\ 2256 The valid choices are: \ 2257 <table border=\"0\"> \ 2258 <tr> \ 2259 <td> \ 2260 <ul> \ 2261 <li> score </li> \ 2262 <li> time </li> \ 2263 <li> title </li> \ 2264 </ul> \ 2265 </td> \ 2266 <td> \ 2267 <ul> \ 2268 <li> revscore </li> \ 2269 <li> revtime </li> \ 2270 <li> revtitle </li> \ 2271 </ul> \ 2272 </td> \ 2273 </tr> \ 2274 </table> \ 2275 This attribute will only be used if the HTML form that \ 2276 calls htsearch didn't have the <strong>sort</strong> \ 2277 value set. The words date and revdate can be used instead \ 2278 of time and revtime, as both will sort by the time that \ 2279 the document was last modified, if this information is \ 2280 given by the server. The default is to sort by the score, \ 2281 which ranks documents by best match. The sort methods that \ 2282 begin with \"rev\" simply reverse the order of the \ 2283 sort. Note that setting this to something other than \ 2284 \"score\" will incur a slowdown in searches. \ 2285 " }, \ 2286 { "sort_names", "score Score time Time title Title revscore 'Reverse Score' revtime 'Reverse Time' revtitle 'Reverse Title'", \ 2287 "quoted string list", "htsearch", "", "3.1.0", "Searching:UI", "sort_names: \ 2288 score 'Best Match' time Newest title A-Z \\<br> \ 2289 revscore 'Worst Match' revtime Oldest revtitle Z-A \ 2290 ", " \ 2291 These values are used to create the <strong> \ 2292 sort</strong> menu. It consists of pairs. The first \ 2293 element of each pair is one of the known sort methods, the \ 2294 second element is the text that will be shown in the \ 2295 menu for that sort method. This text needs to be quoted if \ 2296 it contains spaces. \ 2297 See the <a href=\"hts_selectors.html\">select list documentation</a> \ 2298 for more information on how this attribute is used. 
\ 2299 " }, \ 2300 { "soundex_db", "${database_base}.soundex.db", \ 2301 "string", "htfuzzy htsearch", "", "all", "File Layout", "soundex_db: ${database_base}.snd.db", " \ 2302 The database file used for the fuzzy \"soundex\" search \ 2303 algorithm. This database is created by \ 2304 <a href=\"htfuzzy.html\">htfuzzy</a> and used by \ 2305 <a href=\"htsearch.html\" target=\"_top\">htsearch</a>. \ 2306 " }, \ 2307 { "star_blank", "${image_url_prefix}/star_blank.gif", \ 2308 "string", "htsearch", "", "all", "Presentation:Text", "star_blank: http://www.somewhere.org/icons/noelephant.gif", " \ 2309 This specifies the URL to use to display a blank of the \ 2310 same size as the star defined in the \ 2311 <a href=\"#star_image\">star_image</a> attribute or in the \ 2312 <a href=\"#star_patterns\">star_patterns</a> attribute. \ 2313 " }, \ 2314 { "star_image", "${image_url_prefix}/star.gif", \ 2315 "string", "htsearch", "", "all", "Presentation:Text", "star_image: http://www.somewhere.org/icons/elephant.gif", " \ 2316 This specifies the URL to use to display a star. This \ 2317 allows you to use some other icon instead of a star. \ 2318 (We like the star...)<br> \ 2319 The display of stars can be turned on or off with the \ 2320 <em><a href=\"#use_star_image\">use_star_image</a></em> \ 2321 attribute and the maximum number of stars that can be \ 2322 displayed is determined by the \ 2323 <em><a href=\"#max_stars\">max_stars</a></em> attribute.<br> \ 2324 Even though the image can be changed, the ALT value \ 2325 for the image will always be a '*'. \ 2326 " }, \ 2327 { "star_patterns", "", \ 2328 "string list", "htsearch", "", "3.0", "Presentation:How", "star_patterns: \ 2329 http://www.sdsu.edu /sdsu.gif \\<br> \ 2330 http://www.ucsd.edu /ucsd.gif \ 2331 ", " \ 2332 This attribute allows the star image to be changed \ 2333 depending on the URL or the match it is used for. This \ 2334 is mainly to make a visual distinction between matches \ 2335 on different web sites. 
The star image could be \ 2336 replaced with the logo of the company the match refers \ 2337 to.<br> \ 2338 It is advisable to keep all the images the same size \ 2339 in order to line things up properly in a short result \ 2340 listing.<br> \ 2341 The format is simple. It is a list of pairs. The first \ 2342 element of each pair is a pattern, the second element \ 2343 is a URL to the image for that pattern. \ 2344 " }, \ 2345 { "startday", "", \ 2346 "integer", "htsearch", "", "3.1.6", "Searching:Method", "startday: 1", " \ 2347 Day component of first date allowed as last-modified date \ 2348 of returned documents. \ 2349 This is most usefully specified as a \ 2350 <a href=\"hts_form.html#startyear\">CGI argument</a>. \ 2351 See also <a href=\"#startyear\">startyear</a>. \ 2352 " }, \ 2353 { "start_ellipses", "<strong><code>... </code></strong>", \ 2354 "string", "htsearch", "", "all", "Presentation:Text", "start_ellipses: ...", " \ 2355 When excerpts are displayed in the search output, this \ 2356 string will be prepended to the excerpt if there is \ 2357 text before the text displayed. This is just a visual \ 2358 reminder to the user that the excerpt is only part of \ 2359 the complete document. \ 2360 " }, \ 2361 { "start_highlight", "<strong>", \ 2362 "string", "htsearch", "", "3.1.4", "Presentation:Text", "start_highlight: <font color=\"#FF0000\">", " \ 2363 When excerpts are displayed in the search output, matched \ 2364 words will be highlighted using this string and \ 2365 <a href=\"#end_highlight\"> end_highlight</a>. \ 2366 You should ensure that highlighting tags are balanced, \ 2367 that is, any formatting tags that this string \ 2368 opens should be closed by end_highlight. \ 2369 " }, \ 2370 { "startmonth", "", \ 2371 "integer", "htsearch", "", "3.1.6", "Searching:Method", "startmonth: 1", " \ 2372 Month component of first date allowed as last-modified date \ 2373 of returned documents. 
\ 2374 This is most usefully specified as a \ 2375 <a href=\"hts_form.html#startyear\">CGI argument</a>. \ 2376 See also <a href=\"#startyear\">startyear</a>. \ 2377 " }, \ 2378 { "start_url", "http://www.htdig.org/", \ 2379 "string list", "htdig", "", "all", "Indexing:Where", "start_url: http://www.somewhere.org/alldata/index.html", " \ 2380 This is the list of URLs that will be used to start a \ 2381 dig when there was no existing database. Note that \ 2382 multiple URLs can be given here. \ 2383 <br>Note also that the value of <em>start_url</em> \ 2384 will be the default value for \ 2385 <a href=\"#limit_urls_to\">limit_urls_to</a>, so if \ 2386 you set start_url to the URLs for specific files, \ 2387 rather than a site or subdirectory URL, you may need \ 2388 to set limit_urls_to to something less restrictive \ 2389 so htdig doesn't reject links in the documents. \ 2390 " }, \ 2391 { "startyear", "", \ 2392 "integer", "htsearch", "", "3.1.6", "Searching:Method", "startyear: 2001", " \ 2393 This specifies the year of the cutoff start date for \ 2394 search results. If the start or end date are specified, \ 2395 only results with a last modified date within this \ 2396 range are shown. If a start or end date is specified, but startyear \ 2397 is not, then it defaults to 1970. \ 2398 See also <a href=\"#startday\">startday</a>, \ 2399 <a href=\"#startmonth\">startmonth</a>, \ 2400 <a href=\"#endday\">endday</a>, \ 2401 <a href=\"#endmonth\">endmonth</a>, \ 2402 <a href=\"#endyear\">endyear</a>. \ 2403 These are most usefully specified as a \ 2404 <a href=\"hts_form.html#startyear\">CGI argument</a>.<br> \ 2405 For each component, if a negative number is given, \ 2406 it is taken as relative to the current date. \ 2407 Relative days can span several months or even years if desired, \ 2408 and relative months can span several years. A startday of \ 2409 -90 will select matching documents modified within \ 2410 the last 90 days. 
\ 2411 " }, \ 2412 { "store_phrases", "true", \ 2413 "boolean", "htdig", "", "3.2.0b5", "Indexing:How", "store_phrases: false", " \ 2414 Causes htdig to record all occurrences of each word in a document, \ 2415 to allow accurate phrase searches. If this is false, only the first \ 2416 occurrence of each word will be stored, causing many phrases to be \ 2417 missed. Setting this false increases indexing speed by about 20%, \ 2418 and reduces disk requirements by about 60%.\ 2419 " }, \ 2420 { "substring_max_words", "25", \ 2421 "integer", "htsearch", "", "3.0.8b1", "Searching:Method", "substring_max_words: 100", " \ 2422 The Substring <a href=\"#search_algorithm\">fuzzy algorithm</a> \ 2423 could potentially match a \ 2424 very large number of words. This value limits the \ 2425 number of words each substring pattern can match. Note \ 2426 that this does not limit the number of documents that \ 2427 are matched in any way. \ 2428 " }, \ 2429 { "synonym_db", "${common_dir}/synonyms.db", \ 2430 "string", "htsearch htfuzzy", "", "3.0", "File Layout", "synonym_db: ${database_base}.syn.db", " \ 2431 Points to the database that <a href=\"htfuzzy.html\"> \ 2432 htfuzzy</a> creates when the <strong>synonyms</strong> \ 2433 algorithm is used.<br> \ 2434 <a href=\"htsearch.html\" target=\"_top\">htsearch</a> \ 2435 uses this to perform synonym dictionary lookups. \ 2436 " }, \ 2437 { "synonym_dictionary", "${common_dir}/synonyms", \ 2438 "string", "htfuzzy", "", "3.0", "File Layout", "synonym_dictionary: /usr/dict/synonyms", " \ 2439 This points to a text file containing the synonym \ 2440 dictionary used for the synonyms search algorithm.<br> \ 2441 Each line of this file has at least two words. The \ 2442 first word is the word to replace, the rest of the \ 2443 words are synonyms for that word. 
\ 2444 " }, \ 2445 { "syntax_error_file", "${common_dir}/syntax.html", \ 2446 "string", "htsearch", "", "all", "Presentation:Files", "syntax_error_file: ${common_dir}/synerror.html", " \ 2447 This points to the file which will be displayed if a \ 2448 boolean expression syntax error was found. \ 2449 " }, \ 2450 { "tcp_max_retries", "1", \ 2451 "integer", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "tcp_max_retries: 6", " \ 2452 This option sets the maximum number of attempts when a connection \ 2453 <A href=\"#timeout\">timeout</A>s. \ 2454 After all these retries, the connection attempt results in &lt;timed out&gt;. \ 2455 " }, \ 2456 { "tcp_wait_time", "5", \ 2457 "integer", "htdig", "Server", "3.2.0b1", "Indexing:Connection", "tcp_wait_time: 10", " \ 2458 This attribute sets the wait time (in seconds) after a connection \ 2459 fails and the <A href=\"#timeout\">timeout</A> is raised. \ 2460 " }, \ 2461 { "template_map", "Long builtin-long builtin-long Short builtin-short builtin-short", \ 2462 "quoted string list", "htsearch", "", "3.0", "Presentation:Files,Searching:UI", "template_map: \ 2463 Short short ${common_dir}/short.html \\<br> \ 2464 Normal normal builtin-long \\<br> \ 2465 Detailed detail ${common_dir}/detail.html \ 2466 ", " \ 2467 This maps match template names to internal names and \ 2468 template file names. It is a list of triplets. The \ 2469 first element in each triplet is the name that will be \ 2470 displayed in the FORMAT menu. The second element is the \ 2471 name used internally and the third element is a \ 2472 filename of the template to use.<br> \ 2473 There are two predefined templates, namely <strong> \ 2474 builtin-long</strong> and <strong> \ 2475 builtin-short</strong>. If the filename is one of \ 2476 those, they will be used instead.<br> \ 2477 More information about templates can be found in the \ 2478 <a href=\"htsearch.html\" target=\"_top\">htsearch</a> \ 2479 documentation. 
The particular template is selected by the \ 2480 <a href=\"hts_form.html#format\">format</a> CGI argument, and the \ 2481 default is given by <a href=\"#template_name\">template_name</a> in \ 2482 the config file. \ 2483 " }, \ 2484 { "template_name", "builtin-long", \ 2485 "string", "htsearch", "", "3.0", "Searching:UI,Presentation:How", "template_name: long", " \ 2486 Specifies the default template if no \ 2487 <a href=\"hts_form.html#format\">format</a> field is given by the \ 2488 search form. This needs to map to the \ 2489 <a href=\"#template_map\">template_map</a>. \ 2490 " }, \ 2491 { "template_patterns", "", \ 2492 "string list", "htsearch", "", "3.1.4", "Presentation:How", "template_patterns: \ 2493 http://www.sdsu.edu ${common_dir}/sdsu.html \\<br> \ 2494 http://www.ucsd.edu ${common_dir}/ucsd.html \ 2495 ", " \ 2496 This attribute allows the results template to be changed \ 2497 depending on the URL or the match it is used for. This \ 2498 is mainly to make a visual distinction between matches \ 2499 on different web sites. The results for each site could \ 2500 thus be shown in a style matching that site.<br> \ 2501 The format is simply a list of pairs. The first \ 2502 element of each pair is a pattern, the second element \ 2503 is the name of the template file for that pattern.<br> \ 2504 More information about templates can be found in the \ 2505 <a href=\"htsearch.html\" target=\"_top\">htsearch</a> \ 2506 documentation.<br> \ 2507 Normally, when using this template selection method, you \ 2508 would disable user selection of templates via the <strong>format</strong> \ 2509 input parameter in search forms, as the two methods were not \ 2510 really designed to interact. Templates selected by URL patterns \ 2511 would override any user selection made in the form. If you want \ 2512 to use the two methods together, see the notes on \ 2513 <a href=\"hts_selectors.html#template_patterns\">combining</a> \ 2514 them for an example of how to do this. 
\ 2515 " }, \ 2516 { "text_factor", "1", \ 2517 "number", "htsearch", "", "3.0", "Searching:Ranking", "text_factor: 0", " \ 2518 This is a factor which will be used to multiply the \ 2519 weight of words that are not in any special part of a \ 2520 document. Setting a factor to 0 will cause normal words \ 2521 to be ignored. The number may be a floating point \ 2522 number. See also the <a href=\"#heading_factor\"> heading_factor</a> \ 2523 attribute. \ 2524 " }, \ 2525 { "timeout", "30", \ 2526 "integer", "htdig", "Server", "all", "Indexing:Connection", "timeout: 42", " \ 2527 Specifies the time the digger will wait to complete a \ 2528 network read. This is just a safeguard against \ 2529 unforeseen things like the all too common \ 2530 transformation from a network to a notwork.<br> \ 2531 The timeout is specified in seconds. \ 2532 " }, \ 2533 { "title_factor", "100", \ 2534 "number", "htsearch", "", "all", "Searching:Ranking", "title_factor: 12", " \ 2535 This is a factor which will be used to multiply the \ 2536 weight of words in the title of a document. Setting a \ 2537 factor to 0 will cause words in the title to be \ 2538 ignored. The number may be a floating point number. See \ 2539 also the <a href=\"#heading_factor\"> \ 2540 heading_factor</a> attribute. \ 2541 " }, \ 2542 { "translate_latin1", "true", \ 2543 "boolean", "htdig htsearch", "", "3.2.0b5", "Indexing:What", "translate_latin1: false", " \ 2544 If set to false, the SGML entities for ISO-8859-1 (or \ 2545 Latin 1) characters above &amp;nbsp; (or &amp;#160;) \ 2546 will not be translated into their 8-bit equivalents. \ 2547 This attribute should be set to false when using a \ 2548 <a href=\"#locale\">locale</a> that doesn't use the \ 2549 ISO-8859-1 character set, to avoid these entities \ 2550 being mapped to inappropriate 8-bit characters, or \ 2551 perhaps more importantly to avoid 8-bit characters from \ 2552 your locale being mapped back to Latin 1 SGML entities \ 2553 in search results. 
\ 2554 " }, \ 2555 { "url_list", "${database_base}.urls", \ 2556 "string", "htdig", "", "all", "Extra Output", "url_list: /tmp/urls", " \ 2557 This file is only created if \ 2558 <em><a href=\"#create_url_list\">create_url_list</a></em> is set to \ 2559 true. It will contain a list of all URLs that were \ 2560 seen. \ 2561 " }, \ 2562 { "url_log", "${database_base}.log", \ 2563 "string", "htdig", "", "3.1.0", "Extra Output", "url_log: /tmp/htdig.progress", " \ 2564 If <a href=\"htdig.html\">htdig</a> is \ 2565 interrupted, it will write out its progress to this \ 2566 file. Note that if it has a large number of URLs to write, \ 2567 it may take some time to exit. This can especially happen \ 2568 when running update digs and the run is interrupted soon \ 2569 after beginning. \ 2570 " }, \ 2571 { "url_part_aliases", "", \ 2572 "string list", "all", "", "3.1.0", "URLs", "url_part_aliases: \ 2573 http://search.example.com/~htdig *site \\<br> \ 2574 http://www.htdig.org/this/ *1 \\<br> \ 2575 .html *2 \ 2576 url_part_aliases: \ 2577 http://www.htdig.org/ *site \\<br> \ 2578 http://www.htdig.org/that/ *1 \\<br> \ 2579 .htm *2 \ 2580 ", " \ 2581 A list of translations pairs <em>from</em> and \ 2582 <em>to</em>, used when accessing the database. \ 2583 If a part of an URL matches with the \ 2584 <em>from</em>-string of each pair, it will be \ 2585 translated into the <em>to</em>-string just before \ 2586 writing the URL to the database, and translated \ 2587 back just after reading it from the database.<br> \ 2588 This is primarily used to provide an easy way to \ 2589 rename parts of URLs for e.g. changing \ 2590 www.example.com/~htdig to www.htdig.org. 
Two \ 2591 different configuration files for digging and \ 2592 searching are then used, with url_part_aliases \ 2593 having different <em>from</em> strings, but \ 2594 identical <em>to</em>-strings.<br> \ 2595 See also <a \ 2596 href=\"#common_url_parts\">common_url_parts</a>.<br> \ 2597 Strings that are normally incorrect in URLs or \ 2598 very seldom used, should be used as \ 2599 <em>to</em>-strings, since extra storage will be \ 2600 used each time one is found as normal part of a \ 2601 URL. Translations will be performed with priority \ 2602 for the leftmost longest match. Each \ 2603 <em>to</em>-string must be unique and not be a \ 2604 part of any other <em>to</em>-string. It also helps \ 2605 to keep the <em>to</em>-strings short to save space \ 2606 in the database. Other than that, the choice of \ 2607 <em>to</em>-strings is pretty arbitrary, as they \ 2608 just provide a temporary, internal encoding in the \ 2609 databases, and none of the characters in these \ 2610 strings have any special meaning.<br> \ 2611 Note that when this attribute is changed, the \ 2612 database should be rebuilt, unless the effect of \ 2613 \"moving\" the affected URLs in the database is \ 2614 wanted, as described above.<br> \ 2615 <strong>Please note:</strong> Don't just copy the \ 2616 example below into a single configuration file. \ 2617 There are two separate settings of \ 2618 <em>url_part_aliases</em> below; the first one is \ 2619 for the configuration file to be used by htdig, \ 2620 htmerge, and htnotify, and the second one is for the \ 2621 configuration file to be used by htsearch. 
\ 2622 In this example, htdig will encode the URL \ 2623 \"http://search.example.com/~htdig/contrib/stuff.html\" \ 2624 as \"*sitecontrib/stuff*2\" in the databases, and \ 2625 htsearch will decode it as \ 2626 \"http://www.htdig.org/contrib/stuff.htm\".<br> \ 2627 As of version 3.1.6, you can also do more complex \ 2628 rewriting of URLs using \ 2629 <a href=\"#url_rewrite_rules\">url_rewrite_rules</a> and \ 2630 <a href=\"#search_rewrite_rules\">search_rewrite_rules</a>. \ 2631 " }, \ 2632 { "url_rewrite_rules", "", \ 2633 "string list", "htdig", "", "3.2.0b3", "URLs", "url_rewrite_rules: (.*)\\\\?JServSessionIdroot=.* \\\\1 \\<br> \ 2634 (.*)\\\\&JServSessionIdroot=.* \\\\1 \\<br> \ 2635 (.*)&context=.* \\\\1<br>", " \ 2636 This is a list of pairs, <em>regex</em> <em>replacement</em> used to \ 2637 permanently rewrite URLs as they are indexed. The left hand string is \ 2638 a regular expression; the right hand string is a literal string with \ 2639 embedded placeholders for fragments that matched inside brackets in \ 2640 the regex. \\0 is the whole matched string, \\1 to \\9 are bracketted \ 2641 substrings. Note that the <strong>entire</strong> URL is replaced by \ 2642 the right hand string (not just the portion which matches the left hand\ 2643 string). Thus, a leading and trailing (.*) should be included in the \ 2644 pattern, with matching placeholders in the replacement string.<br> \ 2645 Rewrite rules are applied sequentially to each \ 2646 incoming URL before normalization occurs. Rewriting does not stop \ 2647 once a match has been made, so multiple rules may affect a given URL. \ 2648 See also <a href=\"#url_part_aliases\">url_part_aliases</a> which \ 2649 allows URLs to be of one \ 2650 form during indexing and translated for results. 
\ 2651 "}, \ 2652 { "url_seed_score", "", \ 2653 "string list", "htsearch", "", "3.2.0b2", "Searching:Ranking", "url_seed_score: \ 2654 /mailinglist/ *.5-1e6 <br> \ 2655 /docs/|/news/ *1.5 <br> \ 2656 /testresults/ \"*.7 -200\" <br> \ 2657 /faq-area/ *2+10000", " \ 2658 This is a list of pairs, <em>pattern</em> \ 2659 <em>formula</em>, used to weigh the score of \ 2660 hits, depending on the URL of the document.<br> \ 2661 The <em>pattern</em> part is a substring to match \ 2662 against the URL. Pipe ('|') characters can be \ 2663 used in the pattern to concatenate substrings for \ 2664 web-areas that have the same formula.<br> \ 2665 The formula describes a <em>factor</em> and a \ 2666 <em>constant</em>, by which the hit score is \ 2667 weighed. The <em>factor</em> part is multiplied \ 2668 to the original score, then the <em>constant</em> \ 2669 part is added.<br> \ 2670 The format of the formula is the factor part: \ 2671 \"*<em>N</em>\" optionally followed by comma and \ 2672 spaces, followed by the constant part : \ 2673 \"+<em>M</em>\", where the plus sign may be omitted \ 2674 for negative numbers. Either part is optional, \ 2675 but must come in this order.<br> \ 2676 The numbers <em>N</em> and <em>M</em> are floating \ 2677 point constants.<br> \ 2678 More straightforward is to think of the format as \ 2679 \"newscore = oldscore*<em>N</em>+<em>M</em>\", \ 2680 but with the \"newscore = oldscore\" part left out. \ 2681 " }, \ 2682 { "url_text_factor", "1", \ 2683 "number", "htsearch", "", "??", "Searching:Ranking", "url_text_factor: 1", " \ 2684 TO BE COMPLETED<br> \ 2685 See also <a href=\"#heading_factor\">heading_factor</a>. \ 2686 " }, \ 2687 { "use_doc_date", "false", \ 2688 "boolean", "htdig", "", "3.2.0b1", "Indexing:How", "use_doc_date: true", " \ 2689 If set to true, htdig will use META date tags in documents, \ 2690 overriding the modification date returned by the server. 
\ 2691 Any documents that do not have META date tags will retain \ 2692 the last modified date returned by the server or found on \ 2693 the local file system. \ 2694 As of version 3.1.6, in addition to META date tags, htdig will also \ 2695 recognize dc.date, dc.date.created and dc.date.modified. \ 2696 " }, \ 2697 { "use_meta_description", "false", \ 2698 "boolean", "htsearch", "", "3.1.0b1", "Presentation:How", "use_meta_description: true", " \ 2699 If set to true, any META description tags will be used as \ 2700 excerpts by htsearch. Any documents that do not have META \ 2701 descriptions will retain their normal excerpts. \ 2702 " }, \ 2703 { "use_star_image", "true", \ 2704 "boolean", "htsearch", "", "all", "Presentation:How", "use_star_image: no", " \ 2705 If set to true, the <em><a href=\"#star_image\"> \ 2706 star_image</a></em> attribute is used to display upto \ 2707 <em><a href=\"#max_stars\">max_stars</a></em> images for \ 2708 each match. \ 2709 " }, \ 2710 { "user_agent", "htdig", \ 2711 "string", "htdig", "Server", "3.1.0b2", "Indexing:Out", "user_agent: htdig-digger", " \ 2712 This allows customization of the user_agent: field sent when \ 2713 the digger requests a file from a server. \ 2714 " }, \ 2715 { "valid_extensions", "", \ 2716 "string list", "htdig", "URL", "3.1.4", "Indexing:Where", "valid_extensions: .html .htm .shtml", " \ 2717 This is a list of extensions on URLs which are \ 2718 the only ones considered acceptable. This list is used to \ 2719 supplement the MIME-types that the HTTP server provides \ 2720 with documents. Some HTTP servers do not have a correct \ 2721 list of MIME-types and so can advertise certain \ 2722 documents as text while they are some binary format. \ 2723 If the list is empty, then all extensions are acceptable, \ 2724 provided they pass other criteria for acceptance or rejection. \ 2725 If the list is not empty, only documents with one of the \ 2726 extensions in the list are parsed. 
\ 2727 See also <a href=\"#bad_extensions\">bad_extensions</a>. \ 2728 " }, \ 2729 { "valid_punctuation", ".-_/!#\\$%^&'", \ 2730 "string", "htdig htsearch", "", "all", "Indexing:What", "valid_punctuation: -'", " \ 2731 This is the set of characters which may be deleted \ 2732 from the document before determining what a word is. \ 2733 This means that if a document contains something like \ 2734 <code>half-hearted</code> the digger will see this as the three \ 2735 words <code> half</code>, <code>hearted</code> and \ 2736 <code>halfhearted</code>.<br> \ 2737 These characters are also removed before keywords are passed to the \ 2738 search engine, so a search for \"half-hearted\" works as expected.<br> \ 2739 Note that the dollar sign ($) and backslash (\\) must be escaped by a \ 2740 backslash in both valid_punctuation and extra_word_characters. \ 2741 Moreover, the backslash should not be the last character on the line. \ 2742 There is currently no way to include a back-quote (`) in \ 2743 extra_word_characters or valid_punctuation.<br> \ 2744 See also the \ 2745 <a href=\"#extra_word_characters\">extra_word_characters</a> \ 2746 and <a href=\"#allow_numbers\">allow_numbers</a> \ 2747 attributes. \ 2748 " }, \ 2749 { "version", VERSION, \ 2750 "string", "htsearch", "", "all", "Presentation:Text", "version: 3.2.0", " \ 2751 This specifies the value of the VERSION \ 2752 variable which can be used in search templates. \ 2753 The default value of this attribute is determined \ 2754 at compile time, and will not normally be set \ 2755 in configuration files. \ 2756 " }, \ 2757 { "word_db", "${database_base}.words.db", \ 2758 "string", "all", "", "all", "File Layout", "word_db: ${database_base}.allwords.db", " \ 2759 This is the main word database. It is an index of all \ 2760 the words to a list of documents that contain the \ 2761 words. This database can grow large pretty quickly. 
\ 2762 " }, \ 2763 { "word_dump", "${database_base}.worddump", \ 2764 "string", "htdig htdump htload", "", "3.2.0b1", "File Layout", "word_dump: /tmp/words.txt", " \ 2765 This file is basically a text version of the file \ 2766 specified in <em><a href=\"#word_db\">word_db</a></em>. Its \ 2767 only use is to have a human readable database of all \ 2768 words. The file is easy to parse with tools like \ 2769 perl or tcl. \ 2770 " }, \ 2771 { "wordlist_cache_inserts", "false", \ 2772 "boolean", "???", "", "???", "Indexing:How", "wordlist_cache_inserts: true", " \ 2773 If true, create a cache of size wordlist_cache_size/2 for class \ 2774 WordListOne. <em>I don't know what this is for. Does anyone?</em> \ 2775 " }, \ 2776 { "wordlist_cache_size", "10000000", \ 2777 "integer", "all", "", "3.2.0b1", "Indexing:How", "wordlist_cache_size: 40000000", " \ 2778 Size (in bytes) of memory cache used by Berkeley DB (DB used by the indexer) \ 2779 IMPORTANT: It makes a <strong>huge</strong> difference. The rule \ 2780 is that the cache size should be at least 2% of the expected index size. The \ 2781 Berkeley DB file has 1% of internal pages that <em>must</em> be cached for good \ 2782 performances. Giving an additional 1% leaves room for caching leaf pages. \ 2783 " }, \ 2784 { "wordlist_compress", "true", \ 2785 "boolean", "all", "", "3.2.0b1", "Indexing:How", "wordlist_compress: false", " \ 2786 Enables or disables the default compression system for the indexer. \ 2787 This currently attempts to compress the index by a factor of 8. If the \ 2788 Zlib library is not found on the system, the default is false. \ 2789 " }, \ 2790 { "wordlist_compress_zlib", "true", \ 2791 "boolean", "all", "", "3.2.0b4", "Indexing:How", "wordlist_compress_zlib: false", " \ 2792 Enables or disables the zlib compression system for the indexer. 
\ 2793 Both <a href=\"#wordlist_compress\">wordlist_compress</a> and \ 2794 <a href=\"#compression_level\">compression_level</a> must be true \ 2795 (non-zero) to use this option!\ 2796 " }, \ 2797 { "wordlist_monitor", "false", \ 2798 "boolean", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor: true", " \ 2799 This enables monitoring of what's happening in the indexer. \ 2800 It can help to detect performance/configuration problems. \ 2801 " }, \ 2802 { "wordlist_monitor_period","0", \ 2803 "number", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor_period: .1", " \ 2804 Sets the number of seconds between each monitor output. \ 2805 " }, \ 2806 { "wordlist_monitor_output","", \ 2807 "string", "all", "", "3.2.0b1", "Extra Output", "wordlist_monitor_output: myfile", " \ 2808 Print monitoring output on file instead of the default stderr. \ 2809 " }, \ 2810 { "wordlist_page_size", "0", \ 2811 "integer", "all", "", "3.2.0b1", "Indexing:How", "wordlist_page_size: 8192", " \ 2812 Size (in bytes) of pages used by Berkeley DB (DB used by the indexer). \ 2813 Must be a power of two. \ 2814 " }, \ 2815 /* wordlist_verbose: integer verbosity level; the description enumerates levels 1 to 3 and the example uses an integer to match the declared type. */ { "wordlist_verbose", "", \ 2816 "integer", "", "", "", "", "wordlist_verbose: 1", " \ 2817 wordlist_verbose 1 walk logic<br> \ 2818 wordlist_verbose 2 walk logic details<br> \ 2819 wordlist_verbose 3 walk logic lots of details<br> \ 2820 " }, \ 2821 { "wordlist_wordkey_description", "Word/DocID 32/Flags 8/Location 16", \ 2822 "string", "all", "", "3.2.0b1", "Indexing:How", "**this should not be configured by user**", " \ 2823 Internal key description: *not user configurable* \ 2824 " }, \ 2825 { "wordlist_wordrecord_description", "DATA", \ 2826 "string", "all", "", "3.2.0b1", "Indexing:How", "**this should not be configured by user**", " \ 2827 Internal data description: *not user configurable* \ 2828 " }, \ 2829 /* All-zero entry closing the table; presumably the end marker that readers of defaults[] scan for -- confirm against the ConfigDefaults consumers. */ {0, 0, 0, 0, 0, 0, 0, 0, 0} 2830 }; 2831 2832 /* Global HtConfiguration object named config; how it gets populated is not visible in this file section. */ HtConfiguration config; 2833