//----------------------------------------------------------------
//
// libhtdig_api.h
//
// Header function for htdig shared library API
//
// 1/25/2002 created
//
// Neal Richter nealr@rightnow.com
//
// Part of the ht://Dig package   <http://www.htdig.org/>
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later or later
// <http://www.gnu.org/copyleft/lgpl.html>
//
// $Id: libhtdig_api.h,v 1.4 2004/05/28 13:15:29 lha Exp $
//
//----------------------------------------------------------------

#ifndef LIBHTDIG_API_H
#define LIBHTDIG_API_H

#include <time.h>

#ifndef TRUE
#define TRUE 1
#endif

#ifndef FALSE
#define FALSE 0
#endif


//=============================================================================
//===== BUFFER SIZES ==========================================================
//
// Sizes (in bytes, terminator included) of the fixed char arrays used by the
// structs declared below.

#define HTDIG_MAX_FILENAME_PATH_L        1024
#define HTDIG_DOCUMENT_ID_L              32
#define HTDIG_DOCUMENT_TITLE_L           256
#define HTDIG_DOCUMENT_META_L            4096
#define HTDIG_DOCUMENT_CONTENT_TYPE_L    32
#define HTDIG_DOCUMENT_EXCERPT_L         1024
//make sure HTDIG_DOCUMENT_EXCERPT_L is more than config 'excerpt_length'

//default failsafe size of 'excerpt' document
//make sure it's more than config 'max_head_length'
#define HTDIG_DEFAULT_EXCERPT_SIZE       524288

//should be the same as the default value in HTDIG
#define HTDIG_MAX_QUERY_L                256


#define HTDIG_CUSTOM_TEXT_MIME_TYPE      "text/vnd.customdocument"


//=============================================================================
//===== FLAG VALUES AND THEIR STRING NAMES ====================================
//
// Each numeric flag is paired with a *_STR macro holding its textual name.

//htfuzzy (bitwise flags for htfuzzy_parameters_struct.algorithms_flag)
#define HTDIG_ALG_ACCENTS                0x00000100    //"accents"
#define HTDIG_ALG_ACCENTS_STR            "accents"

#define HTDIG_ALG_ENDINGS                0x00001000    //"endings"
#define HTDIG_ALG_ENDINGS_STR            "endings"

#define HTDIG_ALG_METAPHONE              0x00000010    //"metaphone"
#define HTDIG_ALG_METAPHONE_STR          "metaphone"

#define HTDIG_ALG_SOUNDEX                0x00000001    //"soundex"
#define HTDIG_ALG_SOUNDEX_STR            "soundex"

#define HTDIG_ALG_SYNONYMS               0x00010000    //"synonyms"
#define HTDIG_ALG_SYNONYMS_STR           "synonyms"


//searching (values for htsearch_query_struct.algorithms_flag)
#define HTSEARCH_ALG_AND                 0x00000100    //"and"
#define HTSEARCH_ALG_AND_STR             "and"

#define HTSEARCH_ALG_BOOLEAN             0x00000001    //"boolean"
#define HTSEARCH_ALG_BOOLEAN_STR         "boolean"

#define HTSEARCH_ALG_OR                  0x00000010    //"or"
#define HTSEARCH_ALG_OR_STR              "or"


//result format (values for htsearch_query_struct.format)
#define HTSEARCH_FORMAT_LONG             0x00000001    //"long"
#define HTSEARCH_FORMAT_LONG_STR         "long"

#define HTSEARCH_FORMAT_SHORT            0x00000010    //"short"
#define HTSEARCH_FORMAT_SHORT_STR        "short"


//result ordering (values for htsearch_query_struct.sortby_flag)
#define HTSEARCH_SORT_SCORE              0x00000001    //"score"
#define HTSEARCH_SORT_SCORE_STR          "score"

#define HTSEARCH_SORT_REV_SCORE          0x00000010    //"reverse score"
#define HTSEARCH_SORT_REV_SCORE_STR      "reverse score"

#define HTSEARCH_SORT_TIME               0x00000100    //"time"
#define HTSEARCH_SORT_TIME_STR           "time"

#define HTSEARCH_SORT_REV_TIME           0x00001000    //"reverse time"
#define HTSEARCH_SORT_REV_TIME_STR       "reverse time"

#define HTSEARCH_SORT_TITLE              0x00010000    //"title"
#define HTSEARCH_SORT_TITLE_STR          "title"

#define HTSEARCH_SORT_REV_TITLE          0x00100000    //"reverse title"
#define HTSEARCH_SORT_REV_TITLE_STR      "reverse title"


//=============================================================================
//===== ERROR CODES ===========================================================
//
// Negative values are parenthesized so that each macro always expands as a
// single operand, whatever expression it is used in.

//htdig (indexing): -1xx
#define HTDIG_ERROR_CONFIG_READ              (-101)
#define HTDIG_ERROR_URL_PART                 (-102)
#define HTDIG_ERROR_URL_REWRITE              (-103)
#define HTDIG_ERROR_URL_CREATE_FILE          (-104)
#define HTDIG_ERROR_IMAGE_CREATE_FILE        (-105)
#define HTDIG_ERROR_OPEN_CREATE_DOCDB        (-106)
#define HTDIG_ERROR_LOGFILE_OPEN             (-107)
#define HTDIG_ERROR_LOGFILE_CLOSE            (-108)

#define HTDIG_ERROR_TESTURL_EXCLUDE          (-109)
#define HTDIG_ERROR_TESTURL_BADQUERY         (-110)
#define HTDIG_ERROR_TESTURL_EXTENSION        (-111)
#define HTDIG_ERROR_TESTURL_EXTENSION2       (-112)
#define HTDIG_ERROR_TESTURL_LIMITS           (-113)
#define HTDIG_ERROR_TESTURL_LIMITSNORM       (-114)
#define HTDIG_ERROR_TESTURL_SRCH_RESTRICT    (-115)
#define HTDIG_ERROR_TESTURL_SRCH_EXCLUDE     (-116)
#define HTDIG_ERROR_TESTURL_REWRITE_EMPTY    (-117)
#define HTDIG_ERROR_TESTURL_ROBOT_FORBID     (-118)

//htsearch: -2xx
#define HTSEARCH_ERROR_NO_MATCH              (-201)
#define HTSEARCH_ERROR_BAD_MATCH_INDEX       (-202)
#define HTSEARCH_ERROR_BAD_DOCUMENT          (-203)
#define HTSEARCH_ERROR_TEMPLATE_ERROR        (-204)
#define HTSEARCH_ERROR_LOGFILE_OPEN          (-205)
#define HTSEARCH_ERROR_LOGFILE_CLOSE         (-206)
#define HTSEARCH_ERROR_CONFIG_READ           (-207)
#define HTSEARCH_ERROR_URL_PART              (-208)
#define HTSEARCH_ERROR_WORDDB_READ           (-209)
#define HTSEARCH_ERROR_DOCINDEX_READ         (-210)
#define HTSEARCH_ERROR_DOCDB_READ            (-211)
#define HTSEARCH_ERROR_EXCERPTDB_READ        (-212)

//htmerge: -3xx
#define HTMERGE_ERROR_LOGFILE_OPEN           (-301)
#define HTMERGE_ERROR_LOGFILE_CLOSE          (-302)
#define HTMERGE_ERROR_CONFIG_READ            (-303)
#define HTMERGE_ERROR_URL_PART               (-304)
#define HTMERGE_ERROR_WORDDB_READ            (-305)
#define HTMERGE_ERROR_DOCINDEX_READ          (-306)
#define HTMERGE_ERROR_DOCDB_READ             (-307)
#define HTMERGE_ERROR_EXCERPTDB_READ         (-308)


//=============================================================================
//===== PARAMETER NAME STRINGS ================================================

#define PHP_HTDIG_CONFIGFILE_PARM        "configFile"
#define PHP_HTDIG_URL_PARM               "URL"
#define PHP_HTDIG_LIMITTO_PARM           "limit_urls_to"
#define PHP_HTDIG_LIMITN_PARM            "limit_normalized"
#define PHP_HTDIG_EXCLUDEURLS_PARM       "exclude_urls"
#define PHP_HTDIG_SEARCHRESTRICT_PARM    "search_restrict"
#define PHP_HTDIG_SEARCHEXCLUDE_PARM     "search_exclude"
// NOTE(review): the value below is spelled "max_hop_cont" while the matching
// struct field is max_hop_count. It is a runtime key, so it is deliberately
// left untouched here -- confirm against the consumers before renaming.
#define PHP_HTDIG_MAXHOPCOUNT_PARM       "max_hop_cont"
#define PHP_HTDIG_URLREWRITE_PARM        "url_rewrite_rules"
#define PHP_HTDIG_BAD_QUERYSTR_PARM      "bad_querystr"

//=============================================================================
//===== HTDIG INDEXING API ====================================================


/***************************************************
 * HTDIG_DOCUMENTATION for htdig_parameters_struct
 *
 *   DEBUGGING PARAMETERS
 *
 *   int debug
 *       Verbose mode.  This increases the verbosity of the
 *       program.  Using more than 2 is probably only useful
 *       for debugging purposes.  The default verbose mode
 *       gives a nice progress report while digging.
 *
 *   char logFile
 *       File to stream debugging & error messages to!
 *
 *   BOOLEAN PARAMETERS
 *
 *   int initial
 *       Initial.  Do not use any old databases.  This is
 *       accomplished by first erasing the databases
 *
 *   int create_text_database
 *       Create an ASCII version of the document database.
 *       This database is easy to parse with other programs so
 *       that information can be extracted from it.
 *
 *   int report_statistics
 *       Report statistics after completion.
 *
 *   int alt_work_area
 *       Use alternate work files.
 *       Tells htdig to append .work to database files, causing
 *       a second copy of the database to be built.  This allows
 *       the original files to be used by htsearch during the
 *       indexing run.
 *
 *
 *   STRING PARAMETERS
 *
 *   char configFile
 *       configfile
 *       Use the specified configuration file instead of the
 *       default.
 *
 *   char credentials
 *       username:password
 *       Tells htdig to send the supplied username and
 *       password with each HTTP request.  The credentials
 *       will be encoded using the 'Basic' authentication scheme.
 *       There *HAS* to be a colon (:) between the username
 *       and password.
 *
 *
 *   char max_hops    //9 digit limit
 *       hopcount
 *       Limit the stored documents to those which are at
 *       most hopcount links away from the start URL.
 *
 *   char minimalFile
 *
 *   char URL
 *       'command-line' URLs from stdin
 *       fetches & indexes these URLs
 *
 ******************************************************************/

typedef struct htdig_parameters_struct {

  char configFile[HTDIG_MAX_FILENAME_PATH_L];
  char DBpath[HTDIG_MAX_FILENAME_PATH_L];
  char credentials[HTDIG_MAX_FILENAME_PATH_L];
  char max_hops[10];    //9 digit limit
  char minimalFile[HTDIG_MAX_FILENAME_PATH_L];

  //debugging & logfile
  char logFile[HTDIG_MAX_FILENAME_PATH_L];    //location of log file
  int debug;                                  //0, 1, 2, 3, 4, 5

  //boolean values
  int initial;
  int create_text_database;
  int report_statistics;
  int alt_work_area;
  int use_cookies;

  //spidering filters
  char URL[HTDIG_MAX_FILENAME_PATH_L];
  char limit_urls_to[HTDIG_MAX_FILENAME_PATH_L];
  char limit_normalized[HTDIG_MAX_FILENAME_PATH_L];
  char exclude_urls[HTDIG_MAX_FILENAME_PATH_L];
  char search_restrict[HTDIG_MAX_FILENAME_PATH_L];
  char search_exclude[HTDIG_MAX_FILENAME_PATH_L];
  char url_rewrite_rules[HTDIG_MAX_FILENAME_PATH_L];
  char bad_querystr[HTDIG_MAX_FILENAME_PATH_L];
  char locale[16];
  char title_factor[16];
  char text_factor[16];
  char meta_description_factor[16];
  int max_hop_count;

  //the rewritten URL - OUTGOING after htdig_index_test_url
  char rewritten_URL[HTDIG_MAX_FILENAME_PATH_L];

} htdig_parameters_struct;

/*****************************************************************
 * HTDIG_DOCUMENTATION for htdig_simple_doc_struct
 *
 *   STRING PARAMETERS
 *
 *   char location
 *       the 'URL' of the document.  Can be any useful string.
 *
 *   char documentid
 *       document id of document [NOT CURRENTLY USED - IGNORED]
 *
 *   char title
 *       document title
 *
 *   char meta
 *       content that is indexed but won't appear in any search excerpts
 *
 *   char * contents
 *       pointer to a NULL TERMINATED string of information to be
 *       indexed.  The caller allocates and frees this buffer.
 *
 *   char content_type
 *       a MIME-like string
 *       custom MIME-type defined above (HTDIG_CUSTOM_TEXT_MIME_TYPE),
 *       others are supported by htdig as well.
 *
 *   time_t doc_time
 *       document time
 *       (presumably overrides the index time -- confirm against
 *       the implementation)
 *
 *****************************************************************/

typedef struct htdig_simple_doc_struct {

  char location[HTDIG_MAX_FILENAME_PATH_L];
  char documentid[HTDIG_DOCUMENT_ID_L];
  char title[HTDIG_DOCUMENT_TITLE_L];
  char meta[HTDIG_DOCUMENT_META_L];
  char *contents;                                      //MUST ALLOCATE & FREE!!!
  char content_type[HTDIG_DOCUMENT_CONTENT_TYPE_L];    //MIME-ISH string
  //struct tm time_tm;                                 // use to override index time
  time_t doc_time;

} htdig_simple_doc_struct;


// htdig (indexing) functions
//
// All return int.  NOTE(review): the HTDIG_ERROR_* codes above are presumably
// what these return on failure -- confirm against the implementation.

int htdig_index_open(htdig_parameters_struct *htparms);
int htdig_index_simple_doc(htdig_simple_doc_struct *doc);
int htdig_index_urls(void);
int htdig_index_reset(void);
int htdig_index_close(void);

// Fills htparms->rewritten_URL on return (see the struct field comment).
int htdig_index_test_url(htdig_parameters_struct *htparms);

int htdig_get_max_head_length(void);




//=============================================================================
//===== HTDIG MERGING API =====================================================

/**************************************************
 * HTDIG_DOCUMENTATION for htmerge_parameters_struct
 *
 *   DEBUGGING PARAMETERS
 *
 *   int debug
 *       Verbose mode.  This increases the verbosity of the
 *       program.  Using more than 2 is probably only useful
 *       for debugging purposes.  The default verbose mode
 *       gives a progress on what it is doing and where it is.
 *
 *   char logFile
 *       File to stream debugging & error messages to!
 *
 *
 *   BOOLEAN PARAMETERS
 *
 *   int alt_work_area
 *       Use alternate work files.
 *       Tells htmerge to append .work to database files causing
 *       a second copy of the database to be built.  This allows
 *       original files to be used by htsearch during the indexing run.
 *
 *
 *   STRING PARAMETERS
 *
 *   char configFile
 *       configfile
 *       Use the specified configuration file instead of the default.
 *
 *   char merge_configFile
 *       merge_configfile
 *       Merge the databases specified into the databases specified
 *       by -c or the default.
 *
 *
 *************************************************/

typedef struct htmerge_parameters_struct {

  char configFile[HTDIG_MAX_FILENAME_PATH_L];
  char merge_configFile[HTDIG_MAX_FILENAME_PATH_L];

  //debugging & logfile
  char logFile[HTDIG_MAX_FILENAME_PATH_L];    //location of log file
  int debug;                                  //0, 1, 2, 3, 4, 5

  //boolean values
  int alt_work_area;

} htmerge_parameters_struct;

// htmerge functions
int htmerge_index_merge(htmerge_parameters_struct *htparms);




//=============================================================================
//===== HTDIG HTFUZZY API =====================================================

/**************************************************
 * HTDIG_DOCUMENTATION for htfuzzy_parameters_struct
 *
 *   DEBUGGING PARAMETERS
 *
 *   int debug
 *       Verbose mode.  This increases the verbosity of the
 *       program.  Using more than 2 is probably only useful
 *       for debugging purposes.
 *
 *   char logFile
 *       File to stream debugging & error messages to!
 *
 *
 *   PARAMETERS
 *
 *   char configFile
 *       configfile
 *       Use the specified configuration file instead of the default.
 *
 *   int algorithms_flag
 *       Bitwise Flags to signal algorithms to be used
 *
 *          soundex   == HTDIG_ALG_SOUNDEX
 *          metaphone == HTDIG_ALG_METAPHONE
 *          accents   == HTDIG_ALG_ACCENTS
 *          endings   == HTDIG_ALG_ENDINGS
 *          synonyms  == HTDIG_ALG_SYNONYMS
 *
 ***************************************************/

typedef struct htfuzzy_parameters_struct {

  char configFile[HTDIG_MAX_FILENAME_PATH_L];
  int algorithms_flag;

  //debugging & logfile
  char logFile[HTDIG_MAX_FILENAME_PATH_L];    //location of log file
  int debug;                                  //0, 1, 2, 3, 4, 5

  //boolean values

} htfuzzy_parameters_struct;


// htfuzzy functions
int htfuzzy_index(htfuzzy_parameters_struct *htparms);




//==============================================================================
//===== HTDIG SEARCHING API ====================================================

/************************************************
 * HTDIG_DOCUMENTATION for htsearch_parameters_struct
 *
 *   DEBUGGING PARAMETERS
 *
 *   int debug
 *       Verbose mode.  This increases the verbosity of the
 *       program.  Using more than 2 is probably only useful
 *       for debugging purposes.  The default verbose mode
 *       gives a progress on what it is doing and where it is.
 *
 *   char logFile
 *       File to stream debugging & error messages to!
 *
 *   STRING PARAMETERS
 *
 *   char configFile
 *       configfile
 *       Use the specified configuration file instead of the default.
 *
 *
 **************************************************/

typedef struct htsearch_parameters_struct {

  char configFile[HTDIG_MAX_FILENAME_PATH_L];
  char DBpath[HTDIG_MAX_FILENAME_PATH_L];
  char locale[16];

  //debugging & logfile
  char logFile[HTDIG_MAX_FILENAME_PATH_L];    //location of log file
  int debug;                                  //0, 1, 2, 3, 4, 5

  //filters
  char search_restrict[HTDIG_MAX_FILENAME_PATH_L];
  char search_exclude[HTDIG_MAX_FILENAME_PATH_L];
  char title_factor[16];
  char text_factor[16];
  char meta_description_factor[16];

} htsearch_parameters_struct;




/*****************************************************************
 * HTDIG_DOCUMENTATION for htsearch_query_struct
 *
 *   STRING PARAMETERS
 *
 *   char raw_query
 *       STRING of text that is the search query -- syntax is important
 *
 *   INTEGER PARAMETERS
 *
 *   int algorithms_flag   [ALSO CALLED 'method' IN HTDIG]
 *       HTSEARCH_ALG_BOOLEAN
 *       HTSEARCH_ALG_OR
 *       HTSEARCH_ALG_AND
 *
 *   int sortby_flag
 *       score, date, title & reversed
 *       HTSEARCH_SORT_SCORE
 *       HTSEARCH_SORT_REV_SCORE
 *       HTSEARCH_SORT_TIME
 *       HTSEARCH_SORT_REV_TIME
 *       HTSEARCH_SORT_TITLE
 *       HTSEARCH_SORT_REV_TITLE
 *
 *   int format
 *       short, long (with excerpt)
 *       HTSEARCH_FORMAT_LONG
 *       HTSEARCH_FORMAT_SHORT
 *
 *
 *
 *   TODO:  'Connect' these htsearch features to this API
 *
 *   config
 *       Specifies the name of the configuration file.
 *
 *   exclude
 *       This value is a pattern that specifies which URLs are to be excluded from
 *       the search results.
 *
 *   keywords
 *       Used to specify a list of required words that have to be in the documents.
 *
 *   restrict
 *       This value is a pattern that all URLs of the search results will have to
 *       match.
 *
 *   startyear, startmonth, startday, endyear, endmonth, endday
 *       These values specify the allowed range of document modification dates
 *       allowed in the search results.
 *
 *
 *
 *****************************************************************/

typedef struct htsearch_query_struct {

  char raw_query[HTDIG_MAX_QUERY_L];

  int algorithms_flag;
  int sortby_flag;
  int format;

} htsearch_query_struct;


/*****************************************************************
 * HTDIG_DOCUMENTATION for htsearch_query_match_struct
 *
 *   STRING PARAMETERS
 *
 *   char title
 *       Title of document returned
 *
 *   char URL
 *       URL/location-string of document returned
 *
 *   char excerpt
 *       Excerpt with search words highlighted with
 *       <strong>searchword</strong>
 *
 *   INTEGER PARAMETERS
 *
 *   int score
 *       score in 'number of stars'
 *       [MAX NUMBER OF STARS DECLARED IN CONFIG FILE]
 *
 *   int score_percent      //top result is 100%
 *
 *   struct tm time_tm      [DOCUMENT TIME]
 *       (this is the only time field; the struct has no time_t member)
 *   int size               [TOTAL DOCUMENT SIZE]
 *
 *
 *****************************************************************/

typedef struct htsearch_query_match_struct {

  char title[HTDIG_DOCUMENT_TITLE_L];
  char URL[HTDIG_MAX_FILENAME_PATH_L];
  char excerpt[HTDIG_DOCUMENT_EXCERPT_L];
  int score;
  int score_percent;    //top result is 100%
  struct tm time_tm;
  int size;

} htsearch_query_match_struct;


// htsearch functions
//
// NOTE(review): the HTSEARCH_ERROR_* codes above are presumably what the
// int-returning functions yield on failure -- confirm against the
// implementation.

int htsearch_open(htsearch_parameters_struct *htparms);
int htsearch_query(htsearch_query_struct *query);

// match_index selects which match to copy into *match; whether it is 0- or
// 1-based is not visible from this header -- confirm before relying on it.
int htsearch_get_nth_match(int match_index, htsearch_query_match_struct *match);

// Declared with (void) so that calls are checked against a real prototype;
// an empty parameter list in C means "unspecified arguments".
int htsearch_close(void);

//htsearch_free(indicator)

// Ownership of the returned string is not documented here; do not free it
// without checking the implementation.
char *htsearch_get_error(void);


#endif /* LIBHTDIG_API_H */