1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved. 2 3 This program is free software; you can redistribute it and/or modify 4 it under the terms of the GNU General Public License as published by 5 the Free Software Foundation; either version 2 of the License, or 6 (at your option) any later version. 7 8 This program is distributed in the hope that it will be useful, 9 but WITHOUT ANY WARRANTY; without even the implied warranty of 10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 GNU General Public License for more details. 12 13 You should have received a copy of the GNU General Public License 14 along with this program; if not, write to the Free Software 15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 */ 17 18 #ifndef _UDM_COMMON_H 19 #define _UDM_COMMON_H 20 21 #include "udm_config.h" 22 23 #include <stddef.h> 24 25 #include <stdio.h> /* for FILE etc. */ 26 27 #include <sys/types.h> 28 29 #ifdef HAVE_UNISTD_H 30 #include <unistd.h> 31 #endif 32 #ifdef HAVE_WINSOCK_H 33 #include <winsock.h> 34 #endif 35 #ifdef HAVE_SYS_SOCKET_H 36 #include <sys/socket.h> 37 #endif 38 #ifdef HAVE_NETINET_IN_H 39 #include <netinet/in.h> 40 #endif 41 #ifdef HAVE_ARPA_INET_H 42 #include <arpa/inet.h> 43 #endif 44 #ifdef HAVE_ARPA_NAMESER_H 45 #include <arpa/nameser.h> 46 #endif 47 #ifdef HAVE_RESOLV_H 48 #include <resolv.h> 49 #endif 50 #ifdef HAVE_NETDB_H 51 #include <netdb.h> 52 #endif 53 #ifdef MECAB 54 #include <mecab.h> 55 #endif 56 57 #ifdef HAVE_DEBUG 58 #include <assert.h> 59 #endif 60 61 /************************ Basic data types ****************/ 62 63 #if 0 64 #if 0 65 typedef const char * udm_rc_t; 66 #define UDM_OK ((const char*)NULL) 67 #define UDM_ERROR ((const char*)"error") 68 #define UDM_NOTARGET ((const char*)"notarget") 69 #define UDM_TERMINATED ((const char*)"terminated") 70 #else 71 typedef unsigned long udm_rc_t; 72 #define UDM_OK 0 73 #define UDM_ERROR 0xFFFFFFFFUL 74 #define UDM_NOTARGET 
0xFFFFFFFEUL 75 #define UDM_TERMINATED 0xFFFFFFFDUL 76 #endif 77 #else 78 79 typedef enum 80 { 81 UDM_OK= 0, 82 UDM_ERROR= 1, 83 UDM_NOTARGET= 2, 84 UDM_TERMINATED= 3 85 } udm_rc_t; 86 #endif 87 88 typedef enum 89 { 90 UDM_FALSE= 0, 91 UDM_TRUE= 1 92 } udm_bool_t; 93 94 95 #define UDM_TEST(x) ((x) ? UDM_TRUE : UDM_FALSE) 96 97 #ifndef udm_max 98 #define udm_max(a,b) (((a) > (b)) ? (a) : (b)) 99 #endif 100 #ifndef udm_min 101 #define udm_min(a,b) (((a) < (b)) ? (a) : (b)) 102 #endif 103 104 typedef int (*udm_qsort_cmp)(const void*, const void*); 105 106 typedef uint4 udm_pos_t; 107 typedef unsigned char udm_secno_t; 108 typedef unsigned char udm_wordnum_t; 109 110 /**********************************************************/ 111 112 #include "udm_unicode.h" 113 #include "udm_uniconv.h" 114 #include "udm_unidata.h" 115 #include "udm_hash.h" 116 117 /**************************** Constant strings *******************************/ 118 typedef struct udm_const_string_st 119 { 120 const char *str; 121 size_t length; 122 } UDM_CONST_STR; 123 124 /************************* Constant attribute - name with value *************/ 125 typedef struct udm_const_attr_st 126 { 127 UDM_CONST_STR name; 128 UDM_CONST_STR value; 129 } UDM_CONST_ATTR; 130 131 /**************************** Variable strings *******************************/ 132 typedef struct udm_str_st 133 { 134 char *str; 135 size_t length; 136 } UDM_STR; 137 138 /************************ Dynamic strings ************************************/ 139 typedef struct dstr_struct 140 { 141 size_t size_alloced; /* Bytes allocated */ 142 size_t size_page; /* Bytes to allocate on overflow */ 143 #ifdef DSTR_PRIVATE 144 UDM_STR Val; /* Value (usually null-terminated string) and its length */ 145 #else 146 UDM_STR Val; 147 #endif 148 } UDM_DSTR; 149 150 #include "udm_utils.h" 151 152 153 /* Some constants */ 154 #define UDM_LANGPERDOC 16 /* FIXME */ 155 #define UDM_USER_AGENT "MnoGoSearch/" VERSION 156 #define UDM_MAXWORDPERQUERY 64 157 
/*
  Some sizes and other definitions.
  Expression-valued macros are fully parenthesized so that they expand
  correctly inside larger expressions (division, modulo, etc).
*/
#define UDM_MAXDOCSIZE                   (2*1024*1024) /**< 2 MB */
#define UDM_DEFAULT_REINDEX_TIME         (7*24*60*60)  /**< 1 week */
#define UDM_MAXWORDSIZE                  32
#define UDM_MAXDISCWORDSIZE              64
#define UDM_DEFAULT_MAX_HOPS             256
#define UDM_READ_TIMEOUT                 30
#define UDM_DOC_TIMEOUT                  90
#define UDM_MAXNETERRORS                 16
#define UDM_DEFAULT_NET_ERROR_DELAY_TIME 86400
#define UDM_DEFAULT_BAD_SINCE_TIME       (15*24*60*60) /**< 15 days */
#define UDM_FINDURL_CACHE_SIZE           128
#define UDM_SERVERID_CACHE_SIZE          128
#define UDM_ERRSTR_SIZE                  2048

/* Search modes */
typedef enum {
  UDM_MODE_ALL=            0,
  UDM_MODE_ANY=            1,
  UDM_MODE_BOOL=           2,
  UDM_MODE_PHRASE=         3,
  UDM_MODE_ALL_MINUS=      4,
  UDM_MODE_ALL_MINUS_HALF= 5
} udm_search_mode_t;


/* Word and pattern match types */
typedef enum {
  UDM_MATCH_FULL=       0,
  UDM_MATCH_BEGIN=      1,
  UDM_MATCH_SUBSTR=     2,
  UDM_MATCH_END=        3,
  UDM_MATCH_REGEX=      4,
  UDM_MATCH_WILD=       5,
  UDM_MATCH_SUBNET=     6,
  UDM_MATCH_NUMERIC_LT= 7,
  UDM_MATCH_NUMERIC_GT= 8,
  UDM_MATCH_RANGE=      9
} udm_match_mode_t;


/* Case sensitivity */
#define UDM_CASE_SENSITIVE   0
#define UDM_CASE_INSENSITIVE 1

/* Flags for indexing (bit mask) */
#define UDM_FLAG_REINDEX        1
#define UDM_FLAG_SORT_EXPIRED   2
#define UDM_FLAG_SORT_HOPS      4
#define UDM_FLAG_ADD_SERV       8
#define UDM_FLAG_SPELL          16
#define UDM_FLAG_LOAD_LANGMAP   32
#define UDM_FLAG_DONTSORT_SEED  64
#define UDM_FLAG_ADD_SERVURL    128
#define UDM_FLAG_DONT_ADD_TO_DB 256


/* URLFile actions */
typedef enum {
  UDM_URL_FILE_REINDEX= 1,
  UDM_URL_FILE_CLEAR=   2,
  UDM_URL_FILE_INSERT=  3,
  UDM_URL_FILE_PARSE=   4
} udm_urlfilecmd_t;

/* Ispell mode binary flags */
#define UDM_ISPELL_MODE_DB      1
#define UDM_ISPELL_USE_PREFIXES 2
#define UDM_ISPELL_MODE_SERVER  4


/* Known content types */
typedef enum {
  UDM_CONTENT_TYPE_UNKNOWN=        0,
  UDM_CONTENT_TYPE_TEXT_PLAIN=     1,
  UDM_CONTENT_TYPE_TEXT_HTML=      2,
  UDM_CONTENT_TYPE_TEXT_XML=       3,
  UDM_CONTENT_TYPE_MESSAGE_RFC822= 4,
  UDM_CONTENT_TYPE_AUDIO_MPEG=     5,
  UDM_CONTENT_TYPE_HTDB=           6,
  UDM_CONTENT_TYPE_DOCX=           7,
  UDM_CONTENT_TYPE_TEXT_RTF=       8
} udm_content_type_t;


/* Action type: HTTP methods */
typedef enum {
  UDM_METHOD_GET=          0,
  UDM_METHOD_UNKNOWN=      1,
  UDM_METHOD_DISALLOW=     2,
  UDM_METHOD_HEAD=         3,
  UDM_METHOD_HREFONLY=     4,
  UDM_METHOD_CHECKMP3=     5,
  UDM_METHOD_CHECKMP3ONLY= 6,
  UDM_METHOD_VISITLATER=   7,
  UDM_METHOD_INDEX=        8,
  UDM_METHOD_NOINDEX=      9,
  UDM_METHOD_IMPORTONLY=   10
} udm_method_t;

#define UDM_METHOD_DEFAULT UDM_METHOD_GET

/* Where a link was found */
typedef enum {
  UDM_LINK_SOURCE_UNKNOWN=        0,
  UDM_LINK_SOURCE_CMDLINE=        1,
  UDM_LINK_SOURCE_CONF=           2,
  UDM_LINK_SOURCE_A_HREF=         3,
  UDM_LINK_SOURCE_FRAME_SRC=      4,
  UDM_LINK_SOURCE_IMG_SRC=        5,
  UDM_LINK_SOURCE_REDIRECT=       6,
  UDM_LINK_SOURCE_META_REFRESH=   7,
  UDM_LINK_SOURCE_LINK_HREF=      8,
  UDM_LINK_SOURCE_AREA_HREF=      9,
  UDM_LINK_SOURCE_IFRAME_SRC=     10,
  UDM_LINK_SOURCE_SCRIPT_SRC=     11,
  UDM_LINK_SOURCE_HTDB=           12,
  UDM_LINK_SOURCE_URLFILE=        13,
  UDM_LINK_SOURCE_ROBOTS_SITEMAP= 14,
  UDM_LINK_SOURCE_XML=            15,
  UDM_LINK_SOURCE_DIR=            16  /* Directory listing: file:///dir/ */
} udm_link_source_t;
#define UDM_LINK_SOURCES_YES 0x7FFFFFFF


/* CollectLinks flags (bit mask) */
typedef enum {
  UDM_COLLECT_LINKS_NONE=      0x0000, /* do not collect links              */
  UDM_COLLECT_LINKS_INNER=     0x0001, /* links covered by .conf file       */
  UDM_COLLECT_LINKS_OUTER=     0x0002, /* links going outside .conf file    */
  UDM_COLLECT_LINKS_SITE=      0x0004, /* links going to the same site      */
  UDM_COLLECT_LINKS_PAGE=      0x0008, /* links going to the same page      */
  UDM_COLLECT_LINKS_BADSCHEME= 0x0010, /* Scheme: mailto, javascript        */
  UDM_COLLECT_LINKS_BAD=       0x0020, /* Bad syntax                        */
  UDM_COLLECT_LINKS_HOPS=      0x0040, /* Too many hops                     */
  UDM_COLLECT_LINKS_FILTER=    0x0080, /* NOTE(review): presumably "rejected
                                          by a filter"; the original comment
                                          repeated "Too many hops" - confirm */
  UDM_COLLECT_LINKS_PERSITE=   0x0100, /* Per site limit reached            */
  UDM_COLLECT_LINKS_ALL_DST=   0xFFFF
} udm_link_destination_t;


/*
  Don't need these by default:
    UDM_COLLECT_LINKS_PAGE
    UDM_COLLECT_LINKS_BADSCHEME
    UDM_COLLECT_LINKS_BAD
*/
#define UDM_COLLECT_LINKS_YES \
  (UDM_COLLECT_LINKS_INNER   |\
   UDM_COLLECT_LINKS_OUTER   |\
   UDM_COLLECT_LINKS_SITE    |\
   UDM_COLLECT_LINKS_HOPS    |\
   UDM_COLLECT_LINKS_FILTER  |\
   UDM_COLLECT_LINKS_PERSITE)
#define UDM_COLLECT_LINKS_DEFAULT UDM_COLLECT_LINKS_YES

/* Robots flags (bit mask) */
typedef enum {
  UDM_ROBOTS_NONE=   0x0000, /* Don't respect robots limitation              */
  UDM_ROBOTS_TXT=    0x0001, /* Respect robots.txt instructions              */
  UDM_ROBOTS_HEADER= 0x0002, /* Respect "X-Robots-Tag" HTTP header           */
  UDM_ROBOTS_META=   0x0004, /* Respect <meta robots="xxx"> instructions     */
  UDM_ROBOTS_REL=    0x0008, /* Respect <link rel="nofollow"> instructions   */
  UDM_ROBOTS_ALL=    0xFFFF  /* Respect all robots instructions              */
} udm_use_robots_t;


/* Link storage format; values lie above the 0xFFFF destination flag range */
typedef enum {
  UDM_COLLECT_LINKS_ABSOLUTE= 0x000000, /* Store in absolute format  */
  UDM_COLLECT_LINKS_ASIS=     0x010000, /* Preserve the original form */
  UDM_COLLECT_LINKS_FORMAT=   0xFF0000
} udm_link_format_t;


/* Content encoding types (note: value 2 is not used) */
typedef enum {
  UDM_CONTENT_ENCODING_IDENTITY=      0,
  UDM_CONTENT_ENCODING_DEFLATE=       1,
  UDM_CONTENT_ENCODING_GZIP=          3,
  UDM_CONTENT_ENCODING_ZLIB_COMPRESS= 4,
  UDM_CONTENT_ENCODING_UNKNOWN=       5
} udm_content_encoding_t;


/* Word origins */
typedef enum {
  UDM_WORD_ORIGIN_QUERY=         1,
  UDM_WORD_ORIGIN_SPELL=         2,
  UDM_WORD_ORIGIN_SYNONYM=       3,
  UDM_WORD_ORIGIN_SYNONYM_FINAL= 4,
  UDM_WORD_ORIGIN_STOP=          5,
  UDM_WORD_ORIGIN_SUGGEST=       6,
  UDM_WORD_ORIGIN_COLLATION=     7
} udm_wordorigin_t;

363 364 /* URL data flags */ 365 #define UDM_URLDATA_URL 1 366 #define UDM_URLDATA_SITE 2 367 #define UDM_URLDATA_POP 4 368 #define UDM_URLDATA_LM 8 369 #define UDM_URLDATA_SU 16 370 #define UDM_URLDATA_SITE_RANK 32 371 372 /* Locking mutex numbers */ 373 #define UDM_LOCK_INTERNAL 0 374 #define UDM_LOCK_CONF 1 375 #define UDM_LOCK_WIN 2 376 #define UDM_LOCK_THREAD 3 377 #define UDM_LOCK_SQL 4 378 #define UDM_LOCK_SEGMENTER 5 379 #define UDM_LOCK_DB 6 380 #define UDM_LOCK_LOG 7 381 #define UDM_LOCK_HREF_CACHE 8 382 #define UDM_LOCK_TARGETS 9 383 #define UDM_LOCK_HOST_CACHE 10 384 #define UDM_LOCK_INADDR_CACHE 11 385 #define UDM_LOCK_DOC_CACHE 12 386 #define UDM_LOCK_ROBOT_CACHE 13 387 #define UDM_LOCK_COOKIE_CACHE 14 388 /* Don't forget to add a new name into mutex.c */ 389 #define UDM_LOCK_ROBOT_FIRST 15 390 #define UDM_LOCK_ROBOT_LAST 128 391 #define UDM_LOCK_ROBOT_COUNT (UDM_LOCK_ROBOT_LAST - UDM_LOCK_ROBOT_FIRST +1) 392 #define UDM_LOCK_HOST_FIRST 129 393 #define UDM_LOCK_HOST_LAST 255 394 #define UDM_LOCK_HOST_COUNT (UDM_LOCK_HOST_LAST - UDM_LOCK_HOST_FIRST + 1) 395 #define UDM_LOCK_MAX (UDM_LOCK_HOST_LAST + 1) 396 397 typedef unsigned int udm_threadid_t; 398 typedef unsigned int udm_mutexno_t; 399 400 typedef enum 401 { 402 UDM_LOCK, 403 UDM_UNLOCK, 404 UDM_CKLOCK 405 } udm_mutexcmd_t; 406 407 /************************ Statistics **********************/ 408 typedef struct stat_struct 409 { 410 int status; 411 int expired; 412 int total; 413 } UDM_STAT; 414 415 typedef struct stat_list_struct 416 { 417 time_t time; 418 size_t nstats; 419 UDM_STAT *Stat; 420 } UDM_STATLIST; 421 422 /************************ VARLISTs ************************/ 423 424 /* Various variable flags */ 425 typedef enum 426 { 427 UDM_VARFLAG_NONE= 0x00, 428 UDM_VARFLAG_NOCLONE= 0x01, /* Ignore for clone detection */ 429 UDM_VARFLAG_USERDEF= 0x02, /* User defined section */ 430 UDM_VARFLAG_HTMLSOURCE= 0x08, /* If apply HTML parser, for HTDB */ 431 UDM_VARFLAG_WIKI= 0x10, /* If to remove 
text between [ and ] */ 432 UDM_VARFLAG_HL= 0x20, /* If variable has highlight markers */ 433 UDM_VARFLAG_NOINDEX= 0x40, /* If section should be in bdicti but not in bdict */ 434 UDM_VARFLAG_DECIMAL= 0x80, /* Whether to detect decimal numbers */ 435 UDM_VARFLAG_RAW= 0x100, /* If sections is Raw (no cs conversion */ 436 UDM_VARFLAG_READONLY= 0x200, /* If read only (e.g. full) */ 437 UDM_VARFLAG_FORCETXT= 0x400 /* $(var) forces TXT rather than HTML output */ 438 } udm_var_flag_t; 439 440 441 /* Value handler types */ 442 typedef enum 443 { 444 UDM_VALUE_HANDLER_TYPE_STR= 1, 445 UDM_VALUE_HANDLER_TYPE_INT= 2, 446 UDM_VALUE_HANDLER_TYPE_DOUBLE= 3, 447 UDM_VALUE_HANDLER_TYPE_CHAR= 4, 448 UDM_VALUE_HANDLER_TYPE_ENV= 16, 449 UDM_VALUE_HANDLER_TYPE_RESULT= 18, 450 UDM_VALUE_HANDLER_TYPE_DOCUMENT= 19, 451 UDM_VALUE_HANDLER_TYPE_SQLRESULT= 20, 452 UDM_VALUE_HANDLER_TYPE_EXCERPT_FRAGMENT= 21 453 } udm_value_handler_type_t; 454 455 /* 456 Value handler data types. 457 Handlers of different type can have the data type. 458 */ 459 typedef enum 460 { 461 UDM_VALUE_DATA_TYPE_STR= 1, 462 UDM_VALUE_DATA_TYPE_INT= 2, 463 UDM_VALUE_DATA_TYPE_DOUBLE= 3, 464 UDM_VALUE_DATA_TYPE_CHAR= 4, 465 UDM_VALUE_DATA_TYPE_ENV= 16, 466 UDM_VALUE_DATA_TYPE_RESULT= 18, 467 UDM_VALUE_DATA_TYPE_DOCUMENT= 19, 468 UDM_VALUE_DATA_TYPE_SQLRESULT= 20, 469 UDM_VALUE_DATA_TYPE_EXCERPT_FRAGMENT= 21 470 } udm_value_data_type_t; 471 472 473 474 /* 475 These values should not be equal to any secno values 476 in message_header_param[] in message.c. 477 TODO34: get rid of this. Don't put QSTRING and ENV 478 values into ENV, e.g. use UDM_QUERY instead. 
479 */ 480 #define UDM_VARSRC_QSTRING 255 481 #define UDM_VARSRC_ENV 254 482 483 484 typedef struct 485 { 486 udm_secno_t secno; /**< Number 0..255 */ 487 udm_var_flag_t flags; 488 } UDM_SECTION_PARAM; 489 490 491 typedef struct 492 { 493 UDM_DSTR Value; /**< Value with length */ 494 UDM_SECTION_PARAM Param; /**< Parameters */ 495 } UDM_SECTION; 496 497 498 struct udm_value_st; 499 struct udm_var_st; 500 struct udm_varlist_st; 501 struct udm_value_handler_st; 502 struct udm_prog_executor_state_st; 503 struct udm_prog_executor_st; 504 505 typedef struct udm_value_prototype_st 506 { 507 const struct udm_value_handler_st *handler; 508 } UDM_VALUE_PROTOTYPE; 509 510 511 typedef struct udm_function_prototype_st 512 { 513 UDM_VALUE_PROTOTYPE result; 514 size_t nargs; 515 const UDM_VALUE_PROTOTYPE *args; 516 } UDM_FUNCTION_PROTOTYPE; 517 518 519 typedef void (*udm_func_runtime_t)(struct udm_prog_executor_state_st *state); 520 521 typedef struct 522 { 523 const UDM_CONST_STR name; 524 UDM_FUNCTION_PROTOTYPE prototype; 525 udm_func_runtime_t func; 526 } UDM_FUNCTION; 527 528 529 typedef struct udm_value_handler_st 530 { 531 udm_value_handler_type_t type; 532 udm_value_data_type_t datatype; 533 udm_value_data_type_t native_reg_type; 534 const char *type_name; 535 udm_rc_t (*Constructor)(char *Data, 536 const struct udm_value_st **args, size_t nargs); 537 void (*Destructor)(char *Data); 538 539 udm_rc_t (*Copy)(char *To, const char *From); 540 541 size_t (*VarSize)(void); 542 size_t (*ValueSize)(void); 543 size_t (*DataOffset)(void); 544 size_t (*DataSize)(void); 545 size_t (*DataAlignment)(void); 546 udm_rc_t (*Dump)(const char *Data, const char *name, FILE *f); 547 udm_var_flag_t (*Flags)(const char *Data); 548 udm_secno_t (*Secno)(const char *Data); 549 size_t (*MemUsed)(const char *Data); 550 551 void (*GetConstStr)(const char *D, UDM_CONST_STR *Val); 552 void (*GetBool)(const char *D, udm_bool_t *val); 553 void (*GetInt)(const char *D, int *val); 554 void 
(*GetUInt)(const char *D, unsigned int *val); 555 void (*GetDouble)(const char *D, double *val); 556 udm_rc_t (*SetInt)(char *D, int value); 557 udm_rc_t (*SetDouble)(char *D, double value); 558 udm_rc_t (*SetStrn)(char *D, const char *str, size_t length); 559 560 udm_rc_t (*PrintToFile)(const char *D, FILE *f); 561 udm_rc_t (*PrintToDSTR)(const char *D, UDM_DSTR *dstr); 562 563 udm_rc_t (*ConvertCharset)(char *Data, UDM_CONV *conv, int flags); 564 /* Set new value but don't touch the other members (e.g. section param) */ 565 udm_rc_t (*SetConv)(char *Data, UDM_CONV *cnv, int cnvflag, const char *src, size_t length); 566 udm_rc_t (*AppendStrn)(char *Data, const char *str, size_t length); 567 udm_rc_t (*AppendConv)(char *Data, size_t maxlen, UDM_CONV *cnv, int cnvflags, const char *src, size_t length); 568 569 const UDM_FUNCTION *method; 570 } UDM_VALUE_HANDLER; 571 572 573 typedef struct udm_value_st 574 { 575 const struct udm_value_handler_st *handler; /**< Value handler */ 576 } UDM_VALUE; 577 578 579 typedef struct udm_var_header_st 580 { 581 char *name; /**< Variable name */ 582 } UDM_VAR_HEADER; 583 584 typedef struct udm_var_st 585 { 586 UDM_VAR_HEADER header; 587 #ifdef UDM_VAR_H_PRIVATE 588 UDM_VALUE Value; 589 #else 590 UDM_VALUE m_hidden_Value; 591 #endif 592 } UDM_VAR; 593 594 595 typedef enum 596 { 597 UDM_VARLIST_FLAG_NONE= 0, 598 UDM_VARLIST_FLAG_CS= 1 /* Case sensitive */ 599 } udm_varlist_flag_t; 600 601 602 typedef struct udm_varlist_st 603 { 604 size_t nvars; /* Number of registered variables */ 605 size_t mvars; /* Number of allocated variables */ 606 UDM_VAR **Var; 607 udm_varlist_flag_t flags; 608 } UDM_VARLIST; 609 610 611 typedef struct 612 { 613 size_t nitems; 614 size_t mitems; 615 UDM_VARLIST *Item; 616 } UDM_VARLISTLIST; 617 618 619 typedef enum 620 { 621 UDM_TEXTLIST_FLAG_NONE= 0x00, 622 UDM_TEXTLIST_FLAG_SKIP_ADD_SECTION= 0x01, 623 UDM_TEXTLIST_FLAG_RFC1522= 0x02, /* Message header (Subj, From)*/ 624 UDM_TEXTLIST_FLAG_MESSAGE_RFC822= 
0x04, /* Used by cached copy for messages */ 625 UDM_TEXTLIST_FLAG_HTML= 0x08 /* HTML format with entities */ 626 } udm_textlist_flag_t; 627 628 629 typedef struct 630 { 631 udm_secno_t secno; 632 udm_textlist_flag_t flags; 633 } UDM_TEXT_PARAM; 634 635 636 typedef struct 637 { 638 UDM_CONST_STR text; 639 UDM_CONST_STR href; 640 UDM_CONST_STR section_name; 641 } UDM_CONST_TEXTITEM; 642 643 644 typedef struct 645 { 646 char *str; 647 char *href; 648 char *section_name; 649 UDM_TEXT_PARAM Param; 650 } UDM_TEXTITEM; 651 652 653 typedef struct 654 { 655 size_t nitems; 656 size_t mitems; 657 UDM_TEXTITEM *Item; 658 } UDM_TEXTLIST; 659 660 /*****************************************************/ 661 662 /** StopList unit */ 663 typedef struct udm_stopword_struct 664 { 665 char *word; 666 } UDM_STOPWORD; 667 668 #define UDM_STOPLIST_LANGLEN 32 669 #define UDM_STOPLIST_CSETLEN 32 670 #define UDM_STOPLIST_FILELEN 128 671 672 typedef struct 673 { 674 size_t nstopwords; 675 UDM_STOPWORD *StopWord; 676 char lang[UDM_STOPLIST_LANGLEN]; 677 char cset[UDM_STOPLIST_CSETLEN]; 678 char fname[UDM_STOPLIST_FILELEN]; 679 } UDM_STOPLIST; 680 681 typedef struct 682 { 683 size_t nitems; 684 UDM_STOPLIST *Item; 685 } UDM_STOPLISTLIST; 686 687 /*****************************************************/ 688 689 /** Words parameters */ 690 typedef struct 691 { 692 size_t min_word_len; 693 size_t max_word_len; 694 } UDM_WORDPARAM; 695 696 697 /**************************/ 698 typedef struct udm_coord_st 699 { 700 udm_pos_t pos:24; /* 3 */ 701 udm_secno_t secno; /* 1 */ 702 } UDM_COORD; 703 704 typedef struct udm_urlid_coord_st 705 { 706 urlid_t url_id; /* 4 */ 707 UDM_COORD coord; /* 4 */ 708 } UDM_URLID_COORD; 709 710 typedef struct 711 { 712 UDM_URLID_COORD urlid_coord; 713 udm_pos_t seclen:24; /* 3 */ /*TODO34: get rid of this*/ 714 udm_wordnum_t num; /* 1 */ /*TODO34: get rid of this*/ 715 } UDM_URL_CRD; /* 12 bytes total */ 716 717 718 typedef struct 719 { 720 size_t acoords; 721 size_t 
ncoords; 722 size_t order; 723 char *word; 724 UDM_URL_CRD *Coords; 725 } UDM_URLCRDLIST; 726 727 728 /***************************/ 729 typedef struct udm_coord2_st 730 { 731 udm_pos_t pos:24; 732 udm_wordnum_t order; 733 } UDM_COORD2; 734 735 736 typedef struct udm_searchsection_st 737 { 738 UDM_COORD2 *Coord; /* 4/8 bytes */ 739 const unsigned char *PackedCoord; /* 4/8 bytes */ 740 urlid_t url_id; /* 4 bytes */ 741 udm_pos_t ncoords; /* 4 bytes */ 742 udm_pos_t seclen; /* 4 bytes */ 743 udm_pos_t minpos; /* 4 bytes */ 744 udm_pos_t maxpos; /* 4 bytes */ 745 udm_secno_t secno; /* 1 byte */ 746 udm_wordnum_t wordnum; /* 1 byte */ 747 udm_wordnum_t order; /* 1 byte */ 748 } UDM_SEARCHSECTION; /* 32 bytes (i386), 40 bytes (64bit) */ 749 750 751 typedef struct udm_searchsectionlist_st 752 { 753 size_t mcoords; 754 size_t ncoords; 755 UDM_COORD2 *Coord; 756 size_t msections; 757 size_t nsections; 758 UDM_SEARCHSECTION *Section; 759 } UDM_SEARCHSECTIONLIST; 760 761 762 typedef struct udm_searchsectionlistlist_st 763 { 764 size_t nitems; 765 size_t mitems; 766 UDM_SEARCHSECTIONLIST *Item; 767 } UDM_SEARCHSECTIONLISTLIST; 768 769 770 771 /** Main search structure */ 772 typedef struct { 773 urlid_t url_id; 774 uint4 score; 775 } UDM_URL_SCORE; 776 777 typedef struct { 778 size_t nitems; 779 UDM_URL_SCORE *Item; 780 } UDM_URLSCORELIST; 781 782 783 /* UserScore and UserSiteScore structure */ 784 785 typedef struct udm_url_int4_st 786 { 787 urlid_t url_id; 788 int4 param; 789 } UDM_URL_INT4; 790 791 typedef struct udm_url_int4_list_st 792 { 793 size_t nitems; 794 UDM_URL_INT4 *Item; 795 } UDM_URL_INT4_LIST; 796 797 798 799 /* Structure to handle limits */ 800 typedef struct udm_urlid_list_st 801 { 802 char empty; 803 char exclude; 804 urlid_t *urls; 805 size_t nurls; 806 } UDM_URLID_LIST; 807 808 809 typedef struct 810 { 811 urlid_t url_id; 812 uint4 score; 813 uint4 per_site; 814 urlid_t site_id; 815 time_t last_mod_time; 816 double pop_rank; 817 char *url; 818 char 
*section; 819 } UDM_URLDATA; 820 821 #define UDM_COORD2DBNUM(score) (255 - (int) ((score) & 0xFF)) 822 823 typedef struct 824 { 825 size_t nitems; 826 UDM_URLDATA *Item; 827 } UDM_URLDATALIST; 828 829 830 /** Word list unit */ 831 typedef struct 832 { 833 char *word; 834 UDM_COORD coord; 835 unsigned char hash; 836 unsigned char seclen_marker; 837 } UDM_WORD; 838 839 typedef struct 840 { 841 size_t wordpos[256]; /**< Word positions in sections */ 842 size_t mwords; /**< Number of memory allocated for words */ 843 size_t nwords; /**< Real number of words in list */ 844 UDM_WORD *Word; /**< Word list itself */ 845 } UDM_WORDLIST; 846 847 848 typedef struct 849 { 850 size_t nitems; 851 UDM_WORDLIST Item[256]; 852 } UDM_WORDLISTLIST; 853 854 855 typedef struct 856 { 857 const char *str; /* 4 */ 858 udmcrc32_t crc; /* 4 */ 859 UDM_COORD coord; /* 4 */ 860 unsigned char length; /* 1 */ 861 } UDM_CONSTWORD; 862 863 864 typedef struct 865 { 866 size_t nitems; 867 size_t mitems; 868 UDM_CONSTWORD *Item; 869 udm_pos_t wordpos[256]; 870 } UDM_CONSTWORDLIST; 871 872 /***************************************************************/ 873 874 /* Server/Realm follow types */ 875 typedef enum 876 { 877 UDM_WEBSPACE_PAGE= 0, 878 UDM_WEBSPACE_PATH= 1, 879 UDM_WEBSPACE_SITE= 2, 880 UDM_WEBSPACE_WORLD= 3, 881 UDM_WEBSPACE_URLLIST= 4, 882 UDM_WEBSPACE_UNKNOWN= 127 883 } udm_webspace_t; 884 #define UDM_WEBSPACE_DEFAULT UDM_WEBSPACE_PATH 885 886 typedef struct 887 { 888 udm_bool_t index; /**< Whether to index words */ 889 udm_bool_t follow; /**< Whether follow links */ 890 udm_bool_t archive; /**< Whether to store cached copies */ 891 } UDM_ROBOTSPARAM; 892 893 894 typedef struct 895 { 896 int max_net_errors; 897 int net_error_delay_time; 898 int read_timeout; 899 int doc_timeout; 900 int period; /**< Reindex period */ 901 int maxhops; /**< Max way in mouse clicks */ 902 int doc_per_site; 903 int crawl_delay; 904 int dns_cache_timeout; 905 int link_sources_to_follow; 906 
udm_link_destination_t collect_links_destination; 907 udm_link_format_t collect_links_format; 908 udm_webspace_t webspace; /**< World, Site, Path, Page */ 909 udm_use_robots_t use_robots; /**< Whether to use robots.txt and meta tags */ 910 UDM_ROBOTSPARAM robots; 911 udm_bool_t use_clones; /**< Whether to detect clones */ 912 udm_bool_t ajax_links; /**< Whether to detect links with '#!' */ 913 } UDM_SPIDERPARAM; 914 915 /*****************************************************************/ 916 917 typedef enum 918 { 919 UDM_MATCH_FLAG_NONE= 0, 920 UDM_MATCH_FLAG_SKIP_OPTIMIZATION= 1, 921 UDM_MATCH_FLAG_CASE_INSENSITIVE= 2, 922 UDM_MATCH_FLAG_NEGATIVE= 4 923 } udm_matchflag_t; 924 925 926 typedef struct 927 { 928 int one; 929 int many; 930 int eol; 931 } UDM_WILD_PARAM; 932 933 934 typedef struct 935 { 936 udm_match_mode_t match_mode; 937 udm_matchflag_t flags; /* optimization, case sensitivity, negative */ 938 } UDM_MATCH_PARAM; 939 940 typedef struct 941 { 942 UDM_MATCH_PARAM Param; 943 #ifdef UDM_MATCH_PRIV 944 UDM_STR Pattern; 945 #else 946 UDM_STR m_hidden_Pattern; 947 #endif 948 void *specific; 949 } UDM_MATCH; 950 951 952 typedef struct 953 { 954 int beg; 955 int end; 956 } UDM_MATCH_PART; 957 958 959 typedef struct 960 { 961 UDM_MATCH Match; 962 int quality; 963 } UDM_EXCERPT_FRAGMENT; 964 965 966 typedef struct 967 { 968 UDM_MATCH Match; 969 UDM_STR Replace; 970 } UDM_REPLACE; 971 972 973 typedef struct 974 { 975 size_t nitems; 976 size_t mitems; 977 UDM_REPLACE *Item; 978 } UDM_REPLACELIST; 979 980 981 typedef struct 982 { 983 UDM_REPLACE Replace; 984 UDM_STR SectionName; 985 UDM_STR Source; 986 } UDM_USERSECTION; 987 988 989 typedef struct 990 { 991 size_t nitems; 992 size_t mitems; 993 UDM_USERSECTION *Item; 994 } UDM_USERSECTIONLIST; 995 996 997 typedef struct 998 { 999 UDM_MATCH Match; 1000 udm_method_t method; 1001 } UDM_FILTER; 1002 1003 1004 typedef struct 1005 { 1006 size_t mitems; 1007 size_t nitems; 1008 UDM_FILTER *Item; 1009 } UDM_FILTERLIST; 1010 
1011 1012 typedef struct 1013 { 1014 UDM_FILTER Filter; 1015 char *section; 1016 } UDM_SECTIONFILTER; 1017 1018 1019 typedef struct 1020 { 1021 size_t mitems; 1022 size_t nitems; 1023 UDM_SECTIONFILTER *Item; 1024 } UDM_SECTIONFILTERLIST; 1025 1026 1027 /*****************************************************************/ 1028 /** Parsed URL string */ 1029 typedef struct udm_url 1030 { 1031 char *schema; 1032 char *specific; 1033 char *hostinfo; 1034 char *auth; 1035 char *hostname; 1036 char *path; 1037 char *filename; 1038 char *anchor; 1039 int port; 1040 int default_port; 1041 } UDM_URL; 1042 1043 1044 /** List of URLs */ 1045 typedef struct udm_url_list 1046 { 1047 size_t nitems; 1048 size_t mitems; 1049 UDM_URL *Item; 1050 } UDM_URLLIST; 1051 1052 1053 /*****************************************************************/ 1054 1055 /** Structure to store server parameters */ 1056 typedef struct 1057 { 1058 UDM_FILTER Filter; 1059 urlid_t site_id; /**< server.rec_id */ 1060 char command; /**< 'S' - server,realm, 'F' - disallow,allow */ 1061 int ordre; /**< order in list to find */ 1062 urlid_t parent; /**< parent rec_id for grouping by site */ 1063 float weight; /**< server weight for popularity rank calculation */ 1064 UDM_VARLIST Vars; /**< Default lang, charset,etc*/ 1065 UDM_URLLIST ProxyList; /**< List of proxies */ 1066 uint4 MaxHops; 1067 udm_webspace_t webspace; /* Page, Path, Site, World, etc*/ 1068 /*udm_method_t method;*/ /* Allow, Disallow, etc */ 1069 udm_bool_t enabled; 1070 } UDM_SERVER; 1071 1072 1073 typedef struct 1074 { 1075 size_t nservers; 1076 size_t mservers; 1077 int have_subnets; 1078 UDM_SERVER *Server; 1079 } UDM_SERVERLIST; 1080 1081 1082 /*******************************************************/ 1083 1084 typedef struct 1085 { 1086 size_t max_doc_per_site; 1087 urlid_t referrer; 1088 uint4 hops; 1089 urlid_t server_id; 1090 urlid_t rec_id; 1091 udm_method_t method; 1092 udm_bool_t stored; 1093 udm_link_source_t link_source; 1094 
udm_link_destination_t method_reason; 1095 } UDM_HREFPARAM; 1096 1097 /* All links are stored in the cache of this structure */ 1098 /* before actual INSERT into database */ 1099 1100 typedef struct 1101 { 1102 char *url; 1103 UDM_VARLIST HrefVars; 1104 UDM_HREFPARAM Param; 1105 } UDM_HREF; 1106 1107 1108 typedef enum 1109 { 1110 UDM_HREFLIST_FLAG_NONUNIQ= 1 1111 } udm_hreflist_flag_t; 1112 1113 1114 typedef struct 1115 { 1116 size_t mhrefs; 1117 size_t nhrefs; 1118 size_t shrefs; 1119 UDM_HREF *Href; 1120 int flags; 1121 } UDM_HREFLIST; 1122 1123 /*******************************************************/ 1124 1125 /* IP with parameters */ 1126 typedef struct udm_sin_addr_st 1127 { 1128 struct in_addr addr; 1129 time_t first_used; 1130 time_t last_used; 1131 size_t hits; 1132 } UDM_INADDR; 1133 1134 1135 typedef struct 1136 { 1137 size_t nitems; 1138 size_t mitems; 1139 UDM_INADDR *Item; 1140 } UDM_INADDRLIST; 1141 1142 1143 /** Resolve stuff */ 1144 typedef struct udm_host_addr_struct 1145 { 1146 char *hostname; 1147 struct in_addr addr; 1148 int net_errors; 1149 time_t last_used; 1150 time_t expires; 1151 } UDM_HOST_ADDR; 1152 1153 1154 typedef struct 1155 { 1156 size_t nhost_addr; 1157 size_t mhost_addr; 1158 UDM_HOST_ADDR *host_addr; 1159 } UDM_HOSTLIST; 1160 1161 1162 /** Used in FTP sessions */ 1163 typedef struct udm_conn_struct 1164 { 1165 int status; 1166 int connected; 1167 int err; 1168 time_t host_last_used; 1169 int conn_fd; 1170 #ifdef WIN32 1171 unsigned short port; 1172 #else 1173 int port; 1174 #endif 1175 int timeout; 1176 char *hostname; 1177 char *user; 1178 char *pass; 1179 struct sockaddr_in sin; 1180 int buf_len; 1181 size_t buf_len_total; 1182 int len; 1183 char *buf; 1184 int net_errors; 1185 struct udm_conn_struct *connp; 1186 } UDM_CONN; 1187 1188 /***************************************************/ 1189 1190 typedef struct 1191 { 1192 char *buf; /**< Buffer to download document to */ 1193 char *content; /**< Pointer to content, after 
headers */ 1194 size_t size; /**< Number of bytes loaded */ 1195 size_t alloced_size; /**< Maximum bytes to load into buf */ 1196 } UDM_HTTPBUF; 1197 1198 1199 typedef struct 1200 { 1201 int stored; /**< If it is already stored, forAddHref() */ 1202 udm_method_t method; /**< How to download document: GET, HEAD etc */ 1203 1204 UDM_HTTPBUF Buf; /**< Buffer */ 1205 1206 UDM_HREFLIST Hrefs; /**< Link list */ 1207 1208 UDM_VARLIST RequestHeaders; /**< Extra headers*/ 1209 UDM_VARLIST Sections; /**< User sections*/ 1210 1211 UDM_TEXTLIST TextList; /**< Text list */ 1212 UDM_URL CurURL; /**< Parsed URL */ 1213 UDM_CHARSET *lcs; /**< LocalCharser */ 1214 UDM_SPIDERPARAM Spider; /**< Spider prms */ 1215 UDM_CONN connp; /**< For FTP */ 1216 UDM_CONN connp2; /**< For FTP */ 1217 } UDM_DOCUMENT; 1218 1219 /********************************************************/ 1220 1221 /** External Parsers */ 1222 typedef struct udm_parser_struct 1223 { 1224 char *from_mime; 1225 char *to_mime; 1226 char *cmd; 1227 char *src; 1228 } UDM_PARSER; 1229 1230 typedef struct 1231 { 1232 size_t nparsers; 1233 UDM_PARSER *Parser; 1234 } UDM_PARSERLIST; 1235 1236 1237 /******* Ispell BEGIN ********/ 1238 1239 #define UDM_SPELL_NOPREFIX 1 1240 1241 typedef struct udm_spell_st 1242 { 1243 char *word; 1244 char *flags; 1245 } UDM_SPELL; 1246 1247 #define UDM_SPELL_LANGLEN 32 1248 #define UDM_SPELL_CSETLEN 32 1249 #define UDM_SPELL_FILELEN 128 1250 #define UDM_SPELL_FMT_TEXT 0 1251 #define UDM_SPELL_FMT_HASH 1 1252 1253 typedef struct udm_dict_st 1254 { 1255 char lang[UDM_SPELL_LANGLEN]; 1256 char cset[UDM_SPELL_CSETLEN]; 1257 char fname[UDM_SPELL_FILELEN]; 1258 int fmt; 1259 int fd; 1260 size_t wordlen; 1261 UDM_CHARSET *cs; 1262 char *fbody; 1263 size_t nitems; 1264 size_t mitems; 1265 UDM_SPELL *Item; 1266 } UDM_SPELLLIST; 1267 1268 1269 typedef struct udm_spelllistlist_st 1270 { 1271 size_t nitems; 1272 size_t mitems; 1273 UDM_SPELLLIST *Item; 1274 } UDM_SPELLLISTLIST; 1275 1276 1277 typedef 
struct udm_aff_st UDM_AFFIX; /* Defined in spell.c */ 1278 1279 1280 typedef struct udm_afflist_st 1281 { 1282 size_t mitems; 1283 size_t nitems; 1284 char lang[UDM_SPELL_LANGLEN]; 1285 char cset[UDM_SPELL_CSETLEN]; 1286 char fname[UDM_SPELL_FILELEN]; 1287 UDM_CHARSET *cs; 1288 UDM_AFFIX *Item; 1289 } UDM_AFFIXLIST; 1290 1291 1292 typedef struct udm_afflistlist_st 1293 { 1294 size_t mitems; 1295 size_t nitems; 1296 UDM_AFFIXLIST *Item; 1297 } UDM_AFFIXLISTLIST; 1298 1299 1300 /******* Ispell END **********/ 1301 1302 1303 typedef struct 1304 { 1305 udm_method_t method; /**< 'allow' or 'disallow' */ 1306 char *path; 1307 } UDM_ROBOT_RULE; 1308 1309 1310 typedef struct 1311 { 1312 char *hostinfo; 1313 size_t nrules; 1314 UDM_ROBOT_RULE *Rule; 1315 } UDM_ROBOT; 1316 1317 1318 typedef struct 1319 { 1320 size_t nrobots; 1321 UDM_ROBOT *Robot; 1322 } UDM_ROBOTS; 1323 1324 1325 /********************************************************/ 1326 1327 typedef struct 1328 { 1329 size_t order; /*TODO34: change to udm_wordnum_t */ 1330 size_t order_extra_width; /* For multi-word synonyms, see below */ 1331 size_t count; 1332 size_t doccount; /* Number of documents this word appears in */ 1333 udm_wordorigin_t origin; /* query, spell, synonym, etc */ 1334 int weight; /* origin-dependent weight */ 1335 int user_weight; /* User-supplied weight */ 1336 udm_match_mode_t match_mode; /* BEGIN,END,SUBSTR,NUM_LT,NUM_GT,FULL*/ 1337 size_t secno; /* Which section to search in; TODO34: udm_secno_t */ 1338 size_t phrpos; /* 0 means "not in phrase" */ 1339 size_t phrlen; /* phase length */ 1340 size_t phrwidth; /* How many additional parts in a multi-word */ 1341 } UDM_WIDEWORD_PARAM; 1342 1343 1344 typedef struct 1345 { 1346 UDM_STR Word; 1347 UDM_WIDEWORD_PARAM Param; 1348 } UDM_WIDEWORD; 1349 1350 1351 /* 1352 order_extra_width - use in case of many-to-one and many-to-many synonyms. 1353 It represents the number of query words this synonym covers. 
1354 For example, if we have synonym: 1355 "aaaa bbbb" -> "cccc" 1356 then origin_extra_width for the words "cccc" will be 2, because 1357 it is a replacement for two words. 1358 0 means "not a multi-word replacement". 1359 */ 1360 1361 typedef struct 1362 { 1363 udm_match_mode_t match_mode; /* Search mode: wrd, sub, beg, end */ 1364 udm_bool_t strip_noaccents; /* If accent insensitive comparison*/ /* TODO34: change to a flag mask? */ 1365 size_t nuniq; 1366 size_t nwords; 1367 UDM_WIDEWORD *Word; 1368 } UDM_WIDEWORDLIST; 1369 1370 1371 /*****************************************************************/ 1372 1373 typedef struct 1374 { 1375 char *p; 1376 char *s; 1377 udm_wordorigin_t origin; /* SYNONYM or SYNONYM_FINAL */ 1378 } UDM_SYNONYM; 1379 1380 1381 #define UDM_SYNONYM_LANGLEN 32 1382 #define UDM_SYNONYM_CSETLEN 32 1383 #define UDM_SYNONYM_FILELEN 128 1384 #define UDM_SYNONYM_FMT_TEXT 0 1385 #define UDM_SYNONYM_FMT_HASH 1 1386 1387 typedef struct 1388 { 1389 size_t nsynonyms; 1390 size_t msynonyms; 1391 UDM_SYNONYM *Synonym; 1392 char lang[UDM_SYNONYM_LANGLEN]; 1393 char cset[UDM_SYNONYM_CSETLEN]; 1394 char fname[UDM_SYNONYM_FILELEN]; 1395 size_t max_phrase_length; /* Many-to-many, many-to-one, one-to-many */ 1396 } UDM_SYNONYMLIST; 1397 1398 1399 typedef struct 1400 { 1401 size_t nitems; 1402 UDM_SYNONYMLIST *Item; 1403 } UDM_SYNONYMLISTLIST; 1404 1405 1406 typedef struct udm_chinaword_struct 1407 { 1408 int *word; 1409 int freq; 1410 } UDM_CHINAWORD; 1411 1412 1413 typedef struct 1414 { 1415 size_t nwords; 1416 size_t mwords; 1417 size_t total; 1418 UDM_CHINAWORD *ChiWord; 1419 size_t *hash; 1420 } UDM_CHINALIST; 1421 1422 1423 /*************************************************************/ 1424 1425 1426 /*** Boolean search constants and types ****/ 1427 typedef enum 1428 { 1429 UDM_BOOLCMD_LEFT= 0, 1430 UDM_BOOLCMD_RIGHT= 1, 1431 UDM_BOOLCMD_BOT= 2, 1432 UDM_BOOLCMD_OR= 3, 1433 UDM_BOOLCMD_AND= 4, 1434 UDM_BOOLCMD_NOT= 5, 1435 UDM_BOOLCMD_PHRASE= 6, 1436 
UDM_BOOLCMD_WORD= 200, 1437 UDM_BOOLCMD_STOP= 201 1438 } udm_boolcmd_t; 1439 1440 1441 typedef struct 1442 { 1443 size_t ncstack; 1444 size_t mcstack; 1445 udm_boolcmd_t *cstack; 1446 size_t nastack; 1447 size_t mastack; 1448 unsigned long *astack; 1449 } UDM_BOOLSTACK; /* TODO34: split into two lists */ 1450 1451 1452 typedef struct 1453 { 1454 udm_boolcmd_t cmd; 1455 unsigned long arg; 1456 } UDM_BOOLEXPR_ITEM; 1457 1458 1459 typedef struct 1460 { 1461 size_t nitems; 1462 size_t mitems; 1463 size_t ncmds; 1464 UDM_BOOLEXPR_ITEM *items; 1465 } UDM_BOOLEXPR; 1466 1467 1468 /*****************************/ 1469 typedef struct 1470 { 1471 size_t first; 1472 size_t last; 1473 size_t total_found; 1474 } UDM_QUERY_STATS; 1475 1476 1477 typedef struct 1478 { 1479 size_t num_rows; 1480 size_t cur_row; 1481 UDM_DOCUMENT *Doc; 1482 UDM_VARLIST Vars; 1483 UDM_WIDEWORDLIST WWList; 1484 } UDM_RESULT; 1485 1486 1487 typedef struct 1488 { 1489 UDM_QUERY_STATS stats; 1490 UDM_RESULT Res; 1491 UDM_URLDATALIST URLData; 1492 UDM_BOOLEXPR BoolExpr; 1493 UDM_STATLIST StatList; 1494 UDM_SEARCHSECTIONLIST SectionList; 1495 size_t num_best_rows; 1496 char *where; 1497 char *from; 1498 } UDM_QUERY; 1499 1500 1501 struct udm_dbhandler_st; 1502 1503 typedef struct udm_db_st 1504 { 1505 const struct udm_dbhandler_st *dbhandler; 1506 void *specific; 1507 } UDM_DB; 1508 1509 1510 typedef struct 1511 { 1512 size_t nitems; 1513 UDM_DB *Item; 1514 } UDM_DBLIST; 1515 1516 1517 #define UDM_LOG_FLAG_SKIP_PID 0x00000001 1518 1519 typedef struct 1520 { 1521 int is_log_open; /* Flag indicating if openlog() has been called */ 1522 FILE *logFD; /* File descriptor, when logging to stderr or file */ 1523 int facility; /* Which facility to use, or negative number if none */ 1524 int flags; 1525 int level; 1526 } UDM_LOG; 1527 1528 1529 /****** SQLMon **************/ 1530 1531 typedef enum 1532 { 1533 UDM_MSG_DATA= 0, /* Column values */ 1534 UDM_MSG_ERROR= 1, /* Error text */ 1535 UDM_MSG_COMMAND= 2, /* The 
command being executed */ 1536 UDM_MSG_EOL= 3, /* End-of-line (e.g. record has ended) */ 1537 UDM_MSG_PROMPT= 4, /* ">SQL" */ 1538 UDM_MSG_INFO= 5 /* e.g. "Connection changed to #1" */ 1539 } udm_msg_t; 1540 1541 1542 /* Input/Output handler, e.g. for "indexer --sqlmon" */ 1543 typedef struct udm_iohandler_st 1544 { 1545 void *user_data; 1546 char *(*gets)(struct udm_iohandler_st *prm, char *str, size_t size); 1547 udm_rc_t (*prompt)(struct udm_iohandler_st *prm, udm_msg_t msgtype, const char *msg); 1548 } UDM_IOHANDLER; 1549 1550 1551 /** Forward declaration of UDM_AGENT */ 1552 struct udm_indexer_struct_st; 1553 1554 typedef struct 1555 { 1556 void (*Lock)(struct udm_indexer_struct_st *, 1557 udm_mutexcmd_t command, udm_mutexno_t number, 1558 const char *fname, int lineno); 1559 udm_rc_t (*ThreadCreate)(void *thd, void *(*start_routine)(void *), void *arg); 1560 udm_rc_t (*ThreadJoin)(void *thd); 1561 } UDM_THDHANDLER; 1562 1563 #define UDM_GETLOCK(A,mutex) if((A)->Conf->THDHandler.Lock)(A)->Conf->THDHandler.Lock((A),UDM_LOCK,(mutex),__FILE__,__LINE__) 1564 #define UDM_RELEASELOCK(A,mutex) if((A)->Conf->THDHandler.Lock)(A)->Conf->THDHandler.Lock((A),UDM_UNLOCK,(mutex),__FILE__,__LINE__) 1565 #define UDM_LOCK_CHECK_OWNER(A,mutex) if((A)->Conf->THDHandler.Lock)(A)->Conf->THDHandler.Lock((A),UDM_CKLOCK,(mutex),__FILE__,__LINE__) 1566 1567 1568 typedef struct 1569 { 1570 size_t maxlen[256]; 1571 } UDM_SECTIONPARAM; 1572 1573 1574 /** Config file */ 1575 typedef struct udm_config_struct 1576 { 1577 char errstr[UDM_ERRSTR_SIZE]; 1578 UDM_CHARSET *bcs; 1579 UDM_CHARSET *lcs; 1580 UDM_UNIDATA *unidata; 1581 1582 int url_number; /**< For indexer -nXXX */ 1583 1584 UDM_SERVERLIST Servers; /**< List of servers and realms */ 1585 UDM_SERVER *Cfg_Srv; 1586 1587 UDM_REPLACELIST Aliases; /**< Straight aliases */ 1588 UDM_REPLACELIST ReverseAliases; /**< Reverse aliases */ 1589 UDM_REPLACELIST MimeTypes; /**< For AddType commands */ 1590 UDM_REPLACELIST Encodings; /**< For 
AddType commands */ 1591 1592 UDM_FILTERLIST Filters; /**< Allow, Disallow,etc */ 1593 UDM_SECTIONFILTERLIST SectionFilters; /**< IndexIf, NoIndexIf */ 1594 UDM_USERSECTIONLIST SectionHdrMatch; /**< User sections after headers*/ 1595 UDM_USERSECTIONLIST SectionGsrMatch; /**< User sections after quesser*/ 1596 UDM_USERSECTIONLIST SectionMatch; /**< User sections after parser */ 1597 1598 UDM_HREFLIST Hrefs; /**< Links cache */ 1599 UDM_RESULT Targets; /**< Targets cache */ 1600 1601 UDM_SECTIONPARAM SectionParam; /**< section parameters */ 1602 UDM_VARLIST Sections; /**< document section to parse */ 1603 UDM_VARLIST Vars; /**< Config parameters */ 1604 UDM_VARLIST Cookies; /**< Cookie list */ 1605 UDM_VARLIST XMLEnterHooks; 1606 UDM_VARLIST XMLLeaveHooks; 1607 UDM_VARLIST XMLDataHooks; 1608 1609 UDM_LANGMAPLIST LangMaps; /**< For lang+charset quesser */ 1610 UDM_ROBOTS Robots; /**< robots.txt information */ 1611 UDM_SYNONYMLISTLIST Synonym; /**< Synonims list */ 1612 UDM_STOPLISTLIST StopWord; /**< Stopwords list */ 1613 UDM_PARSERLIST Parsers; /**< External parsers */ 1614 UDM_DBLIST DBList; /**< Databases */ 1615 UDM_HOSTLIST Hosts; /**< Resolve cache */ 1616 UDM_INADDRLIST InAddr; /**< IP cache */ 1617 UDM_SPELLLISTLIST Spells; /**< For ispell dictionaries */ 1618 UDM_AFFIXLISTLIST Affixes; /**< For ispell affixes */ 1619 UDM_WORDPARAM WordParam; /**< Word limits */ 1620 UDM_CHINALIST Chi; /**< Chinese words list */ 1621 UDM_CHINALIST Thai; /**< Thai words list */ 1622 1623 UDM_LOG Log; 1624 1625 int CVS_ignore; /**< Skip CVS directgories, for tests */ 1626 1627 /* Various virtual functions */ 1628 UDM_THDHANDLER THDHandler; 1629 void (*ThreadInfo)(struct udm_indexer_struct_st *, const char * state, const char * str); 1630 void (*RefInfo)(int code, const char *url, const char *ref); 1631 int (*DumpDoc)(struct udm_indexer_struct_st *, UDM_DOCUMENT *Doc); 1632 #ifdef MECAB 1633 mecab_t *mecab; 1634 #endif 1635 1636 } UDM_ENV; 1637 1638 1639 #define 
UDM_AGENT_STATE_MUTEXES 5 1640 1641 typedef struct udm_agent_state_t 1642 { 1643 const char *task; 1644 const char *param; 1645 const char *extra; 1646 time_t start_time; 1647 } UDM_AGENT_STATE; 1648 1649 1650 /** Indexer */ 1651 typedef struct udm_indexer_struct_st 1652 { 1653 udm_threadid_t handle; /**< Handler for threaded version */ 1654 time_t start_time; /**< Time of allocation, for stat */ 1655 udm_uint8 nbytes; /**< Number of bytes downloaded */ 1656 size_t ndocs; /**< Number of documents */ 1657 size_t nsleepsecs; /**> Number of sleep seconds */ 1658 int flags; /**< Callback function to request action */ 1659 udm_rc_t action; 1660 int doccount; /**< for UdmGetDocCount() */ 1661 UDM_ENV *Conf; /**< Configuration */ 1662 UDM_LANGMAP *LangMap; /**< LangMap for current document */ 1663 UDM_RESULT Indexed; /**< Indexed cache */ 1664 size_t Indexed_memused; /**< Memory used by Indexed cache */ 1665 1666 UDM_AGENT_STATE State; 1667 1668 #ifdef USE_TRACE 1669 FILE *TR; 1670 #endif 1671 1672 } UDM_AGENT; 1673 1674 1675 typedef struct 1676 { 1677 UDM_AGENT Agent; 1678 UDM_QUERY *Query; 1679 } UDM_CRAWLER; 1680 1681 1682 typedef struct udm_cfg_st 1683 { 1684 UDM_AGENT *Indexer; 1685 UDM_SERVER *Srv; 1686 int flags; 1687 int level; 1688 int ordre; 1689 size_t excerpt_fragments_count; 1690 } UDM_CFG; 1691 1692 1693 typedef struct 1694 { 1695 UDM_DSTR Value; 1696 char *name; 1697 size_t lineno; 1698 } UDM_TEMPLATE_ITEM; 1699 1700 1701 typedef struct 1702 { 1703 size_t nitems; 1704 size_t mitems; 1705 UDM_TEMPLATE_ITEM *Item; 1706 int rc; /* The code returned from the template program, e.g. 
exit() */ 1707 } UDM_TEMPLATE; 1708 1709 1710 #ifdef DMALLOC 1711 #include <dmalloc.h> 1712 #endif 1713 1714 #define UDM_DT_BACK 1 1715 #define UDM_DT_ER 2 1716 #define UDM_DT_RANGE 3 1717 #define UDM_DT_UNKNOWN 4 1718 1719 typedef enum 1720 { 1721 UDM_OPT_BOOL, 1722 UDM_OPT_INT, 1723 UDM_OPT_STR, 1724 UDM_OPT_TITLE 1725 } udm_opttype_t; 1726 1727 1728 typedef struct udm_cmdline_opt_st 1729 { 1730 int id; 1731 const char *name; 1732 int type; 1733 void *value; 1734 const char *comment; 1735 } UDM_CMDLINE_OPT; 1736 1737 1738 typedef enum 1739 { 1740 UDM_IND_AMBIGUOUS= 0, 1741 UDM_IND_UNKNOWN= 1, 1742 UDM_IND_INDEX= 300, 1743 UDM_IND_CRAWL= 301, 1744 UDM_IND_STAT= 'S', 1745 UDM_IND_CREATE= 303, 1746 UDM_IND_DROP= 304, 1747 UDM_IND_DELETE= 'C', 1748 UDM_IND_REFERERS= 'I', 1749 UDM_IND_SQLMON= 'Q', 1750 UDM_IND_CHECKCONF= 308, 1751 UDM_IND_CONVERT= 309, 1752 UDM_IND_MULTI2BLOB= 310, 1753 UDM_IND_EXPORT= 311, 1754 UDM_IND_WRDSTAT= 312, 1755 UDM_IND_REWRITEURL= 313, 1756 UDM_IND_HASHSPELL= 314, 1757 UDM_IND_DUMPSPELL= 315, 1758 UDM_IND_REWRITELIMITS= 316, 1759 UDM_IND_DUMPCONF= 317, 1760 UDM_IND_DUMPDATA= 318, 1761 UDM_IND_RESTOREDATA= 319, 1762 UDM_IND_EXECSQL= 320, 1763 UDM_IND_SET= 321, /* indexer --set=a=b -> a=b */ 1764 UDM_IND_SET0= 322, /* indexer --fl=xxx -> fl=xxx */ 1765 UDM_IND_REWRITEPOP= 'R' 1766 } udm_indcmd_t; 1767 1768 #endif 1769