1 /* Copyright (C) 2000-2015 Lavtech.com corp. All rights reserved.
2 
3    This program is free software; you can redistribute it and/or modify
4    it under the terms of the GNU General Public License as published by
5    the Free Software Foundation; either version 2 of the License, or
6    (at your option) any later version.
7 
8    This program is distributed in the hope that it will be useful,
9    but WITHOUT ANY WARRANTY; without even the implied warranty of
10    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11    GNU General Public License for more details.
12 
13    You should have received a copy of the GNU General Public License
14    along with this program; if not, write to the Free Software
15    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
16 */
17 
18 #ifndef _UDM_COMMON_H
19 #define _UDM_COMMON_H
20 
21 #include "udm_config.h"
22 
23 #include <stddef.h>
24 
25 #include <stdio.h> /* for FILE etc. */
26 
27 #include <sys/types.h>
28 
29 #ifdef HAVE_UNISTD_H
30 #include <unistd.h>
31 #endif
32 #ifdef HAVE_WINSOCK_H
33 #include <winsock.h>
34 #endif
35 #ifdef HAVE_SYS_SOCKET_H
36 #include <sys/socket.h>
37 #endif
38 #ifdef HAVE_NETINET_IN_H
39 #include <netinet/in.h>
40 #endif
41 #ifdef HAVE_ARPA_INET_H
42 #include <arpa/inet.h>
43 #endif
44 #ifdef HAVE_ARPA_NAMESER_H
45 #include <arpa/nameser.h>
46 #endif
47 #ifdef HAVE_RESOLV_H
48 #include <resolv.h>
49 #endif
50 #ifdef HAVE_NETDB_H
51 #include <netdb.h>
52 #endif
53 #ifdef MECAB
54 #include <mecab.h>
55 #endif
56 
57 #ifdef HAVE_DEBUG
58 #include <assert.h>
59 #endif
60 
/************************ Basic data types ****************/

/*
  Result code type.
  UDM_OK is zero; every other code is non-zero.
  (Older disabled variants of this type, kept under "#if 0",
  were removed: only the enum below was ever compiled.)
*/
typedef enum
{
  UDM_OK= 0,
  UDM_ERROR= 1,
  UDM_NOTARGET= 2,
  UDM_TERMINATED= 3
} udm_rc_t;
87 
/* Boolean type: UDM_FALSE is 0, UDM_TRUE is 1 */
typedef enum
{
  UDM_FALSE = 0,
  UDM_TRUE  = 1
} udm_bool_t;


/* Map any scalar expression to UDM_TRUE (non-zero) or UDM_FALSE (zero) */
#define UDM_TEST(x) (!(x) ? UDM_FALSE : UDM_TRUE)
96 
/*
  Larger / smaller of two values.
  Both are macros: each argument can be evaluated twice,
  so do not pass expressions with side effects.
  An existing definition of either name is left untouched.
*/
#if !defined(udm_max)
#define udm_max(a,b) ((a) > (b) ? (a) : (b))
#endif
#if !defined(udm_min)
#define udm_min(a,b) ((a) < (b) ? (a) : (b))
#endif
103 
104 typedef int (*udm_qsort_cmp)(const void*, const void*);
105 
106 typedef uint4 udm_pos_t;
107 typedef unsigned char udm_secno_t;
108 typedef unsigned char udm_wordnum_t;
109 
110 /**********************************************************/
111 
112 #include "udm_unicode.h"
113 #include "udm_uniconv.h"
114 #include "udm_unidata.h"
115 #include "udm_hash.h"
116 
/**************************** Constant strings *******************************/
/* Read-only string: pointer to the bytes plus an explicit length */
typedef struct udm_const_string_st
{
  const char *str;
  size_t      length;
} UDM_CONST_STR;

/************************* Constant attribute - name with value *************/
/* A pair of constant strings: attribute name and attribute value */
typedef struct udm_const_attr_st
{
  UDM_CONST_STR name;
  UDM_CONST_STR value;
} UDM_CONST_ATTR;
130 
/**************************** Variable strings *******************************/
/* Writable string: pointer to the bytes plus an explicit length */
typedef struct udm_str_st
{
  char *str;
  size_t length;
} UDM_STR;

/************************ Dynamic strings ************************************/
/*
  Growable string buffer.
  The member Val used to be declared inside "#ifdef DSTR_PRIVATE / #else"
  with two identical branches; it is now declared once.
  The layout is the same whether or not DSTR_PRIVATE is defined.
*/
typedef struct dstr_struct
{
  size_t size_alloced; /* Bytes allocated */
  size_t size_page;    /* Bytes to allocate on overflow */
  UDM_STR Val;         /* Value (usually null-terminated string) and its length */
} UDM_DSTR;
149 
150 #include "udm_utils.h"
151 
152 
/* Some constants */
#define UDM_LANGPERDOC				16		/* FIXME */
#define UDM_USER_AGENT				"MnoGoSearch/" VERSION
#define UDM_MAXWORDPERQUERY			64

/*
  Some sizes and others definitions.
  Values written as products are parenthesized, so that the macro
  expands to a single operand in any expression, e.g.
  "x % UDM_MAXDOCSIZE" or "x / UDM_DEFAULT_REINDEX_TIME".
*/
#define UDM_MAXDOCSIZE				(2*1024*1024)	/**< 2 MB  */
#define UDM_DEFAULT_REINDEX_TIME		(7*24*60*60)	/**< 1week */
#define UDM_MAXWORDSIZE				32
#define UDM_MAXDISCWORDSIZE			64
#define UDM_DEFAULT_MAX_HOPS			256
#define UDM_READ_TIMEOUT			30
#define UDM_DOC_TIMEOUT				90
#define UDM_MAXNETERRORS			16
#define UDM_DEFAULT_NET_ERROR_DELAY_TIME	86400
#define UDM_DEFAULT_BAD_SINCE_TIME              (15*24*60*60)   /**< 15 days */
#define UDM_FINDURL_CACHE_SIZE                  128
#define UDM_SERVERID_CACHE_SIZE                 128
#define UDM_ERRSTR_SIZE                         2048
172 
/* Search modes (values 0..5) */
typedef enum
{
  UDM_MODE_ALL            = 0,
  UDM_MODE_ANY            = 1,
  UDM_MODE_BOOL           = 2,
  UDM_MODE_PHRASE         = 3,
  UDM_MODE_ALL_MINUS      = 4,
  UDM_MODE_ALL_MINUS_HALF = 5
} udm_search_mode_t;
183 
184 
/* Match type for words and patterns (values 0..9) */
typedef enum
{
  UDM_MATCH_FULL       = 0,
  UDM_MATCH_BEGIN      = 1,
  UDM_MATCH_SUBSTR     = 2,
  UDM_MATCH_END        = 3,
  UDM_MATCH_REGEX      = 4,
  UDM_MATCH_WILD       = 5,
  UDM_MATCH_SUBNET     = 6,
  UDM_MATCH_NUMERIC_LT = 7,
  UDM_MATCH_NUMERIC_GT = 8,
  UDM_MATCH_RANGE      = 9
} udm_match_mode_t;
199 
200 
/* Case sensitivity */
#define UDM_CASE_SENSITIVE       0
#define UDM_CASE_INSENSITIVE     1

/* Flags for indexing: every value is a distinct single bit */
#define UDM_FLAG_REINDEX         0x001
#define UDM_FLAG_SORT_EXPIRED    0x002
#define UDM_FLAG_SORT_HOPS       0x004
#define UDM_FLAG_ADD_SERV        0x008
#define UDM_FLAG_SPELL           0x010
#define UDM_FLAG_LOAD_LANGMAP    0x020
#define UDM_FLAG_DONTSORT_SEED   0x040
#define UDM_FLAG_ADD_SERVURL     0x080
#define UDM_FLAG_DONT_ADD_TO_DB  0x100
215 
216 
/* URLFile actions (numbering starts at 1) */
typedef enum
{
  UDM_URL_FILE_REINDEX = 1,
  UDM_URL_FILE_CLEAR   = 2,
  UDM_URL_FILE_INSERT  = 3,
  UDM_URL_FILE_PARSE   = 4
} udm_urlfilecmd_t;
225 
/* Ispell mode flags: binary, one bit each */
#define UDM_ISPELL_MODE_DB       0x1
#define UDM_ISPELL_USE_PREFIXES  0x2
#define UDM_ISPELL_MODE_SERVER   0x4
230 
231 
/* Known content types; 0 stands for an unknown type */
typedef enum
{
  UDM_CONTENT_TYPE_UNKNOWN        = 0,
  UDM_CONTENT_TYPE_TEXT_PLAIN     = 1,
  UDM_CONTENT_TYPE_TEXT_HTML      = 2,
  UDM_CONTENT_TYPE_TEXT_XML       = 3,
  UDM_CONTENT_TYPE_MESSAGE_RFC822 = 4,
  UDM_CONTENT_TYPE_AUDIO_MPEG     = 5,
  UDM_CONTENT_TYPE_HTDB           = 6,
  UDM_CONTENT_TYPE_DOCX           = 7,
  UDM_CONTENT_TYPE_TEXT_RTF       = 8
} udm_content_type_t;
245 
246 
/* Action type: HTTP methods (values 0..10) */
typedef enum
{
  UDM_METHOD_GET          = 0,
  UDM_METHOD_UNKNOWN      = 1,
  UDM_METHOD_DISALLOW     = 2,
  UDM_METHOD_HEAD         = 3,
  UDM_METHOD_HREFONLY     = 4,
  UDM_METHOD_CHECKMP3     = 5,
  UDM_METHOD_CHECKMP3ONLY = 6,
  UDM_METHOD_VISITLATER   = 7,
  UDM_METHOD_INDEX        = 8,
  UDM_METHOD_NOINDEX      = 9,
  UDM_METHOD_IMPORTONLY   = 10
} udm_method_t;

/* The default method is GET */
#define UDM_METHOD_DEFAULT      UDM_METHOD_GET
264 
/* Link sources (values 0..16); 0 stands for an unknown source */
typedef enum
{
  UDM_LINK_SOURCE_UNKNOWN        = 0,
  UDM_LINK_SOURCE_CMDLINE        = 1,
  UDM_LINK_SOURCE_CONF           = 2,
  UDM_LINK_SOURCE_A_HREF         = 3,
  UDM_LINK_SOURCE_FRAME_SRC      = 4,
  UDM_LINK_SOURCE_IMG_SRC        = 5,
  UDM_LINK_SOURCE_REDIRECT       = 6,
  UDM_LINK_SOURCE_META_REFRESH   = 7,
  UDM_LINK_SOURCE_LINK_HREF      = 8,
  UDM_LINK_SOURCE_AREA_HREF      = 9,
  UDM_LINK_SOURCE_IFRAME_SRC     = 10,
  UDM_LINK_SOURCE_SCRIPT_SRC     = 11,
  UDM_LINK_SOURCE_HTDB           = 12,
  UDM_LINK_SOURCE_URLFILE        = 13,
  UDM_LINK_SOURCE_ROBOTS_SITEMAP = 14,
  UDM_LINK_SOURCE_XML            = 15,
  UDM_LINK_SOURCE_DIR            = 16  /* Directory listing: file:///dir/ */
} udm_link_source_t;
/* Mask with the 31 low bits set */
#define UDM_LINK_SOURCES_YES 0x7FFFFFFF
286 
287 
/* CollectLinks flags: one bit per link destination kind */
typedef enum
{
  UDM_COLLECT_LINKS_NONE      = 0,       /* do not collect links           */
  UDM_COLLECT_LINKS_INNER     = 1 << 0,  /* links covered by .conf file    */
  UDM_COLLECT_LINKS_OUTER     = 1 << 1,  /* links going outside .conf file */
  UDM_COLLECT_LINKS_SITE      = 1 << 2,  /* links going to the same site   */
  UDM_COLLECT_LINKS_PAGE      = 1 << 3,  /* links going to the same page   */
  UDM_COLLECT_LINKS_BADSCHEME = 1 << 4,  /* Scheme: mailto, javascript     */
  UDM_COLLECT_LINKS_BAD       = 1 << 5,  /* Bad syntax                     */
  UDM_COLLECT_LINKS_HOPS      = 1 << 6,  /* Too many hops                  */
  UDM_COLLECT_LINKS_FILTER    = 1 << 7,  /* presumably: rejected by a filter - TODO confirm */
  UDM_COLLECT_LINKS_PERSITE   = 1 << 8,  /* Per site limit reached         */
  UDM_COLLECT_LINKS_ALL_DST   = 0xFFFF   /* all 16 low bits set            */
} udm_link_destination_t;


/*
  The "YES" set leaves out these three, not needed by default:
  UDM_COLLECT_LINKS_PAGE
  UDM_COLLECT_LINKS_BADSCHEME
  UDM_COLLECT_LINKS_BAD
*/
#define UDM_COLLECT_LINKS_YES \
  (UDM_COLLECT_LINKS_INNER   | UDM_COLLECT_LINKS_OUTER  |\
   UDM_COLLECT_LINKS_SITE    | UDM_COLLECT_LINKS_HOPS   |\
   UDM_COLLECT_LINKS_FILTER  | UDM_COLLECT_LINKS_PERSITE)
#define UDM_COLLECT_LINKS_DEFAULT UDM_COLLECT_LINKS_YES
319 
/* Robots flags: one bit per source of robots instructions */
typedef enum
{
  UDM_ROBOTS_NONE   = 0,       /* Don't respect robots limitation           */
  UDM_ROBOTS_TXT    = 1 << 0,  /* Respect robots.txt instructions           */
  UDM_ROBOTS_HEADER = 1 << 1,  /* Respect "X-Robots-Tag" HTTP header        */
  UDM_ROBOTS_META   = 1 << 2,  /* Respect <meta robots="xxx"> instructions  */
  UDM_ROBOTS_REL    = 1 << 3,  /* Respect <link rel="nofollow"> instructions */
  UDM_ROBOTS_ALL    = 0xFFFF   /* Respect all robots instructions           */
} udm_use_robots_t;
330 
331 
/* Link storage format; the values occupy bits 16..23 */
typedef enum
{
  UDM_COLLECT_LINKS_ABSOLUTE = 0x000000, /* Store in absolute format   */
  UDM_COLLECT_LINKS_ASIS     = 0x010000, /* Preserve the original form */
  UDM_COLLECT_LINKS_FORMAT   = 0xFF0000  /* All eight format bits set  */
} udm_link_format_t;
338 
339 
/* Content encoding types (note: the value 2 is not assigned) */
typedef enum
{
  UDM_CONTENT_ENCODING_IDENTITY      = 0,
  UDM_CONTENT_ENCODING_DEFLATE       = 1,
  UDM_CONTENT_ENCODING_GZIP          = 3,
  UDM_CONTENT_ENCODING_ZLIB_COMPRESS = 4,
  UDM_CONTENT_ENCODING_UNKNOWN       = 5
} udm_content_encoding_t;
349 
350 
/* Word origins (numbering starts at 1) */
typedef enum
{
  UDM_WORD_ORIGIN_QUERY         = 1,
  UDM_WORD_ORIGIN_SPELL         = 2,
  UDM_WORD_ORIGIN_SYNONYM       = 3,
  UDM_WORD_ORIGIN_SYNONYM_FINAL = 4,
  UDM_WORD_ORIGIN_STOP          = 5,
  UDM_WORD_ORIGIN_SUGGEST       = 6,
  UDM_WORD_ORIGIN_COLLATION     = 7
} udm_wordorigin_t;
362 
363 
/* URL data flags: every value is a distinct single bit */
#define UDM_URLDATA_URL        0x01
#define UDM_URLDATA_SITE       0x02
#define UDM_URLDATA_POP        0x04
#define UDM_URLDATA_LM         0x08
#define UDM_URLDATA_SU         0x10
#define UDM_URLDATA_SITE_RANK  0x20
371 
/* Locking mutex numbers: fixed slots 0..14 */
#define UDM_LOCK_INTERNAL      0
#define UDM_LOCK_CONF          1
#define UDM_LOCK_WIN           2
#define UDM_LOCK_THREAD        3
#define UDM_LOCK_SQL           4
#define UDM_LOCK_SEGMENTER     5
#define UDM_LOCK_DB            6
#define UDM_LOCK_LOG           7
#define UDM_LOCK_HREF_CACHE    8
#define UDM_LOCK_TARGETS       9
#define UDM_LOCK_HOST_CACHE    10
#define UDM_LOCK_INADDR_CACHE  11
#define UDM_LOCK_DOC_CACHE     12
#define UDM_LOCK_ROBOT_CACHE   13
#define UDM_LOCK_COOKIE_CACHE  14
/* Don't forget to add a new name into mutex.c */

/* Range 15..128 (inclusive on both ends) */
#define UDM_LOCK_ROBOT_FIRST   15
#define UDM_LOCK_ROBOT_LAST    128
#define UDM_LOCK_ROBOT_COUNT   (UDM_LOCK_ROBOT_LAST - UDM_LOCK_ROBOT_FIRST +1)

/* Range 129..255 (inclusive on both ends) */
#define UDM_LOCK_HOST_FIRST    129
#define UDM_LOCK_HOST_LAST     255
#define UDM_LOCK_HOST_COUNT    (UDM_LOCK_HOST_LAST - UDM_LOCK_HOST_FIRST + 1)

/* One past the highest mutex number */
#define UDM_LOCK_MAX           (UDM_LOCK_HOST_LAST + 1)
396 
/* Thread identifier and mutex number, both plain unsigned integers */
typedef unsigned int udm_threadid_t;
typedef unsigned int udm_mutexno_t;

/* Mutex commands; implicitly numbered 0, 1, 2 */
typedef enum
{
  UDM_LOCK,
  UDM_UNLOCK,
  UDM_CKLOCK
} udm_mutexcmd_t;
406 
/************************ Statistics **********************/
/* One statistics record: three integer fields */
typedef struct stat_struct
{
  int status;
  int expired;
  int total;
} UDM_STAT;

/* Array of nstats statistics records together with a time stamp */
typedef struct stat_list_struct
{
  time_t    time;
  size_t    nstats;
  UDM_STAT *Stat;
} UDM_STATLIST;
421 
/************************ VARLISTs ************************/

/* Various variable flags, one bit each (bit 0x04 is not assigned) */
typedef enum
{
  UDM_VARFLAG_NONE       = 0x000,
  UDM_VARFLAG_NOCLONE    = 0x001,  /* Ignore for clone detection        */
  UDM_VARFLAG_USERDEF    = 0x002,  /* User defined section              */
  UDM_VARFLAG_HTMLSOURCE = 0x008,  /* If apply HTML parser, for HTDB    */
  UDM_VARFLAG_WIKI       = 0x010,  /* If to remove text between [ and ] */
  UDM_VARFLAG_HL         = 0x020,  /* If variable has highlight markers */
  UDM_VARFLAG_NOINDEX    = 0x040,  /* If section should be in bdicti but not in bdict */
  UDM_VARFLAG_DECIMAL    = 0x080,  /* Whether to detect decimal numbers */
  UDM_VARFLAG_RAW        = 0x100,  /* If section is raw (no cs conversion) */
  UDM_VARFLAG_READONLY   = 0x200,  /* If read only (e.g. full) */
  UDM_VARFLAG_FORCETXT   = 0x400   /* $(var) forces TXT rather than HTML output */
} udm_var_flag_t;
439 
440 
/* Value handler types: 1..4, then 16 and 18..21 (17 is not assigned) */
typedef enum
{
  UDM_VALUE_HANDLER_TYPE_STR              = 1,
  UDM_VALUE_HANDLER_TYPE_INT              = 2,
  UDM_VALUE_HANDLER_TYPE_DOUBLE           = 3,
  UDM_VALUE_HANDLER_TYPE_CHAR             = 4,
  UDM_VALUE_HANDLER_TYPE_ENV              = 16,
  UDM_VALUE_HANDLER_TYPE_RESULT           = 18,
  UDM_VALUE_HANDLER_TYPE_DOCUMENT         = 19,
  UDM_VALUE_HANDLER_TYPE_SQLRESULT        = 20,
  UDM_VALUE_HANDLER_TYPE_EXCERPT_FRAGMENT = 21
} udm_value_handler_type_t;
454 
/*
  Value handler data types.
  The numbering is the same as in the value handler type enum.
  NOTE(review): the original remark read "Handlers of different type
  can have the data type" - presumably "the same data type"; confirm.
*/
typedef enum
{
  UDM_VALUE_DATA_TYPE_STR              = 1,
  UDM_VALUE_DATA_TYPE_INT              = 2,
  UDM_VALUE_DATA_TYPE_DOUBLE           = 3,
  UDM_VALUE_DATA_TYPE_CHAR             = 4,
  UDM_VALUE_DATA_TYPE_ENV              = 16,
  UDM_VALUE_DATA_TYPE_RESULT           = 18,
  UDM_VALUE_DATA_TYPE_DOCUMENT         = 19,
  UDM_VALUE_DATA_TYPE_SQLRESULT        = 20,
  UDM_VALUE_DATA_TYPE_EXCERPT_FRAGMENT = 21
} udm_value_data_type_t;
471 
472 
473 
/*
  Special section numbers marking the source of a variable.
  They must differ from every secno value used
  in message_header_param[] in message.c.
  TODO34: get rid of this. Don't put QSTRING and ENV
  values into ENV, e.g. use UDM_QUERY instead.
*/
#define UDM_VARSRC_QSTRING  255
#define UDM_VARSRC_ENV      254
482 
483 
484 typedef struct
485 {
486   udm_secno_t        secno;    /**< Number 0..255   */
487   udm_var_flag_t     flags;
488 } UDM_SECTION_PARAM;
489 
490 
491 typedef struct
492 {
493   UDM_DSTR          Value;    /**< Value with length  */
494   UDM_SECTION_PARAM Param;  /**< Parameters         */
495 } UDM_SECTION;
496 
497 
/* Forward declarations of structures that are referenced by pointer */
struct udm_value_st;
struct udm_var_st;
struct udm_varlist_st;
struct udm_value_handler_st;
struct udm_prog_executor_state_st;
struct udm_prog_executor_st;

/* Value prototype: holds only a pointer to a value handler */
typedef struct udm_value_prototype_st
{
  const struct udm_value_handler_st *handler;
} UDM_VALUE_PROTOTYPE;


/* Function prototype: result prototype plus nargs argument prototypes */
typedef struct udm_function_prototype_st
{
  UDM_VALUE_PROTOTYPE        result;
  size_t                     nargs;
  const UDM_VALUE_PROTOTYPE *args;
} UDM_FUNCTION_PROTOTYPE;


/* Runtime entry point: receives the executor state, returns nothing */
typedef void (*udm_func_runtime_t)(struct udm_prog_executor_state_st *state);
520 
521 typedef struct
522 {
523   const UDM_CONST_STR name;
524   UDM_FUNCTION_PROTOTYPE prototype;
525   udm_func_runtime_t func;
526 } UDM_FUNCTION;
527 
528 
529 typedef struct udm_value_handler_st
530 {
531   udm_value_handler_type_t type;
532   udm_value_data_type_t datatype;
533   udm_value_data_type_t native_reg_type;
534   const char *type_name;
535   udm_rc_t (*Constructor)(char *Data,
536                           const struct udm_value_st **args, size_t nargs);
537   void (*Destructor)(char *Data);
538 
539   udm_rc_t (*Copy)(char *To, const char *From);
540 
541   size_t (*VarSize)(void);
542   size_t (*ValueSize)(void);
543   size_t (*DataOffset)(void);
544   size_t (*DataSize)(void);
545   size_t (*DataAlignment)(void);
546   udm_rc_t (*Dump)(const char *Data, const char *name, FILE *f);
547   udm_var_flag_t (*Flags)(const char *Data);
548   udm_secno_t (*Secno)(const char *Data);
549   size_t (*MemUsed)(const char *Data);
550 
551   void (*GetConstStr)(const char *D, UDM_CONST_STR *Val);
552   void (*GetBool)(const char *D, udm_bool_t *val);
553   void (*GetInt)(const char *D, int *val);
554   void (*GetUInt)(const char *D, unsigned int *val);
555   void (*GetDouble)(const char *D, double *val);
556   udm_rc_t (*SetInt)(char *D, int value);
557   udm_rc_t (*SetDouble)(char *D, double value);
558   udm_rc_t (*SetStrn)(char *D, const char *str, size_t length);
559 
560   udm_rc_t (*PrintToFile)(const char *D, FILE *f);
561   udm_rc_t (*PrintToDSTR)(const char *D, UDM_DSTR *dstr);
562 
563   udm_rc_t (*ConvertCharset)(char *Data, UDM_CONV *conv, int flags);
564   /* Set new value but don't touch the other members (e.g. section param) */
565   udm_rc_t (*SetConv)(char *Data, UDM_CONV *cnv, int cnvflag, const char *src, size_t length);
566   udm_rc_t (*AppendStrn)(char *Data, const char *str, size_t length);
567   udm_rc_t (*AppendConv)(char *Data, size_t maxlen, UDM_CONV *cnv, int cnvflags, const char *src, size_t length);
568 
569   const UDM_FUNCTION *method;
570 } UDM_VALUE_HANDLER;
571 
572 
/* Value: holds only a pointer to its handler */
typedef struct udm_value_st
{
  const struct udm_value_handler_st *handler; /**< Value handler  */
} UDM_VALUE;


/* Variable header */
typedef struct udm_var_header_st
{
  char *name;  /**< Variable name */
} UDM_VAR_HEADER;

/*
  Variable: header followed by the value.
  The value member is called "Value" only when UDM_VAR_H_PRIVATE
  is defined; otherwise it is called "m_hidden_Value".
*/
typedef struct udm_var_st
{
  UDM_VAR_HEADER header;
#if defined(UDM_VAR_H_PRIVATE)
  UDM_VALUE      Value;
#else
  UDM_VALUE      m_hidden_Value;
#endif
} UDM_VAR;
593 
594 
595 typedef enum
596 {
597   UDM_VARLIST_FLAG_NONE= 0,
598   UDM_VARLIST_FLAG_CS= 1  /* Case sensitive */
599 } udm_varlist_flag_t;
600 
601 
602 typedef struct udm_varlist_st
603 {
604   size_t   nvars; /* Number of registered variables */
605   size_t   mvars; /* Number of allocated variables  */
606   UDM_VAR  **Var;
607   udm_varlist_flag_t flags;
608 } UDM_VARLIST;
609 
610 
611 typedef struct
612 {
613   size_t nitems;
614   size_t mitems;
615   UDM_VARLIST *Item;
616 } UDM_VARLISTLIST;
617 
618 
/* Text list flags, one bit each */
typedef enum
{
  UDM_TEXTLIST_FLAG_NONE             = 0,
  UDM_TEXTLIST_FLAG_SKIP_ADD_SECTION = 1 << 0,
  UDM_TEXTLIST_FLAG_RFC1522          = 1 << 1, /* Message header (Subj, From) */
  UDM_TEXTLIST_FLAG_MESSAGE_RFC822   = 1 << 2, /* Used by cached copy for messages */
  UDM_TEXTLIST_FLAG_HTML             = 1 << 3  /* HTML format with entities */
} udm_textlist_flag_t;
627 
628 
629 typedef struct
630 {
631   udm_secno_t secno;
632   udm_textlist_flag_t flags;
633 } UDM_TEXT_PARAM;
634 
635 
636 typedef struct
637 {
638   UDM_CONST_STR text;
639   UDM_CONST_STR href;
640   UDM_CONST_STR section_name;
641 } UDM_CONST_TEXTITEM;
642 
643 
644 typedef struct
645 {
646   char  *str;
647   char  *href;
648   char  *section_name;
649   UDM_TEXT_PARAM Param;
650 } UDM_TEXTITEM;
651 
652 
653 typedef struct
654 {
655   size_t        nitems;
656   size_t        mitems;
657   UDM_TEXTITEM	*Item;
658 } UDM_TEXTLIST;
659 
660 /*****************************************************/
661 
/** StopList unit: a single word */
typedef struct udm_stopword_struct
{
  char *word;
} UDM_STOPWORD;

/* Sizes of the fixed character buffers in UDM_STOPLIST */
#define UDM_STOPLIST_LANGLEN  32
#define UDM_STOPLIST_CSETLEN  32
#define UDM_STOPLIST_FILELEN  128

/** Stop word array with lang, cset and fname buffers */
typedef struct
{
  size_t         nstopwords;
  UDM_STOPWORD  *StopWord;
  char           lang[UDM_STOPLIST_LANGLEN];
  char           cset[UDM_STOPLIST_CSETLEN];
  char           fname[UDM_STOPLIST_FILELEN];
} UDM_STOPLIST;

/** List of stop lists */
typedef struct
{
  size_t         nitems;
  UDM_STOPLIST  *Item;
} UDM_STOPLISTLIST;
686 
687 /*****************************************************/
688 
/** Word parameters: minimum and maximum word length */
typedef struct
{
  size_t  min_word_len;
  size_t  max_word_len;
} UDM_WORDPARAM;
695 
696 
697 /**************************/
698 typedef struct udm_coord_st
699 {
700   udm_pos_t pos:24;   /* 3 */
701   udm_secno_t secno;  /* 1 */
702 } UDM_COORD;
703 
704 typedef struct udm_urlid_coord_st
705 {
706   urlid_t url_id;     /* 4 */
707   UDM_COORD coord;    /* 4 */
708 } UDM_URLID_COORD;
709 
710 typedef struct
711 {
712   UDM_URLID_COORD urlid_coord;
713   udm_pos_t     seclen:24; /* 3 */  /*TODO34: get rid of this*/
714   udm_wordnum_t num;       /* 1 */  /*TODO34: get rid of this*/
715 } UDM_URL_CRD;             /* 12 bytes total */
716 
717 
718 typedef struct
719 {
720   size_t       acoords;
721   size_t       ncoords;
722   size_t       order;
723   char	       *word;
724   UDM_URL_CRD  *Coords;
725 } UDM_URLCRDLIST;
726 
727 
728 /***************************/
729 typedef struct udm_coord2_st
730 {
731   udm_pos_t pos:24;
732   udm_wordnum_t order;
733 } UDM_COORD2;
734 
735 
736 typedef struct udm_searchsection_st
737 {
738   UDM_COORD2 *Coord;                 /* 4/8 bytes */
739   const unsigned char *PackedCoord;  /* 4/8 bytes */
740   urlid_t url_id;                    /* 4   bytes */
741   udm_pos_t ncoords;                 /* 4   bytes */
742   udm_pos_t seclen;                  /* 4   bytes */
743   udm_pos_t minpos;                  /* 4   bytes */
744   udm_pos_t maxpos;                  /* 4   bytes */
745   udm_secno_t secno;                 /* 1   byte  */
746   udm_wordnum_t wordnum;             /* 1   byte  */
747   udm_wordnum_t order;               /* 1   byte  */
748 } UDM_SEARCHSECTION;                 /* 32 bytes (i386), 40 bytes (64bit) */
749 
750 
751 typedef struct udm_searchsectionlist_st
752 {
753   size_t mcoords;
754   size_t ncoords;
755   UDM_COORD2 *Coord;
756   size_t msections;
757   size_t nsections;
758   UDM_SEARCHSECTION *Section;
759 } UDM_SEARCHSECTIONLIST;
760 
761 
762 typedef struct udm_searchsectionlistlist_st
763 {
764   size_t nitems;
765   size_t mitems;
766   UDM_SEARCHSECTIONLIST *Item;
767 } UDM_SEARCHSECTIONLISTLIST;
768 
769 
770 
771 /** Main search structure */
772 typedef struct {
773   urlid_t url_id;
774   uint4   score;
775 } UDM_URL_SCORE;
776 
777 typedef struct {
778   size_t nitems;
779   UDM_URL_SCORE	*Item;
780 } UDM_URLSCORELIST;
781 
782 
783 /* UserScore and UserSiteScore structure */
784 
785 typedef struct udm_url_int4_st
786 {
787   urlid_t url_id;
788   int4  param;
789 } UDM_URL_INT4;
790 
791 typedef struct udm_url_int4_list_st
792 {
793   size_t nitems;
794   UDM_URL_INT4 *Item;
795 } UDM_URL_INT4_LIST;
796 
797 
798 
799 /* Structure to handle limits */
800 typedef struct udm_urlid_list_st
801 {
802   char empty;
803   char exclude;
804   urlid_t *urls;
805   size_t nurls;
806 } UDM_URLID_LIST;
807 
808 
809 typedef struct
810 {
811   urlid_t   url_id;
812   uint4     score;
813   uint4     per_site;
814   urlid_t   site_id;
815   time_t    last_mod_time;
816   double    pop_rank;
817   char      *url;
818   char      *section;
819 } UDM_URLDATA;
820 
/* 255 minus the low byte of score; the result is an int in 0..255 */
#define UDM_COORD2DBNUM(score) (255 - (int) (0xFF & (score)))
822 
823 typedef struct
824 {
825   size_t       nitems;
826   UDM_URLDATA  *Item;
827 } UDM_URLDATALIST;
828 
829 
830 /** Word list unit */
831 typedef struct
832 {
833   char		*word;
834   UDM_COORD     coord;
835   unsigned char hash;
836   unsigned char seclen_marker;
837 } UDM_WORD;
838 
839 typedef struct
840 {
841   size_t    wordpos[256]; /**< Word positions in sections               */
842   size_t    mwords;       /**< Number of memory allocated for words     */
843   size_t    nwords;       /**< Real number of words in list             */
844   UDM_WORD *Word;         /**< Word list  itself                        */
845 } UDM_WORDLIST;
846 
847 
848 typedef struct
849 {
850   size_t nitems;
851   UDM_WORDLIST Item[256];
852 } UDM_WORDLISTLIST;
853 
854 
855 typedef struct
856 {
857   const char *str;      /* 4 */
858   udmcrc32_t crc;       /* 4 */
859   UDM_COORD coord;      /* 4 */
860   unsigned char length; /* 1 */
861 } UDM_CONSTWORD;
862 
863 
864 typedef struct
865 {
866   size_t nitems;
867   size_t mitems;
868   UDM_CONSTWORD *Item;
869   udm_pos_t wordpos[256];
870 } UDM_CONSTWORDLIST;
871 
872 /***************************************************************/
873 
/* Server/Realm follow types; the default is UDM_WEBSPACE_PATH */
typedef enum
{
  UDM_WEBSPACE_PAGE    = 0,
  UDM_WEBSPACE_PATH    = 1,
  UDM_WEBSPACE_SITE    = 2,
  UDM_WEBSPACE_WORLD   = 3,
  UDM_WEBSPACE_URLLIST = 4,
  UDM_WEBSPACE_UNKNOWN = 127
} udm_webspace_t;
#define UDM_WEBSPACE_DEFAULT  UDM_WEBSPACE_PATH
885 
886 typedef struct
887 {
888   udm_bool_t index;   /**< Whether to index words         */
889   udm_bool_t follow;  /**< Whether follow links           */
890   udm_bool_t archive; /**< Whether to store cached copies */
891 } UDM_ROBOTSPARAM;
892 
893 
894 typedef struct
895 {
896   int max_net_errors;
897   int net_error_delay_time;
898   int read_timeout;
899   int doc_timeout;
900   int period;                   /**< Reindex period           */
901   int maxhops;                  /**< Max way in mouse clicks  */
902   int doc_per_site;
903   int crawl_delay;
904   int dns_cache_timeout;
905   int link_sources_to_follow;
906   udm_link_destination_t collect_links_destination;
907   udm_link_format_t      collect_links_format;
908   udm_webspace_t webspace;      /**< World, Site, Path, Page  */
909   udm_use_robots_t use_robots;  /**< Whether to use robots.txt and meta tags */
910   UDM_ROBOTSPARAM robots;
911   udm_bool_t use_clones;        /**< Whether to detect clones */
912   udm_bool_t ajax_links;    /**< Whether to detect links with '#!' */
913 } UDM_SPIDERPARAM;
914 
915 /*****************************************************************/
916 
/* Match flags, one bit each */
typedef enum
{
  UDM_MATCH_FLAG_NONE              = 0,
  UDM_MATCH_FLAG_SKIP_OPTIMIZATION = 1 << 0,
  UDM_MATCH_FLAG_CASE_INSENSITIVE  = 1 << 1,
  UDM_MATCH_FLAG_NEGATIVE          = 1 << 2
} udm_matchflag_t;


/* Wildcard parameters: three int fields */
typedef struct
{
  int  one;
  int  many;
  int  eol;
} UDM_WILD_PARAM;
932 
933 
934 typedef struct
935 {
936   udm_match_mode_t match_mode;
937   udm_matchflag_t flags;      /* optimization, case sensitivity, negative */
938 } UDM_MATCH_PARAM;
939 
940 typedef struct
941 {
942   UDM_MATCH_PARAM Param;
943 #ifdef UDM_MATCH_PRIV
944   UDM_STR Pattern;
945 #else
946   UDM_STR m_hidden_Pattern;
947 #endif
948   void *specific;
949 } UDM_MATCH;
950 
951 
/* Part of a match: "beg" and "end" int values */
typedef struct
{
  int  beg;
  int  end;
} UDM_MATCH_PART;
957 
958 
959 typedef struct
960 {
961   UDM_MATCH Match;
962   int quality;
963 } UDM_EXCERPT_FRAGMENT;
964 
965 
966 typedef struct
967 {
968   UDM_MATCH Match;
969   UDM_STR Replace;
970 } UDM_REPLACE;
971 
972 
973 typedef struct
974 {
975   size_t nitems;
976   size_t mitems;
977   UDM_REPLACE *Item;
978 } UDM_REPLACELIST;
979 
980 
981 typedef struct
982 {
983   UDM_REPLACE Replace;
984   UDM_STR SectionName;
985   UDM_STR Source;
986 } UDM_USERSECTION;
987 
988 
989 typedef struct
990 {
991   size_t nitems;
992   size_t mitems;
993   UDM_USERSECTION *Item;
994 } UDM_USERSECTIONLIST;
995 
996 
997 typedef struct
998 {
999   UDM_MATCH Match;
1000   udm_method_t method;
1001 } UDM_FILTER;
1002 
1003 
1004 typedef struct
1005 {
1006   size_t mitems;
1007   size_t nitems;
1008   UDM_FILTER *Item;
1009 } UDM_FILTERLIST;
1010 
1011 
1012 typedef struct
1013 {
1014   UDM_FILTER Filter;
1015   char *section;
1016 } UDM_SECTIONFILTER;
1017 
1018 
1019 typedef struct
1020 {
1021   size_t mitems;
1022   size_t nitems;
1023   UDM_SECTIONFILTER *Item;
1024 } UDM_SECTIONFILTERLIST;
1025 
1026 
1027 /*****************************************************************/
/** Parsed URL string: eight string parts and two port numbers */
typedef struct udm_url
{
  char  *schema;
  char  *specific;
  char  *hostinfo;
  char  *auth;
  char  *hostname;
  char  *path;
  char  *filename;
  char  *anchor;
  int    port;
  int    default_port;
} UDM_URL;


/** List of URLs */
typedef struct udm_url_list
{
  size_t    nitems;
  size_t    mitems;
  UDM_URL  *Item;
} UDM_URLLIST;
1051 
1052 
1053 /*****************************************************************/
1054 
1055 /** Structure to store server parameters */
1056 typedef struct
1057 {
1058   UDM_FILTER   Filter;
1059   urlid_t      site_id;        /**< server.rec_id            */
1060   char         command;        /**< 'S' - server,realm, 'F' - disallow,allow */
1061   int          ordre;          /**< order in list to find    */
1062   urlid_t      parent;         /**< parent rec_id for grouping by site */
1063   float        weight;         /**< server weight for popularity rank calculation */
1064   UDM_VARLIST  Vars;           /**< Default lang, charset,etc*/
1065   UDM_URLLIST  ProxyList;      /**< List of proxies */
1066   uint4        MaxHops;
1067   udm_webspace_t webspace;     /* Page, Path, Site, World, etc*/
1068   /*udm_method_t method;*/         /* Allow, Disallow, etc */
1069   udm_bool_t   enabled;
1070 } UDM_SERVER;
1071 
1072 
1073 typedef struct
1074 {
1075   size_t      nservers;
1076   size_t      mservers;
1077   int         have_subnets;
1078   UDM_SERVER  *Server;
1079 } UDM_SERVERLIST;
1080 
1081 
1082 /*******************************************************/
1083 
1084 typedef struct
1085 {
1086   size_t  max_doc_per_site;
1087   urlid_t referrer;
1088   uint4	hops;
1089   urlid_t server_id;
1090   urlid_t rec_id;
1091   udm_method_t method;
1092   udm_bool_t stored;
1093   udm_link_source_t link_source;
1094   udm_link_destination_t method_reason;
1095 } UDM_HREFPARAM;
1096 
1097 /* All links are stored in the cache of this structure */
1098 /* before actual INSERT into database                  */
1099 
1100 typedef struct
1101 {
1102   char *url;
1103   UDM_VARLIST HrefVars;
1104   UDM_HREFPARAM Param;
1105 } UDM_HREF;
1106 
1107 
1108 typedef enum
1109 {
1110   UDM_HREFLIST_FLAG_NONUNIQ= 1
1111 } udm_hreflist_flag_t;
1112 
1113 
1114 typedef struct
1115 {
1116   size_t    mhrefs;
1117   size_t    nhrefs;
1118   size_t    shrefs;
1119   UDM_HREF  *Href;
1120   int       flags;
1121 } UDM_HREFLIST;
1122 
1123 /*******************************************************/
1124 
/* IP with parameters: address, two time stamps and a hit counter */
typedef struct udm_sin_addr_st
{
  struct in_addr  addr;
  time_t          first_used;
  time_t          last_used;
  size_t          hits;
} UDM_INADDR;


/* List of IP addresses */
typedef struct
{
  size_t       nitems;
  size_t       mitems;
  UDM_INADDR  *Item;
} UDM_INADDRLIST;


/** Resolve stuff: host name with its address, error count and time stamps */
typedef struct udm_host_addr_struct
{
  char           *hostname;
  struct in_addr  addr;
  int             net_errors;
  time_t          last_used;
  time_t          expires;
} UDM_HOST_ADDR;


/** List of host addresses */
typedef struct
{
  size_t          nhost_addr;
  size_t          mhost_addr;
  UDM_HOST_ADDR  *host_addr;
} UDM_HOSTLIST;
1160 
1161 
/**
  Connection, used in FTP sessions.
  The "port" member is unsigned short when WIN32 is defined, int otherwise.
  "connp" points to another structure of the same type.
*/
typedef struct udm_conn_struct
{
  int                       status;
  int                       connected;
  int                       err;
  time_t                    host_last_used;
  int                       conn_fd;
#if defined(WIN32)
  unsigned short            port;
#else
  int                       port;
#endif
  int                       timeout;
  char                     *hostname;
  char                     *user;
  char                     *pass;
  struct sockaddr_in        sin;
  int                       buf_len;
  size_t                    buf_len_total;
  int                       len;
  char                     *buf;
  int                       net_errors;
  struct udm_conn_struct   *connp;
} UDM_CONN;
1187 
1188 /***************************************************/
1189 
/** Download buffer of a document: raw data plus a pointer to the content part */
1190 typedef struct
1191 {
1192   char   *buf;           /**< Buffer to download document to    */
1193   char   *content;       /**< Pointer to content, after headers */
1194   size_t  size;          /**< Number of bytes loaded            */
1195   size_t  alloced_size;   /**< Maximum bytes to load into buf    */
1196 } UDM_HTTPBUF;
1197 
1198 
/**
  A document being processed: download buffer, extracted links,
  headers, sections, text, parsed URL, charset, spider parameters
  and two connection states.
*/
1199 typedef struct
1200 {
1201   int stored;           /**< If it is already stored, for AddHref()  */
1202   udm_method_t method;  /**< How to download document: GET, HEAD etc */
1203 
1204   UDM_HTTPBUF     Buf;                  /**< Buffer       */
1205 
1206   UDM_HREFLIST    Hrefs;                /**< Link list    */
1207 
1208   UDM_VARLIST     RequestHeaders;       /**< Extra headers*/
1209   UDM_VARLIST     Sections;             /**< User sections*/
1210 
1211   UDM_TEXTLIST    TextList;             /**< Text list    */
1212   UDM_URL         CurURL;               /**< Parsed URL   */
1213   UDM_CHARSET     *lcs;                 /**< LocalCharset */
1214   UDM_SPIDERPARAM Spider;               /**< Spider prms  */
1215   UDM_CONN        connp;                /**< For FTP      */
1216   UDM_CONN        connp2;               /**< For FTP      */
1217 } UDM_DOCUMENT;
1218 
1219 /********************************************************/
1220 
1221 /**
  External Parsers: one parser definition.
  NOTE(review): judging by the names, from_mime/to_mime are the source
  and resulting MIME types and cmd is the command to run; the role of
  src is not visible in this header - confirm in the parser code.
*/
1222 typedef struct udm_parser_struct
1223 {
1224   char *from_mime;
1225   char *to_mime;
1226   char *cmd;
1227   char *src;
1228 } UDM_PARSER;
1229 
/* Array of UDM_PARSER items; nparsers is presumably the element count */
1230 typedef struct
1231 {
1232   size_t      nparsers;
1233   UDM_PARSER  *Parser;
1234 } UDM_PARSERLIST;
1235 
1236 
1237 /******* Ispell BEGIN ********/
1238 
/* NOTE(review): the users of UDM_SPELL_NOPREFIX are not visible in this header */
1239 #define UDM_SPELL_NOPREFIX 1
1240 
/* One dictionary entry: a word and its flags string */
1241 typedef struct udm_spell_st
1242 {
1243   char *word;
1244   char *flags;
1245 } UDM_SPELL;
1246 
/* Sizes of the fixed-length lang, cset and fname buffers in the spell and affix lists */
1247 #define UDM_SPELL_LANGLEN 32
1248 #define UDM_SPELL_CSETLEN 32
1249 #define UDM_SPELL_FILELEN 128
/* Dictionary formats; presumably the values of UDM_SPELLLIST.fmt - verify */
1250 #define UDM_SPELL_FMT_TEXT 0
1251 #define UDM_SPELL_FMT_HASH 1
1252 
/*
  One spell dictionary: language, character set name, file name,
  and an array of UDM_SPELL items.
  NOTE(review): fd/fbody/wordlen presumably describe the loaded file
  (descriptor, body in memory, word length) - confirm in spell.c.
*/
1253 typedef struct udm_dict_st
1254 {
1255   char  lang[UDM_SPELL_LANGLEN];
1256   char  cset[UDM_SPELL_CSETLEN];
1257   char  fname[UDM_SPELL_FILELEN];
1258   int   fmt;
1259   int   fd;
1260   size_t wordlen;
1261   UDM_CHARSET *cs;
1262   char   *fbody;
1263   size_t nitems;
1264   size_t mitems;
1265   UDM_SPELL *Item;
1266 } UDM_SPELLLIST;
1267 
1268 
/*
  Array of UDM_SPELLLIST dictionaries.
  NOTE(review): nitems/mitems presumably are the used/allocated counts.
*/
1269 typedef struct udm_spelllistlist_st
1270 {
1271   size_t nitems;
1272   size_t mitems;
1273   UDM_SPELLLIST *Item;
1274 } UDM_SPELLLISTLIST;
1275 
1276 
/* Opaque here: only pointers to UDM_AFFIX are used in this header */
1277 typedef struct udm_aff_st UDM_AFFIX; /* Defined in spell.c */
1278 
1279 
/*
  One affix file: language, character set name, file name,
  and an array of UDM_AFFIX items.
  NOTE(review): mitems/nitems presumably are the allocated/used counts.
*/
1280 typedef struct udm_afflist_st
1281 {
1282   size_t mitems;
1283   size_t nitems;
1284   char  lang[UDM_SPELL_LANGLEN];
1285   char  cset[UDM_SPELL_CSETLEN];
1286   char  fname[UDM_SPELL_FILELEN];
1287   UDM_CHARSET *cs;
1288   UDM_AFFIX *Item;
1289 } UDM_AFFIXLIST;
1290 
1291 
/*
  Array of UDM_AFFIXLIST items.
  NOTE(review): mitems/nitems presumably are the allocated/used counts.
*/
1292 typedef struct udm_afflistlist_st
1293 {
1294   size_t mitems;
1295   size_t nitems;
1296   UDM_AFFIXLIST *Item;
1297 } UDM_AFFIXLISTLIST;
1298 
1299 
1300 /******* Ispell END **********/
1301 
1302 
/* A single rule of a UDM_ROBOT: a method and the path it applies to */
1303 typedef struct
1304 {
1305   udm_method_t  method; /**< 'allow' or 'disallow' */
1306   char         *path;
1307 } UDM_ROBOT_RULE;
1308 
1309 
/* Rules for one host: a hostinfo string and an array of UDM_ROBOT_RULE */
1310 typedef struct
1311 {
1312   char           *hostinfo;
1313   size_t          nrules;
1314   UDM_ROBOT_RULE  *Rule;
1315 } UDM_ROBOT;
1316 
1317 
/* Array of UDM_ROBOT items; nrobots is presumably the element count */
1318 typedef struct
1319 {
1320   size_t     nrobots;
1321   UDM_ROBOT  *Robot;
1322 } UDM_ROBOTS;
1323 
1324 
1325 /********************************************************/
1326 
/* Parameters of a search word; embedded into UDM_WIDEWORD as its Param member */
1327 typedef struct
1328 {
1329   size_t           order; /*TODO34: change to udm_wordnum_t */
1330   size_t           order_extra_width; /* For multi-word synonyms, see below */
1331   size_t           count;
1332   size_t           doccount;    /* Number of documents this word appears in */
1333   udm_wordorigin_t origin;      /* query, spell, synonym, etc */
1334   int              weight;      /* origin-dependent weight   */
1335   int              user_weight; /* User-supplied weight */
1336   udm_match_mode_t match_mode;  /* BEGIN,END,SUBSTR,NUM_LT,NUM_GT,FULL*/
1337   size_t           secno;       /* Which section to search in; TODO34: udm_secno_t */
1338   size_t           phrpos;      /* 0 means "not in phrase"    */
1339   size_t           phrlen;      /* phrase length              */
1340   size_t           phrwidth;    /* How many additional parts in a multi-word */
1341 } UDM_WIDEWORD_PARAM;
1342 
1343 
/* A search word: its text (UDM_STR) together with its parameters */
1344 typedef struct
1345 {
1346   UDM_STR Word;
1347   UDM_WIDEWORD_PARAM Param;
1348 } UDM_WIDEWORD;
1349 
1350 
1351 /*
1352   order_extra_width - used in case of many-to-one and many-to-many synonyms.
1353   It represents the number of query words this synonym covers.
1354   For example, if we have the synonym:
1355   "aaaa bbbb" -> "cccc"
1356   then order_extra_width for the word "cccc" will be 2, because
1357   it is a replacement for two words.
1358   0 means "not a multi-word replacement".
1359 */
1360 
/*
  Array of UDM_WIDEWORD items plus list-wide matching options.
  NOTE(review): nwords is presumably the element count of Word and
  nuniq the number of distinct words - confirm in the implementation.
*/
1361 typedef struct
1362 {
1363   udm_match_mode_t  match_mode;       /* Search mode: wrd, sub, beg, end */
1364   udm_bool_t        strip_noaccents;  /* If accent insensitive comparison*/ /* TODO34: change to a flag mask? */
1365   size_t            nuniq;
1366   size_t            nwords;
1367   UDM_WIDEWORD	    *Word;
1368 } UDM_WIDEWORDLIST;
1369 
1370 
1371 /*****************************************************************/
1372 
/*
  One synonym pair.
  NOTE(review): p and s are the two strings of the pair; which one is
  the source and which the replacement is not visible here - confirm.
*/
1373 typedef struct
1374 {
1375   char *p;
1376   char *s;
1377   udm_wordorigin_t origin; /* SYNONYM or SYNONYM_FINAL */
1378 } UDM_SYNONYM;
1379 
1380 
/* Sizes of the fixed-length lang, cset and fname buffers of UDM_SYNONYMLIST */
1381 #define UDM_SYNONYM_LANGLEN 32
1382 #define UDM_SYNONYM_CSETLEN 32
1383 #define UDM_SYNONYM_FILELEN 128
/* NOTE(review): format codes; no field of UDM_SYNONYMLIST stores them in this header */
1384 #define UDM_SYNONYM_FMT_TEXT 0
1385 #define UDM_SYNONYM_FMT_HASH 1
1386 
/*
  One synonym file: an array of UDM_SYNONYM items with the language,
  character set name and file name it belongs to.
  NOTE(review): nsynonyms/msynonyms presumably are the used/allocated counts.
*/
1387 typedef struct
1388 {
1389   size_t      nsynonyms;
1390   size_t      msynonyms;
1391   UDM_SYNONYM *Synonym;
1392   char        lang[UDM_SYNONYM_LANGLEN];
1393   char        cset[UDM_SYNONYM_CSETLEN];
1394   char        fname[UDM_SYNONYM_FILELEN];
1395   size_t      max_phrase_length; /* Many-to-many, many-to-one, one-to-many */
1396 } UDM_SYNONYMLIST;
1397 
1398 
/* Array of UDM_SYNONYMLIST items; nitems is presumably the element count */
1399 typedef struct
1400 {
1401   size_t nitems;
1402   UDM_SYNONYMLIST *Item;
1403 } UDM_SYNONYMLISTLIST;
1404 
1405 
/*
  One entry of a UDM_CHINALIST word list.
  NOTE(review): word is an int array (presumably code points) and
  freq presumably the word frequency - confirm in the segmenter code.
*/
1406 typedef struct udm_chinaword_struct
1407 {
1408   int *word;
1409   int  freq;
1410 } UDM_CHINAWORD;
1411 
1412 
/*
  Array of UDM_CHINAWORD items with an auxiliary hash array.
  NOTE(review): nwords/mwords presumably are the used/allocated counts;
  the meaning of total and the layout of hash are not visible here.
*/
1413 typedef struct
1414 {
1415   size_t        nwords;
1416   size_t        mwords;
1417   size_t        total;
1418   UDM_CHINAWORD *ChiWord;
1419   size_t        *hash;
1420 } UDM_CHINALIST;
1421 
1422 
1423 /*************************************************************/
1424 
1425 
1426 /*** Boolean search constants and types ****/
/*
  Command codes of the boolean search expression machinery
  (used by UDM_BOOLSTACK.cstack and UDM_BOOLEXPR_ITEM.cmd).
  Values 0..6 are assigned consecutively; WORD and STOP use 200 and 201.
  NOTE(review): LEFT/RIGHT presumably stand for parentheses and BOT for
  the stack bottom - confirm in the boolean parser.
*/
1427 typedef enum
1428 {
1429   UDM_BOOLCMD_LEFT=    0,
1430   UDM_BOOLCMD_RIGHT=   1,
1431   UDM_BOOLCMD_BOT=     2,
1432   UDM_BOOLCMD_OR=      3,
1433   UDM_BOOLCMD_AND=     4,
1434   UDM_BOOLCMD_NOT=     5,
1435   UDM_BOOLCMD_PHRASE=  6,
1436   UDM_BOOLCMD_WORD=    200,
1437   UDM_BOOLCMD_STOP=    201
1438 } udm_boolcmd_t;
1439 
1440 
/*
  Two stacks kept in one structure: cstack holds udm_boolcmd_t commands,
  astack holds unsigned long values.
  NOTE(review): the n... and m... fields presumably are the used and
  allocated sizes of the corresponding stack - confirm.
*/
1441 typedef struct
1442 {
1443   size_t  ncstack;
1444   size_t  mcstack;
1445   udm_boolcmd_t *cstack;
1446   size_t  nastack;
1447   size_t  mastack;
1448   unsigned long	*astack;
1449 } UDM_BOOLSTACK; /* TODO34: split into two lists */
1450 
1451 
/* One item of a boolean expression: a command code and its argument */
1452 typedef struct
1453 {
1454   udm_boolcmd_t cmd;
1455   unsigned long arg;
1456 } UDM_BOOLEXPR_ITEM;
1457 
1458 
/*
  Array of UDM_BOOLEXPR_ITEM.
  NOTE(review): nitems/mitems presumably are the used/allocated counts;
  what exactly ncmds counts is not visible in this header.
*/
1459 typedef struct
1460 {
1461   size_t nitems;
1462   size_t mitems;
1463   size_t ncmds;
1464   UDM_BOOLEXPR_ITEM *items;
1465 } UDM_BOOLEXPR;
1466 
1467 
1468 /*****************************/
/*
  Counters describing a search result; embedded into UDM_QUERY as stats.
  NOTE(review): first/last presumably are the positions of the first and
  last returned rows and total_found the overall match count - confirm.
*/
1469 typedef struct
1470 {
1471   size_t first;
1472   size_t last;
1473   size_t total_found;
1474 } UDM_QUERY_STATS;
1475 
1476 
/*
  A set of documents with a variable list and a word list.
  NOTE(review): num_rows is presumably the element count of Doc and
  cur_row a cursor into it - confirm.
*/
1477 typedef struct
1478 {
1479   size_t            num_rows;
1480   size_t            cur_row;
1481   UDM_DOCUMENT      *Doc;
1482   UDM_VARLIST       Vars;
1483   UDM_WIDEWORDLIST  WWList;
1484 } UDM_RESULT;
1485 
1486 
/*
  State of one search query: statistics, the result set, URL data,
  the boolean expression, a stat list and a search section list.
  NOTE(review): where/from look like SQL condition fragments - confirm.
*/
1487 typedef struct
1488 {
1489   UDM_QUERY_STATS       stats;
1490   UDM_RESULT            Res;
1491   UDM_URLDATALIST       URLData;
1492   UDM_BOOLEXPR          BoolExpr;
1493   UDM_STATLIST          StatList;
1494   UDM_SEARCHSECTIONLIST SectionList;
1495   size_t num_best_rows;
1496   char *where;
1497   char *from;
1498 } UDM_QUERY;
1499 
1500 
/* Forward declaration: only a pointer to the handler is needed here */
1501 struct udm_dbhandler_st;
1502 
/* A database: a pointer to its (constant) handler plus handler-specific data */
1503 typedef struct udm_db_st
1504 {
1505   const struct udm_dbhandler_st *dbhandler;
1506   void *specific;   /* Opaque pointer; its type is not visible in this header */
1507 } UDM_DB;
1508 
1509 
/* Array of UDM_DB items; nitems is presumably the element count */
1510 typedef struct
1511 {
1512   size_t  nitems;
1513   UDM_DB  *Item;
1514 } UDM_DBLIST;
1515 
1516 
/* Log flag bit; presumably used in UDM_LOG.flags - verify */
1517 #define UDM_LOG_FLAG_SKIP_PID 0x00000001
1518 
/* Logging state */
1519 typedef struct
1520 {
1521   int is_log_open; /* Flag indicating if openlog() has been called      */
1522   FILE *logFD;     /* FILE stream, when logging to stderr or file       */
1523   int facility;    /* Which facility to use, or negative number if none */
1524   int flags;
1525   int level;
1526 } UDM_LOG;
1527 
1528 
1529 /****** SQLMon **************/
1530 
/* Message types passed to the prompt callback of UDM_IOHANDLER */
1531 typedef enum
1532 {
1533   UDM_MSG_DATA= 0,     /* Column values                       */
1534   UDM_MSG_ERROR= 1,    /* Error text                          */
1535   UDM_MSG_COMMAND= 2,  /* The command being executed          */
1536   UDM_MSG_EOL= 3,      /* End-of-line (e.g. record has ended) */
1537   UDM_MSG_PROMPT= 4,   /* ">SQL"                              */
1538   UDM_MSG_INFO= 5      /* e.g. "Connection changed to #1"     */
1539 } udm_msg_t;
1540 
1541 
1542 /*
  Input/Output handler, e.g. for "indexer --sqlmon".
  Both callbacks receive the handler itself as the first argument:
    gets   - takes a buffer and its size, returns char *
    prompt - takes a message type and a message text, returns udm_rc_t
  user_data is an opaque pointer for the callback implementation.
*/
1543 typedef struct udm_iohandler_st
1544 {
1545   void *user_data;
1546   char *(*gets)(struct udm_iohandler_st *prm, char *str, size_t size);
1547   udm_rc_t (*prompt)(struct udm_iohandler_st *prm, udm_msg_t msgtype, const char *msg);
1548 } UDM_IOHANDLER;
1549 
1550 
1551 /** Forward declaration of UDM_AGENT */
1552 struct udm_indexer_struct_st;
1553 
/*
  Threading callbacks.
    Lock         - mutex operation: receives the agent, a command, a mutex
                   number and the caller's file name and line number.
                   The UDM_GETLOCK family of macros calls it only when
                   it is not NULL.
    ThreadCreate - receives a thread handle, a start routine and its argument
    ThreadJoin   - receives a thread handle
*/
1554 typedef struct
1555 {
1556   void (*Lock)(struct udm_indexer_struct_st *,
1557                udm_mutexcmd_t command, udm_mutexno_t number,
1558                const char *fname, int lineno);
1559   udm_rc_t (*ThreadCreate)(void *thd, void *(*start_routine)(void *), void *arg);
1560   udm_rc_t (*ThreadJoin)(void *thd);
1561 } UDM_THDHANDLER;
1562 
/*
  Mutex helper macros.

  Each macro invokes the optional Lock callback of the agent's
  environment, (A)->Conf->THDHandler.Lock, with the corresponding
  command (UDM_LOCK, UDM_UNLOCK or UDM_CKLOCK), the mutex number and
  the caller's __FILE__/__LINE__. Nothing happens when the callback
  is NULL.

  A      - pointer to the agent; evaluated more than once, so it must
           not have side effects.
  mutex  - mutex number passed through to the callback.

  The bodies are wrapped in do { } while (0), so that every macro is
  a single statement. A bare "if" would capture a following "else":
    if (cond) UDM_GETLOCK(A, m); else other();
*/
#define UDM_GETLOCK(A,mutex) \
  do \
  { \
    if ((A)->Conf->THDHandler.Lock) \
      (A)->Conf->THDHandler.Lock((A), UDM_LOCK, (mutex), __FILE__, __LINE__); \
  } while (0)

#define UDM_RELEASELOCK(A,mutex) \
  do \
  { \
    if ((A)->Conf->THDHandler.Lock) \
      (A)->Conf->THDHandler.Lock((A), UDM_UNLOCK, (mutex), __FILE__, __LINE__); \
  } while (0)

#define UDM_LOCK_CHECK_OWNER(A,mutex) \
  do \
  { \
    if ((A)->Conf->THDHandler.Lock) \
      (A)->Conf->THDHandler.Lock((A), UDM_CKLOCK, (mutex), __FILE__, __LINE__); \
  } while (0)
1566 
1567 
/*
  Section parameters: a table of 256 size_t length limits.
  NOTE(review): presumably indexed by section number - confirm.
*/
1568 typedef struct
1569 {
1570   size_t maxlen[256];
1571 } UDM_SECTIONPARAM;
1572 
1573 
1574 /**
  Config file: the environment shared by agents (UDM_AGENT.Conf points
  to it). Holds the configuration lists, caches, logging state and the
  callback table.
*/
1575 typedef struct udm_config_struct
1576 {
1577   char                errstr[UDM_ERRSTR_SIZE];
1578   UDM_CHARSET         *bcs;
1579   UDM_CHARSET         *lcs;
1580   UDM_UNIDATA         *unidata;
1581 
1582   int                 url_number;      /**< For indexer -nXXX          */
1583 
1584   UDM_SERVERLIST      Servers;         /**< List of servers and realms */
1585   UDM_SERVER          *Cfg_Srv;
1586 
1587   UDM_REPLACELIST     Aliases;         /**< Straight aliases           */
1588   UDM_REPLACELIST     ReverseAliases;  /**< Reverse aliases            */
1589   UDM_REPLACELIST     MimeTypes;       /**< For AddType commands       */
1590   UDM_REPLACELIST     Encodings;       /**< For AddType commands       */
1591 
1592   UDM_FILTERLIST      Filters;         /**< Allow, Disallow,etc        */
1593   UDM_SECTIONFILTERLIST SectionFilters;  /**< IndexIf, NoIndexIf         */
1594   UDM_USERSECTIONLIST SectionHdrMatch; /**< User sections after headers*/
1595   UDM_USERSECTIONLIST SectionGsrMatch; /**< User sections after guesser*/
1596   UDM_USERSECTIONLIST SectionMatch;    /**< User sections after parser */
1597 
1598   UDM_HREFLIST        Hrefs;           /**< Links cache                */
1599   UDM_RESULT          Targets;         /**< Targets cache              */
1600 
1601   UDM_SECTIONPARAM    SectionParam;    /**< section parameters         */
1602   UDM_VARLIST         Sections;        /**< document section to parse  */
1603   UDM_VARLIST         Vars;            /**< Config parameters          */
1604   UDM_VARLIST         Cookies;         /**< Cookie list                */
1605   UDM_VARLIST         XMLEnterHooks;
1606   UDM_VARLIST         XMLLeaveHooks;
1607   UDM_VARLIST         XMLDataHooks;
1608 
1609   UDM_LANGMAPLIST     LangMaps;        /**< For lang+charset guesser   */
1610   UDM_ROBOTS          Robots;          /**< robots.txt information     */
1611   UDM_SYNONYMLISTLIST Synonym;         /**< Synonyms list              */
1612   UDM_STOPLISTLIST    StopWord;        /**< Stopwords list             */
1613   UDM_PARSERLIST      Parsers;         /**< External  parsers          */
1614   UDM_DBLIST          DBList;          /**< Databases                  */
1615   UDM_HOSTLIST        Hosts;           /**< Resolve cache              */
1616   UDM_INADDRLIST      InAddr;          /**< IP cache                   */
1617   UDM_SPELLLISTLIST   Spells;          /**< For ispell dictionaries    */
1618   UDM_AFFIXLISTLIST   Affixes;         /**< For ispell affixes         */
1619   UDM_WORDPARAM       WordParam;       /**< Word limits                */
1620   UDM_CHINALIST       Chi;             /**< Chinese words list         */
1621   UDM_CHINALIST       Thai;            /**< Thai words list            */
1622 
1623   UDM_LOG Log;
1624 
1625   int    CVS_ignore;                   /**< Skip CVS directories, for tests */
1626 
1627   /* Various virtual functions */
1628   UDM_THDHANDLER THDHandler;           /**< Lock is used by UDM_GETLOCK() and friends */
1629   void (*ThreadInfo)(struct udm_indexer_struct_st *, const char * state, const char * str);
1630   void (*RefInfo)(int code, const char *url, const char *ref);
1631   int  (*DumpDoc)(struct udm_indexer_struct_st *, UDM_DOCUMENT *Doc);
1632 #ifdef MECAB
1633    mecab_t         *mecab;             /* Present only when compiled with MECAB */
1634 #endif
1635 
1636 } UDM_ENV;
1637 
1638 
/* NOTE(review): the users of UDM_AGENT_STATE_MUTEXES are not visible in this header */
1639 #define UDM_AGENT_STATE_MUTEXES 5
1640 
/*
  Current state of an agent; embedded into UDM_AGENT as State.
  The three strings are const pointers and are not owned by this
  structure as far as this header shows.
*/
1641 typedef struct udm_agent_state_t
1642 {
1643   const char *task;
1644   const char *param;
1645   const char *extra;
1646   time_t start_time;
1647 } UDM_AGENT_STATE;
1648 
1649 
1650 /**
  Indexer: per-agent state. Carries the statistics counters, a pointer
  to the shared UDM_ENV (Conf) and the cache of indexed documents.
  The UDM_GETLOCK() family of macros expects a pointer to this
  structure as its first argument (it dereferences Conf).
*/
1651 typedef struct udm_indexer_struct_st
1652 {
1653   udm_threadid_t  handle;      /**< Handler for threaded version */
1654   time_t          start_time;  /**< Time of allocation, for stat */
1655   udm_uint8       nbytes;      /**< Number of bytes downloaded   */
1656   size_t          ndocs;       /**< Number of documents          */
1657   size_t          nsleepsecs;  /**< Number of sleep seconds      */
1658   int             flags;       /**< Callback function to request action (NOTE(review): text does not match an int field - confirm) */
1659   udm_rc_t        action;
1660   int             doccount;    /**< for UdmGetDocCount()         */
1661   UDM_ENV         *Conf;       /**< Configuration                */
1662   UDM_LANGMAP     *LangMap;    /**< LangMap for current document */
1663   UDM_RESULT      Indexed;     /**< Indexed cache                */
1664   size_t          Indexed_memused; /**< Memory used by Indexed cache */
1665 
1666   UDM_AGENT_STATE State;
1667 
1668 #ifdef USE_TRACE
1669   FILE *TR;                    /* Present only when compiled with USE_TRACE */
1670 #endif
1671 
1672 } UDM_AGENT;
1673 
1674 
/* An agent (embedded by value) together with a pointer to a query */
1675 typedef struct
1676 {
1677   UDM_AGENT Agent;
1678   UDM_QUERY *Query;
1679 } UDM_CRAWLER;
1680 
1681 
/*
  Context referencing an agent and a server record.
  NOTE(review): judging by the name this is the state of the
  configuration file parser; the meaning of flags, level and ordre
  is not visible in this header - confirm in the parser code.
*/
1682 typedef struct udm_cfg_st
1683 {
1684   UDM_AGENT   *Indexer;
1685   UDM_SERVER  *Srv;
1686   int         flags;
1687   int         level;
1688   int         ordre;
1689   size_t      excerpt_fragments_count;
1690 } UDM_CFG;
1691 
1692 
/*
  One item of a UDM_TEMPLATE: a dynamic string value with a name.
  NOTE(review): lineno is presumably the line of the item in the
  template source - confirm.
*/
1693 typedef struct
1694 {
1695   UDM_DSTR Value;
1696   char *name;
1697   size_t lineno;
1698 } UDM_TEMPLATE_ITEM;
1699 
1700 
/*
  Array of UDM_TEMPLATE_ITEM plus the return code of the template program.
  NOTE(review): nitems/mitems presumably are the used/allocated counts.
*/
1701 typedef struct
1702 {
1703   size_t nitems;
1704   size_t mitems;
1705   UDM_TEMPLATE_ITEM *Item;
1706   int rc;      /* The code returned from the template program, e.g. exit() */
1707 } UDM_TEMPLATE;
1708 
1709 
1710 #ifdef DMALLOC
1711 #include <dmalloc.h>
1712 #endif
1713 
/*
  UDM_DT_* codes.
  NOTE(review): presumably date/time limit types; their users are not
  visible in this header - confirm before relying on the meaning.
*/
1714 #define UDM_DT_BACK    1
1715 #define UDM_DT_ER      2
1716 #define UDM_DT_RANGE   3
1717 #define UDM_DT_UNKNOWN 4
1718 
/*
  Kinds of command line options. Values are implicit (0..3).
  NOTE(review): presumably stored in UDM_CMDLINE_OPT.type, which is
  declared as int - confirm.
*/
1719 typedef enum
1720 {
1721   UDM_OPT_BOOL,
1722   UDM_OPT_INT,
1723   UDM_OPT_STR,
1724   UDM_OPT_TITLE
1725 } udm_opttype_t;
1726 
1727 
/*
  Description of one command line option: identifier, name, type,
  a pointer to the storage for its value and a comment string.
  NOTE(review): value is an untyped pointer; the pointed-to type
  presumably depends on the option type - confirm in the option parser.
*/
1728 typedef struct udm_cmdline_opt_st
1729 {
1730   int id;
1731   const char *name;
1732   int type;
1733   void *value;
1734   const char *comment;
1735 } UDM_CMDLINE_OPT;
1736 
1737 
/*
  Indexer command codes.
  Some commands use a character constant as their value ('S', 'C',
  'I', 'Q', 'R'), the others use numbers starting from 300; the
  numbers 302 and 305..307 are not used.
  NOTE(review): the character values presumably match single-letter
  command line options - confirm in the indexer option table.
*/
1738 typedef enum
1739 {
1740   UDM_IND_AMBIGUOUS=     0,
1741   UDM_IND_UNKNOWN=       1,
1742   UDM_IND_INDEX=         300,
1743   UDM_IND_CRAWL=         301,
1744   UDM_IND_STAT=          'S',
1745   UDM_IND_CREATE=        303,
1746   UDM_IND_DROP=          304,
1747   UDM_IND_DELETE=        'C',
1748   UDM_IND_REFERERS=      'I',
1749   UDM_IND_SQLMON=        'Q',
1750   UDM_IND_CHECKCONF=     308,
1751   UDM_IND_CONVERT=       309,
1752   UDM_IND_MULTI2BLOB=    310,
1753   UDM_IND_EXPORT=        311,
1754   UDM_IND_WRDSTAT=       312,
1755   UDM_IND_REWRITEURL=    313,
1756   UDM_IND_HASHSPELL=     314,
1757   UDM_IND_DUMPSPELL=     315,
1758   UDM_IND_REWRITELIMITS= 316,
1759   UDM_IND_DUMPCONF=      317,
1760   UDM_IND_DUMPDATA=      318,
1761   UDM_IND_RESTOREDATA=   319,
1762   UDM_IND_EXECSQL=       320,
1763   UDM_IND_SET=           321, /* indexer --set=a=b -> a=b    */
1764   UDM_IND_SET0=          322, /* indexer --fl=xxx  -> fl=xxx */
1765   UDM_IND_REWRITEPOP=    'R'
1766 } udm_indcmd_t;
1767 
1768 #endif
1769