1 /*************************************************************************************************
2  * Interface of common features
3  *                                                      Copyright (C) 2003-2006 Mikio Hirabayashi
4  * This file is part of Estraier, a personal full-text search system.
5  * Estraier is free software; you can redistribute it and/or modify it under the terms of the GNU
6  * General Public License as published by the Free Software Foundation; either version 2 of the
7  * License, or any later version.
8  * Estraier is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY;
9  * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
10  * See the GNU General Public License for more details.
11  * You should have received a copy of the GNU General Public License along with Estraier;
12  * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
13  * MA 02111-1307 USA.
14  *************************************************************************************************/
15 
16 
17 #ifndef _ESTCOMMON_H                     /* duplication check */
18 #define _ESTCOMMON_H
19 
20 
21 
22 /*************************************************************************************************
23  * headers
24  *************************************************************************************************/
25 
26 
27 #include <depot.h>
28 #include <curia.h>
29 #include <cabin.h>
30 #include <villa.h>
31 #include <odeum.h>
32 
33 #include <stdlib.h>
34 #include <stdio.h>
35 #include <string.h>
36 #include <assert.h>
37 #include <stdarg.h>
38 #include <limits.h>
39 #include <time.h>
40 #include <signal.h>
41 
42 #ifdef MYHAVE_UNISTD_H
43 #include <unistd.h>
44 #endif
45 #ifdef MYHAVE_ERRNO_H
46 #include <errno.h>
47 #endif
48 #ifdef MYHAVE_SYS_TYPES_H
49 #include <sys/types.h>
50 #endif
51 #ifdef MYHAVE_SYS_TIME_H
52 #include <sys/time.h>
53 #endif
54 #ifdef MYHAVE_SYS_WAIT_H
55 #include <sys/wait.h>
56 #endif
57 #ifdef MYHAVE_SYS_SELECT_H
58 #include <sys/select.h>
59 #endif
60 #ifdef MYHAVE_SYS_SOCKET_H
61 #include <sys/socket.h>
62 #endif
63 #ifdef MYHAVE_NETINET_IN_H
64 #include <netinet/in.h>
65 #endif
66 #ifdef MYHAVE_ARPA_INET_H
67 #include <arpa/inet.h>
68 #endif
69 #ifdef MYHAVE_NETDB_H
70 #include <netdb.h>
71 #endif
72 #ifdef MYHAVE_FCNTL_H
73 #include <fcntl.h>
74 #endif
75 
76 #if defined(__APPLE__) && defined(__MACH__)  /* for Mac OS X */
77 #define socklen_t   int                      /* every later `socklen_t' token expands to `int' */
78 #endif                                       /* NOTE(review): presumably for SDKs lacking socklen_t -- confirm */
79 
80 
81 
82 
83 /*************************************************************************************************
84  * macros
85  *************************************************************************************************/
86 
87 
88 #ifndef _EST_VERSION                     /* fallback version, used only if not already defined */
89 #define _EST_VERSION   "0.0.0"
90 #endif
91 #ifndef _EST_PREFIX                      /* fallback prefix, used only if not already defined */
92 #define _EST_PREFIX    "/usr/local"
93 #endif
94 #ifndef _EST_BINDIR                      /* fallback bindir, used only if not already defined */
95 #define _EST_BINDIR    "/usr/local/bin"
96 #endif
97 #ifndef _EST_LEXEDIR                     /* fallback libexecdir, used only if not already defined */
98 #define _EST_LEXEDIR   "/usr/local/libexec"
99 #endif
100 #ifndef _EST_DATADIR                     /* fallback datadir, used only if not already defined */
101 #define _EST_DATADIR   "/usr/local/share/estraier"
102 #endif
103 
104 #undef TRUE                              /* discard any earlier definition of TRUE */
105 #define TRUE           1                 /* boolean true */
106 #undef FALSE                             /* discard any earlier definition of FALSE */
107 #define FALSE          0                 /* boolean false */
108 
109 #define ESTPATHCHR     '/'               /* delimiter character of a path */
110 #define ESTEXTCHR      '.'               /* delimiter character of a file extension */
111 #define ESTCDIRSTR     "."               /* string of the current directory */
112 #define ESTPDIRSTR     ".."              /* string of the parent directory */
113 
114 #define ESTLOCALE      "C"               /* name of the common locale */
115 #define ESTDBGFDENV    "ESTDBGFD"        /* environment variable for the debug fd */
116 #define ESTMTDBNAME    "_mtime"          /* name of the database for last modified times */
117 #define ESTMTDBLRM     81                /* records in a leaf node of the time database */
118 #define ESTMTDBNIM     192               /* records in a non-leaf node of the time database */
119 #define ESTMTDBLCN     64                /* number of cached leaf nodes of the time database */
120 #define ESTMTDBNCN     32                /* number of cached non-leaf nodes of the time database */
121 #define ESTSCDBNAME    "_score"          /* name of the database for scores */
122 #define ESTSCDBBRAT    4                 /* ratio of bnum and dnum of the score database */
123 #define ESTSCDBDIVNUM  7                 /* division number of the score database */
124 #define ESTDTDBNAME    "_date"           /* name of the database for dates */
125 #define ESTDTDBBRAT    4                 /* ratio of bnum and dnum of the date database */
126 #define ESTWDLSNAME    "_wlist"          /* name of the text file for the word list */
127 #define ESTFILTERFUNC  "estfilter"       /* name of dynamic linking functions for filter */
128 #define ESTPATHBUFSIZ  2048              /* size of a path buffer */
129 #define ESTNUMBUFSIZ   32                /* size of a number buffer */
130 #define ESTDATEBUFSIZ  1024              /* size of a date buffer */
131 #define ESTPETITBNUM   31                /* bucket number of a petit map */
132 #define ESTCJKPMIN     0x20              /* minimum plane of UCS-2 handled as CJK text */
133 #define ESTENCMISSMAX  16                /* max number of misses of encoding characters */
134 #define ESTMIMEFOLD    60                /* folding bytes of a MIME message */
135 #define ESTMAXLOAD     0.9               /* max ratio of bucket loading */
136 #define ESTKEYNUM      32                /* number of keywords to store */
137 #define ESTWMINLEN     2                 /* min length of a word */
138 #define ESTWMAXLEN     32                /* max length of a word */
139 
140 #if defined(MYSTRICT)                    /* split text strictly by space */
141 #define ESTISSTRICT    TRUE              /* strict splitting is enabled */
142 #define ESTDELIMCHARS  "+,-.:;!\"#$%&'()*/<=>?@[\\]^`{|}~"
143 #else                                    /* MYSTRICT is not defined */
144 #define ESTISSTRICT    FALSE             /* strict splitting is disabled */
145 #define ESTDELIMCHARS  ""
146 #endif
147 
148 #if defined(MYNOSTOPW)                   /* do not exclude stop words */
149 #define ESTISNOSTOPW   TRUE              /* stop words are kept */
150 #else                                    /* MYNOSTOPW is not defined */
151 #define ESTISNOSTOPW   FALSE             /* stop words are excluded */
152 #endif
153 
154 
155 
156 /*************************************************************************************************
157  * types and constants
158  *************************************************************************************************/
159 
160 
161 enum {                                   /* mode of `estdocaddtext' */
162   ESTDOCBOTH,                            /* register both normal and appearance words */
163   ESTDOCNONLY,                           /* register normal words only */
164   ESTDOCAONLY                            /* register appearance words only */
165 };
166 
167 /* type of the pointer to a filter function, which takes two strings, `infile' and `outfile'. */
168 typedef void (*ESTFILTER)(const char *infile, const char *outfile);
169 
170 typedef struct {                         /* type of structure for a search word */
171   int type;                              /* condition type (presumably an ESTCOND* constant) */
172   char *word;                            /* string of the word */
173   int dnum;                              /* number of corresponding documents */
174   CBLIST *evwords;                       /* words evolved from the regular expression */
175 } ESTWORD;
176 
177 enum {                                   /* enumeration of condition types of a search word */
178   ESTCONDAND,                            /* and search */
179   ESTCONDOR,                             /* or search */
180   ESTCONDNOT                             /* notand search */
181 };
182 
183 
184 
185 /*************************************************************************************************
186  * global variables
187  *************************************************************************************************/
188 
189 
190 extern int estisregex;                   /* whether regular expressions are supported */
191 extern int estisdlfunc;                  /* whether dlfunc is supported */
192 extern int estiscjkuni;                  /* whether cjkuni is supported */
193 extern int estischasen;                  /* whether chasen is supported */
194 extern int estismecab;                   /* whether mecab is supported */
195 extern int estiskakasi;                  /* whether kakasi is supported */
196 
197 
198 
199 /*************************************************************************************************
200  * functions
201  *************************************************************************************************/
202 
203 
204 /* Make a document handle from plain text.
205    `uri' specifies the URI of the document.
206    `text' specifies the data of the document.
207    `size' specifies the size of the text.
208    `code' specifies the character encoding of the text.  If it is `NULL', the encoding is
209    detected automatically.
210    The return value is a document handle for Odeum. */
211 ODDOC *estdocplain(const char *uri, const char *text, int size, const char *code);
212 
213 
214 /* Make a document handle from an HTML document.
215    `uri' specifies the URI of the document.
216    `text' specifies the data of the document.
217    `size' specifies the size of the text.
218    `code' specifies the character encoding of the text.  If it is `NULL', the encoding is
219    detected automatically.
220    The return value is a document handle for Odeum. */
221 ODDOC *estdochtml(const char *uri, const char *text, int size, const char *code);
222 
223 
224 /* Make a document handle from a MIME message.
225    `uri' specifies the URI of the document.
226    `text' specifies the string of the document.
227    `code' specifies the character encoding of the text.  If it is `NULL', the encoding is
228    detected automatically.
229    `nude' specifies whether attributes of the inner content are given priority.
230    The return value is a document handle for Odeum. */
231 ODDOC *estdocmime(const char *uri, const char *text, const char *code, int nude);
232 
233 
234 /* Break a text into words and register them to a document handle.
235    `doc' specifies a document handle.
236    `text' specifies a text.
237    `size' specifies the size of the text.
238    `code' specifies the character encoding of the text.  If it is `NULL', the encoding is
239    detected automatically.
240    `mode' specifies detailed behavior.  `ESTDOCBOTH' registers both normal and appearance
241    words.  `ESTDOCNONLY' registers normal words only.  `ESTDOCAONLY' registers appearance
242    words only. */
243 void estdocaddtext(ODDOC *doc, const char *text, int size, const char *code, int mode);
244 
245 
246 /* Get the pointer to the filter function in a dynamic linking library.
247    `file' specifies the name of a dynamic linking library.
248    The return value is the pointer to the function, or `NULL' on failure. */
249 ESTFILTER estfilterget(const char *file);
250 
251 
252 /* Make search words and their conditions from a search phrase.
253    `phrase' specifies a search phrase whose encoding is UTF-8.
254    `np' specifies the pointer to a variable to which the number of elements of the return value
255    is assigned.
256    `norm' specifies whether to normalize each word.
257    The return value is an array whose elements are structures composed of search words and
258    their conditions.  The returned array should be released with `estfreewords'. */
259 ESTWORD *estsearchwords(const char *phrase, int *np, int norm);
260 
261 
262 /* Release the regions of an array of search words.
263    `words' specifies an array made with `estsearchwords'.
264    `num' specifies the number of elements of the array. */
265 void estfreewords(ESTWORD *words, int num);
266 
267 
268 /* Get the search result for search words made with `estsearchwords'.
269    `odeum' specifies a database handle.
270    `words' specifies an array made with `estsearchwords'.
271    `wnum' specifies the number of elements of the array.
272    `unit' specifies the search unit, which determines accuracy.
273    `tfidf' specifies whether scores are tuned by the TF-IDF method.
274    `np' specifies the pointer to a variable to which the number of elements of the return value
275    is assigned.
276    `lp' specifies the pointer to a variable to which the number of documents left in the index
277    is assigned.
278    `regex' specifies whether search words are treated as regular expressions.
279    `wild' specifies whether search words are treated as expressions with wild cards.
280    `reevmax' specifies the max number of words evolved from regular expressions or expressions
281    with wild cards.
282    The return value is an array as with `odsearch'. */
283 ODPAIR *estsearch(ODEUM *odeum, ESTWORD *words, int wnum, int unit, int tfidf, int *np, int *lp,
284                   int regex, int wild, int reevmax);
285 
286 
287 /* Initialize the iterator of a database.
288    `odeum' specifies a database handle.
289    `prefix' specifies a prefix of URIs.
290    The return value indicates whether the processing succeeded. */
291 int estiterinit(ODEUM *odeum, const char *prefix);
292 
293 
294 /* Get the URI of the next document whose URI begins with a prefix.
295    `odeum' specifies a database handle.
296    `prefix' specifies a prefix of URIs.
297    The return value is the URI of the next document, or `NULL' if no document is left. */
298 char *estiternext(ODEUM *odeum, const char *prefix);
299 
300 
301 /* Restore the cursor to the document following the last deleted document.
302    `odeum' specifies a database handle.
303    `uri' specifies the URI of the last deleted document.
304    The return value indicates whether the processing succeeded. */
305 int estiterresurge(ODEUM *odeum, const char *uri);
306 
307 
308 /* Set an environment variable.
309    `name' specifies the name of the environment variable.
310    `value' specifies the value to give to the variable. */
311 void estputenv(const char *name, const char *value);
312 
313 
314 /* Execute a command and get its output.
315    `command' specifies a command line.
316    `sp' specifies the pointer to a variable to which the size of the region of the return value
317    is assigned.  If it is `NULL', it is not used.
318    If successful, the return value is the pointer to the region containing the standard output
319    of the command; otherwise, it is `NULL'. */
320 char *estreadexec(const char *command, int *sp);
321 
322 
323 /* Make time data from a string of RFC822, RFC850, or ASCTIME.
324    `str' specifies a date string in the format of RFC822, RFC850, or ASCTIME.
325    The return value is the UNIX time as an `int', or -1 if the string is invalid. */
326 int eststrmktime(const char *str);
327 
328 
329 
330 #endif                                   /* duplication check */
331 
332 
333 /* END OF FILE */
334