1 /************************************************************************************************* 2 * Interface of common features 3 * Copyright (C) 2003-2006 Mikio Hirabayashi 4 * This file is part of Estraier, a personal full-text search system. 5 * Estraier is free software; you can redistribute it and/or modify it under the terms of the GNU 6 * General Public License as published by the Free Software Foundation; either version 2 of the 7 * License, or any later version. 8 * Estraier is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; 9 * without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 10 * See the GNU General Public License for more details. 11 * You should have received a copy of the GNU General Public License along with Estraier; 12 * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, 13 * MA 02111-1307 USA. 14 *************************************************************************************************/ 15 16 17 #ifndef _ESTCOMMON_H /* duplication check */ 18 #define _ESTCOMMON_H 19 20 21 22 /************************************************************************************************* 23 * headers 24 *************************************************************************************************/ 25 26 27 #include <depot.h> 28 #include <curia.h> 29 #include <cabin.h> 30 #include <villa.h> 31 #include <odeum.h> 32 33 #include <stdlib.h> 34 #include <stdio.h> 35 #include <string.h> 36 #include <assert.h> 37 #include <stdarg.h> 38 #include <limits.h> 39 #include <time.h> 40 #include <signal.h> 41 42 #ifdef MYHAVE_UNISTD_H 43 #include <unistd.h> 44 #endif 45 #ifdef MYHAVE_ERRNO_H 46 #include <errno.h> 47 #endif 48 #ifdef MYHAVE_SYS_TYPES_H 49 #include <sys/types.h> 50 #endif 51 #ifdef MYHAVE_SYS_TIME_H 52 #include <sys/time.h> 53 #endif 54 #ifdef MYHAVE_SYS_WAIT_H 55 #include <sys/wait.h> 56 #endif 57 #ifdef MYHAVE_SYS_SELECT_H 58 #include <sys/select.h> 59 #endif 60 #ifdef MYHAVE_SYS_SOCKET_H 61 #include <sys/socket.h> 62 #endif 63 #ifdef MYHAVE_NETINET_IN_H 64 #include <netinet/in.h> 65 #endif 66 #ifdef MYHAVE_ARPA_INET_H 67 #include <arpa/inet.h> 68 #endif 69 #ifdef MYHAVE_NETDB_H 70 #include <netdb.h> 71 #endif 72 #ifdef MYHAVE_FCNTL_H 73 #include <fcntl.h> 74 #endif 75 76 #if defined(__APPLE__) && defined(__MACH__) /* for Mac OS X */ 77 #define socklen_t int 78 #endif 79 80 81 82 83 /************************************************************************************************* 84 * macros 85 *************************************************************************************************/ 86 87 88 #ifndef _EST_VERSION /* dummy version */ 89 #define _EST_VERSION "0.0.0" 90 #endif 91 #ifndef _EST_PREFIX /* dummy prefix */ 92 #define _EST_PREFIX "/usr/local" 93 #endif 94 #ifndef _EST_BINDIR /* dummy bindir */ 95 #define _EST_BINDIR "/usr/local/bin" 96 #endif 97 #ifndef _EST_LEXEDIR /* dummy libexecdir */ 98 #define _EST_LEXEDIR "/usr/local/libexec" 99 #endif 100 #ifndef _EST_DATADIR /* dummy datadir */ 101 #define _EST_DATADIR "/usr/local/share/estraier" 102 #endif 103 104 #undef TRUE /* boolean true */ 105 #define TRUE 1 106 #undef FALSE /* boolean false */ 107 #define FALSE 0 108 109 #define ESTPATHCHR '/' /* delimiter character of path */ 110 #define ESTEXTCHR '.' /* delimiter character of extension */ 111 #define ESTCDIRSTR "." /* string of current directory */ 112 #define ESTPDIRSTR ".." /* string of parent directory */ 113 114 #define ESTLOCALE "C" /* name of the common locale */ 115 #define ESTDBGFDENV "ESTDBGFD" /* environment variable for debug fd */ 116 #define ESTMTDBNAME "_mtime" /* name of the database for last modified times */ 117 #define ESTMTDBLRM 81 /* records in a leaf node of time database */ 118 #define ESTMTDBNIM 192 /* records in a non-leaf node of time database */ 119 #define ESTMTDBLCN 64 /* number of leaf cache of time database */ 120 #define ESTMTDBNCN 32 /* number of non-leaf cache of time database */ 121 #define ESTSCDBNAME "_score" /* name of the database for scores */ 122 #define ESTSCDBBRAT 4 /* ratio of bnum and dnum of the score database */ 123 #define ESTSCDBDIVNUM 7 /* division number of the score database */ 124 #define ESTDTDBNAME "_date" /* name of the database for dates */ 125 #define ESTDTDBBRAT 4 /* ratio of bnum and dnum of the date database */ 126 #define ESTWDLSNAME "_wlist" /* name of the text file for word list */ 127 #define ESTFILTERFUNC "estfilter" /* name of dynamic linking functins for filter */ 128 #define ESTPATHBUFSIZ 2048 /* size of a path buffer */ 129 #define ESTNUMBUFSIZ 32 /* size of a number buffer */ 130 #define ESTDATEBUFSIZ 1024 /* size of a date buffer */ 131 #define ESTPETITBNUM 31 /* bucket number of a petit map */ 132 #define ESTCJKPMIN 0x20 /* minimum plain of UCS-2 handled as CJK text */ 133 #define ESTENCMISSMAX 16 /* max number of misses of encoding characters */ 134 #define ESTMIMEFOLD 60 /* folding bytes of MIME message */ 135 #define ESTMAXLOAD 0.9 /* max ratio of bucket loading */ 136 #define ESTKEYNUM 32 /* number of keywords to store */ 137 #define ESTWMINLEN 2 /* min length of a word */ 138 #define ESTWMAXLEN 32 /* max length of a word */ 139 140 #if defined(MYSTRICT) /* split text strictly by space */ 141 #define ESTISSTRICT TRUE 142 #define ESTDELIMCHARS "+,-.:;!\"#$%&'()*/<=>?@[\\]^`{|}~" 143 #else 144 #define ESTISSTRICT FALSE 145 #define ESTDELIMCHARS "" 146 #endif 147 148 #if defined(MYNOSTOPW) /* do not exclude stop words */ 149 #define ESTISNOSTOPW TRUE 150 #else 151 #define ESTISNOSTOPW FALSE 152 #endif 153 154 155 156 /************************************************************************************************* 157 * types and constants 158 *************************************************************************************************/ 159 160 161 enum { /* mode of estdocaddtext */ 162 ESTDOCBOTH, /* register both of normal and appearance words */ 163 ESTDOCNONLY, /* register both of normal words only */ 164 ESTDOCAONLY /* register both of appearance words only */ 165 }; 166 167 /* type of the pointer to a filter function. */ 168 typedef void (*ESTFILTER)(const char *infile, const char *outfile); 169 170 typedef struct { /* type of structure for a search word */ 171 int type; /* condition type */ 172 char *word; /* string of the word */ 173 int dnum; /* number of corresponding documents */ 174 CBLIST *evwords; /* evolved words from the regular expression */ 175 } ESTWORD; 176 177 enum { /* enumeration for condition types */ 178 ESTCONDAND, /* and search */ 179 ESTCONDOR, /* or search */ 180 ESTCONDNOT /* notand search */ 181 }; 182 183 184 185 /************************************************************************************************* 186 * global variables 187 *************************************************************************************************/ 188 189 190 extern int estisregex; /* whether regex is supported */ 191 extern int estisdlfunc; /* whether dlfunc is supported */ 192 extern int estiscjkuni; /* whether cjkuni is supported */ 193 extern int estischasen; /* whether chasen is supported */ 194 extern int estismecab; /* whether mecab is supported */ 195 extern int estiskakasi; /* whether kakasi is supported */ 196 197 198 199 /************************************************************************************************* 200 * functions 201 *************************************************************************************************/ 202 203 204 /* Make a document handle from a plain text. 205 `uri' specifies the URI of a document. 206 `text' specifies the data of the document. 207 `size' specifies the size of the text. 208 `code' specifies the character encoding of the text. If it is `NULL', the encoding is 209 detected automatically. 210 The return value is a document handle for Odeum. */ 211 ODDOC *estdocplain(const char *uri, const char *text, int size, const char *code); 212 213 214 /* Make a document handle from a HTML. 215 `uri' specifies the URI of a document. 216 `text' specifies the data of the document. 217 `size' specifies the size of the text. 218 `code' specifies the character encoding of the text. If it is `NULL', the encoding is 219 detected automatically. 220 The return value is a document handle for Odeum. */ 221 ODDOC *estdochtml(const char *uri, const char *text, int size, const char *code); 222 223 224 /* Make a document handle from a MIME. 225 `uri' specifies the URI of a document. 226 `text' specifies the string of the document. 227 `code' specifies the character encoding of the text. If it is `NULL', the encoding is 228 `nude' specifies whether attributes of the inner content to be prior. 229 detected automatically. 230 The return value is a document handle for Odeum. */ 231 ODDOC *estdocmime(const char *uri, const char *text, const char *code, int nude); 232 233 234 /* Break a text into words and register them to a document handle. 235 `doc' specifies a document handle. 236 `text' specifies a text. 237 `size' specifies the size of the text. 238 `code' specifies the character encoding of the text. If it is `NULL', the encoding is 239 detected automatically. 240 `mode' specifies detailed behavior. `ESTDOCBOTH' register both of normal and appearance 241 words. `ESTDOCBOTH' registers both of normal words only. `ESTDOCAONLY' register both of 242 appearance words only. */ 243 void estdocaddtext(ODDOC *doc, const char *text, int size, const char *code, int mode); 244 245 246 /* Get the handle of the filter function in a dynamic linking library. 247 `file' specifies the name of a dynamic linking library. 248 The return value is the pointer to the function or NULL on failure. */ 249 ESTFILTER estfilterget(const char *file); 250 251 252 /* Make a search words and their conditions from a search phrase. 253 `phrase' specifies a search phrase whose encoding is UTF-8. 254 `np' specifies the pointer to a variable which the number of elements of the return value 255 is assigned. 256 `norm' specifies whether to normalize each word. 257 The return value is an array whose elements are structures composed of search words and 258 their conditions. The returned array should be released with `estfreewords'. */ 259 ESTWORD *estsearchwords(const char *phrase, int *np, int norm); 260 261 262 /* Release regions of an array of search words. 263 `words' specifies an array made with `estsearchwords'. 264 `num' specifies the number of elements of the array. */ 265 void estfreewords(ESTWORD *words, int num); 266 267 268 /* Get search result with search words made with `estsearchwords'. 269 `odeum' specifies a database handle. 270 `words' specifies an array made with `estsearchwords'. 271 `wnum' specifies the number of elements of the array. 272 `unit' specifies search unit to specify accuracy. 273 `tfidf' specifies whether scores are tuned by TF-IDF method. 274 `np' specifies the pointer to a variable to which the number of elements of the return value 275 is assigned. 276 `lp' specifies the pointer to a variable to which the number of documents leaved in the index 277 is assgned. 278 `regex' specifies whether search words are treated as regular expressions. 279 `wild' specifies whether search words are treated as expressions with wild cards. 280 `reevmax' specifies the max number of evolved words from regular expressions or expressions with 281 wild cards. 282 The return value is an array as with `odsearch'. */ 283 ODPAIR *estsearch(ODEUM *odeum, ESTWORD *words, int wnum, int unit, int tfidf, int *np, int *lp, 284 int regex, int wild, int reevmax); 285 286 287 /* Initialize the iterator of a database. 288 `odeum' specifies a database handle. 289 `prefix' specifies a prefix of URIs. 290 The return value is whether the processing is success or not. */ 291 int estiterinit(ODEUM *odeum, const char *prefix); 292 293 294 /* Get the URI of the next document whose URI begins with a prefix. 295 `odeum' specifies a database handle. 296 `prefix' specifies a prefix of URIs. 297 The return value is the URI of the next document or `NULL' if no document is left. */ 298 char *estiternext(ODEUM *odeum, const char *prefix); 299 300 301 /* Resurge the cursor to the next document of the last deleted document. 302 `odeum' specifies a database handle. 303 `uri' specifies the URI of the last deleted document. 304 The return value is whether the processing is success or not. */ 305 int estiterresurge(ODEUM *odeum, const char *uri); 306 307 308 /* Set an environment variable. 309 `name' specifies the name of an environment variable. 310 `value' specifies the value of the variable. */ 311 void estputenv(const char *name, const char *value); 312 313 314 /* Execute a command and get the result. 315 `command' specifies a command line. 316 `sp' specifies the pointer to a variable to which the size of the region of the return value 317 is assigned. If it is `NULL', it is not used. 318 If successful, the return value is the pointer to the region containing the standard output 319 of the command, else, it is `NULL'. */ 320 char *estreadexec(const char *command, int *sp); 321 322 323 /* Make time data from a string of RFC822, RFC850, or ASCTIME. 324 `str' specifies a date string of RFC822, RFC850, or ASCTIME. 325 The return value is UNIX time or -1 if the string is invalid. */ 326 int eststrmktime(const char *str); 327 328 329 330 #endif /* duplication check */ 331 332 333 /* END OF FILE */ 334