1 /*************************************************************************************************
2  * Implementation of the core API
3  *                                                      Copyright (C) 2004-2007 Mikio Hirabayashi
4  * This file is part of Hyper Estraier.
5  * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6  * the GNU Lesser General Public License as published by the Free Software Foundation; either
7  * version 2.1 of the License or any later version.  Hyper Estraier is distributed in the hope
8  * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
10  * License for more details.
11  * You should have received a copy of the GNU Lesser General Public License along with Hyper
12  * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13  * Boston, MA 02111-1307 USA.
14  *************************************************************************************************/
15 
16 
17 #if defined(_MYVISTA)
18 #include <vista.h>
19 #endif
20 
21 #include "estraier.h"
22 #include "myconf.h"
23 
/* general buffer sizes and limits */
24 #define ESTNUMBUFSIZ   32                /* size of a buffer for a number */
25 #define ESTPATHBUFSIZ  4096              /* size of a buffer for a path */
26 #define ESTIOBUFSIZ    8192              /* size of a buffer for I/O */
27 #define ESTALLOCUNIT   1024              /* unit number of memory allocation */
28 #define ESTMINIBNUM    31                /* bucket number of map for attributes */
29 #define ESTSCANWNUM    256               /* number of words for scanning check */
30 #define ESTSIGNUM      64                /* number of signals */
31 #define ESTREGSUBMAX   32                /* maximum number of substrings for regex */
32 
/* name of the meta database and keys of the records stored in it */
33 #define ESTMETADBNAME  "_meta"           /* name of the meta database */
34 #define ESTKEYIDXNUM   "_idxnum"         /* key for the number of inverted indexes */
35 #define ESTKEYDSEQ     "_dseq"           /* key for the sequence for document IDs */
36 #define ESTKEYDNUM     "_dnum"           /* key for the number of documents */
37 #define ESTKEYMETA     "_meta"           /* key for meta data */
38 
/* name, tuning parameters, and file size limits of the inverted index */
39 #define ESTIDXDBNAME   "_idx"            /* name of the inverted index */
40 #define ESTIDXDBLRM    109               /* records in a leaf node of the inverted index */
41 #define ESTIDXDBLRMA   17                /* records in a leaf node of the index in APN mode */
42 #define ESTIDXDBNIM    160               /* records in a non-leaf node of the inverted index */
43 #define ESTIDXDBLCN    16                /* number of leaf cache of the inverted index */
44 #define ESTIDXDBNCN    16                /* number of non-leaf cache of the inverted index */
45 #define ESTIDXDBRLCN   128               /* number of leaf cache of the index reader */
46 #define ESTIDXDBRLCNA  32                /* number of leaf cache of the reader in APN mode */
47 #define ESTIDXDBRNCN   256               /* number of non-leaf cache of the index reader */
48 #define ESTIDXDBFBP    512               /* size of free block pool of the inverted index */
49 #define ESTIDXDBMIN    (1048576*512)     /* minimum size of a database file (512 * 2^20) */
50 #define ESTIDXDBMAX    (1048576*1536)    /* maximum size of a database file (1536 * 2^20) */
51 
/* name and tuning parameters of the database for forward matching */
52 #define ESTFWMDBNAME   "_fwm"            /* name of the database for forward matching */
53 #define ESTFWMDBLRM    251               /* records in a leaf node of forward matching DB */
54 #define ESTFWMDBNIM    110               /* records in a non-leaf node of forward matching DB */
55 #define ESTFWMDBLCN    32                /* number of leaf cache of forward matching DB */
56 #define ESTFWMDBNCN    16                /* number of non-leaf cache of forward matching DB */
57 #define ESTFWMDBFBP    128               /* size of free block pool of forward matching DB */
58 
/* name and tuning parameters of the auxiliary index (writer and reader) */
59 #define ESTAUXDBNAME   "_aux"            /* name of the auxiliary index */
60 #define ESTAUXDBLRM    23                /* records in a leaf node of the auxiliary index */
61 #define ESTAUXDBNIM    160               /* records in a non-leaf node of the auxiliary index */
62 #define ESTAUXDBLCN    16                /* number of leaf cache of the auxiliary index */
63 #define ESTAUXDBNCN    16                /* number of non-leaf cache of the auxiliary index */
64 #define ESTAUXDBRLCN   256               /* number of leaf cache of the auxiliary reader */
65 #define ESTAUXDBRNCN   64                /* number of non-leaf cache of the auxiliary reader */
66 #define ESTAUXDBFBP    256               /* size of free block pool of the auxiliary index */
67 
/* name and tuning parameters of the database for auxiliary forward matching (xfm) */
68 #define ESTXFMDBNAME   "_xfm"            /* name of the database for auxiliary forward matching */
69 #define ESTXFMDBLRM    111               /* records in a leaf node of xfm DB */
70 #define ESTXFMDBNIM    110               /* records in a non-leaf node of xfm DB */
71 #define ESTXFMDBLCN    32                /* number of leaf cache of xfm DB */
72 #define ESTXFMDBNCN    16                /* number of non-leaf cache of xfm DB */
73 #define ESTXFMDBFBP    128               /* size of free block pool of xfm DB */
74 
/* name and tuning parameters of the database for attributes */
75 #define ESTATTRDBNAME  "_attr"           /* name of the database for attributes */
76 #define ESTATTRDBBNUM  212987            /* bucket number of the database for attributes */
77 #define ESTATTRDBDNUM  3                 /* division number of the database for attributes */
78 #define ESTATTRDBALN   -5                /* alignment of the database for attributes */
79 #define ESTATTRDBFBP   64                /* size of free block pool of the attribute DB */
80 
/* name and tuning parameters of the database of texts */
81 #define ESTTEXTDBNAME  "_text"           /* name of the database of texts */
82 #define ESTTEXTDBBNUM  61417             /* bucket number of the database for texts */
83 #define ESTTEXTDBDNUM  7                 /* division number of the database for texts */
84 #define ESTTEXTDBALN   -5                /* alignment of the database for texts */
85 #define ESTTEXTDBFBP   128               /* size of free block pool of the text DB */
86 
/* name and tuning parameters of the database of keywords */
87 #define ESTKWDDBNAME   "_kwd"            /* name of the database of keywords */
88 #define ESTKWDDBBNUM   163819            /* bucket number of the database for keywords */
89 #define ESTKWDDBDNUM   3                 /* division number of the database for keywords */
90 #define ESTKWDDBALN    -5                /* alignment of the database for keywords */
91 #define ESTKWDDBFBP    64                /* size of free block pool of the keyword DB */
92 
/* name and tuning parameters of the database of document list */
93 #define ESTLISTDBNAME  "_list"           /* name of the database of document list */
94 #define ESTLISTDBLRM   99                /* records in a leaf node of document list DB */
95 #define ESTLISTDBNIM   200               /* records in a non-leaf node of document list DB */
96 #define ESTLISTDBLCN   64                /* number of leaf cache of document list DB */
97 #define ESTLISTDBNCN   16                /* number of non-leaf cache of document list DB */
98 #define ESTLISTDBFBP   128               /* size of free block pool of document list DB */
99 
/* name prefixes and tuning parameters of the databases for sequential access and narrowing */
100 #define ESTAISEQPREF   "__seq_"          /* prefix of the database for sequential access */
101 #define ESTAISTRPREF   "__str_"          /* prefix of the database for string narrowing */
102 #define ESTAINUMPREF   "__num_"          /* prefix of the database for number narrowing */
103 #define ESTAIBDIAM     0.8               /* diameter of the bucket number */
104 #define ESTAIDXLRM     99                /* records in a leaf node of narrowing index */
105 #define ESTAIDXNIM     120               /* records in a non-leaf node of narrowing index */
106 #define ESTAIDXLCN     1024              /* number of leaf cache of narrowing index */
107 #define ESTAIDXNCN     256               /* number of non-leaf cache of narrowing index */
108 #define ESTAIDXDPFBP   32                /* size of free block pool of sequential DB */
109 #define ESTAIDXVLFBP   128               /* size of free block pool of narrowing DB */
110 #define ESTAIKBUFSIZ   8192              /* size of a buffer for a key */
111 #define ESTAISNUMMIN   256               /* minimum number of scores to use narrowing index */
112 #define ESTOPDUMMY     "[DUMMY]"         /* dummy operator */
113 
/* ratios applied to bucket numbers and division numbers, one pair per size mode */
/* NOTE(review): the two "S" macros were commented as "large mode", identical to the "L" pair;
   presumably they belong to a small mode -- confirm against the code that uses them. */
114 #define ESTDBSBRAT     0.3               /* ratio of bucket numbers of small mode (presumably) */
115 #define ESTDBSDRAT     0.4               /* ratio of the division number of small mode (presumably) */
116 #define ESTDBLBRAT     3.0               /* ratio of bucket numbers of large mode */
117 #define ESTDBLDRAT     1.0               /* ratio of the division number of large mode */
118 #define ESTDBHBRAT     5.0               /* ratio of bucket numbers of huge mode */
119 #define ESTDBHDRAT     2.0               /* ratio of the division number of huge mode */
120 #define ESTDBH2RAT     1.4               /* ratio of huge mode second */
121 #define ESTDBH3RAT     2.0               /* ratio of huge mode third */
122 
/* division numbers of Villa databases */
123 #define ESTVLCRDNUM    2                 /* division number of usual Villa databases */
124 #define ESTVLCRDNAUX   7                 /* division number of the auxiliary index */
125 
/* parameters of the in-memory caches */
126 #define ESTIDXCCBNUM   524288            /* bucket number of cache for the inverted index */
127 #define ESTAUXCCBNUM   65521             /* bucket number of cache for the auxiliary index */
128 #define ESTIDXCCMAX    (1048576*64)      /* max size of the cache (64 * 2^20) */
129 #define ESTOUTCCBNUM   131072            /* bucket number of cache for deleted documents */
130 #define ESTKEYCCMNUM   65536             /* bucket number of cache for keys for TF-IDF */
131 #define ESTATTRCCMNUM  8192              /* number of cache for attributes */
132 #define ESTTEXTCCMNUM  1024              /* number of cache for texts */
133 #define ESTRESCCMNUM   256               /* number of cache for results */
134 #define ESTCCIRSLOT    256               /* slot timing for interruption */
135 #define ESTCCCBFREQ    10000             /* frequency of callback for flushing words */
136 
/* miscellaneous parameters: directory mode, character code checking, words, and allocation */
137 #define ESTDIRMODE     00755             /* permission of a creating directory (octal) */
138 #define ESTICCHECKSIZ  32768             /* size of checking character code */
139 #define ESTICMISSMAX   256               /* allowance number of missing characters */
140 #define ESTICALLWRAT   0.001             /* allowance ratio of missing characters */
141 #define ESTOCPOINT     16                /* point per occurrence */
142 #define ESTJHASHNUM    251               /* hash number for a junction */
143 #define ESTWORDMAXLEN  48                /* maximum length of a word */
144 #define ESTWORDAVGLEN  8                 /* average length of a word */
145 #define ESTATTRALW     1.5               /* allowance ratio of attribute narrowing */
146 #define ESTKEYSCALW    3                 /* allowance ratio of TF-IDF for keywords */
147 #define ESTMEMIRATIO   1.1               /* incremental ratio of memory allocation */
148 
/* parameters for scoring and searching */
149 #define ESTSCOREUNIT   1000              /* unit of standard deviation of scoring */
150 #define ESTAUXMIN      32                /* minimum hits to adopt the auxiliary index */
151 #define ESTAUXEXRAT    16                /* ratio of hits of keywords expansion */
152 #define ESTWILDMAX     256               /* maximum number of expansion of wild cards */
153 #define ESTECLKNUM     32                /* number of keywords to eclipse candidates */
154 #define ESTSMLRKNUM    16                /* number of keywords to get candidates */
155 #define ESTSMLRUNUM    1024              /* number of adopted documents for a keyword */
156 #define ESTSMLRMNUM    4096              /* maximum number of candidates to be checked */
157 #define ESTSMLRNMIN    0.5               /* the minimum value for narrowing */
158 
159 /* Set a buffer for a variable length number.
   `EST_len' is an lvalue which receives the number of bytes written into `EST_buf'.
   `EST_num' is converted to `int' and emitted as base-128 digits, least significant first.
   Every digit except the last is stored as the negative value `-digit - 1'; the last digit is
   stored as is (non-negative), which marks the end of the number.  Zero is stored as a single
   0 byte.  If `EST_num' is negative, nothing is written and `EST_len' is set to 0.
   `EST_buf' and `EST_len' are evaluated more than once, and no bound of `EST_buf' is checked
   by the macro itself. */
160 #define EST_SET_VNUMBUF(EST_len, EST_buf, EST_num) \
161   do { \
162     int _EST_num = (EST_num); \
163     div_t EST_d; \
164     if(_EST_num == 0){ \
165       ((signed char *)(EST_buf))[0] = 0; \
166       (EST_len) = 1; \
167     } else { \
168       (EST_len) = 0; \
169       while(_EST_num > 0){ \
170         EST_d = div(_EST_num, 128); \
171         _EST_num = EST_d.quot; \
172         if(_EST_num > 0){ \
173           ((signed char *)(EST_buf))[(EST_len)] = -EST_d.rem - 1; \
174         } else { \
175           ((signed char *)(EST_buf))[(EST_len)] = EST_d.rem; \
176         } \
177         (EST_len)++; \
178       } \
179     } \
180   } while(FALSE)
181 
182 /* Read a variable length number from a buffer.
   `EST_num' is an lvalue which receives the decoded value, and `EST_step' is an lvalue which
   receives the number of bytes consumed.  Bytes are read as signed chars from the head of
   `EST_buf': a negative byte `b' adds `(-b - 1)' times the current power of 128 and decoding
   goes on to the next byte; the first non-negative byte adds its own value times the current
   power of 128 and ends the number.
   The loop stops only at a non-negative byte; no length limit is applied by the macro.
   `EST_buf' and `EST_num' are evaluated more than once. */
183 #define EST_READ_VNUMBUF(EST_buf, EST_num, EST_step) \
184   do { \
185     int _EST_i, _EST_base; \
186     (EST_num) = 0; \
187     _EST_base = 1; \
188     for(_EST_i = 0; TRUE; _EST_i++){ \
189       if(((signed char *)(EST_buf))[_EST_i] >= 0){ \
190         (EST_num) += ((signed char *)(EST_buf))[_EST_i] * _EST_base; \
191         break; \
192       } \
193       (EST_num) += _EST_base * (((signed char *)(EST_buf))[_EST_i] + 1) * -1; \
194       _EST_base *= 128; \
195     } \
196     EST_step = _EST_i + 1; \
197   } while(FALSE)
198 
/* Handle of one attribute database together with the type of data it holds.
   `db' is an untyped pointer; NOTE(review): the concrete handle type presumably depends on
   `type' -- confirm against the code that opens these databases. */
199 typedef struct {                         /* type of structure for an attribute database */
200   void *db;                              /* handle of the database */
201   int type;                              /* data type */
202 } ESTATTRIDX;
203 
/* Character categories.  The values are consecutive, starting at 0 for ESTSPACECHR. */
204 enum {                                   /* enumeration for character categories */
205   ESTSPACECHR,                           /* space characters */
206   ESTDELIMCHR,                           /* delimiter characters */
207   ESTWESTALPH,                           /* west alphabets */
208   ESTEASTALPH,                           /* east alphabets */
209   ESTHIRAGANA,                           /* east alphabets: hiragana */
210   ESTKATAKANA,                           /* east alphabets: katakana */
211   ESTHANGUL,                             /* east alphabets: hangul */
212   ESTKANJI                               /* east alphabets: kanji */
213 };
214 
/* Flags for databases.  Each flag is a distinct single bit, so flags can be combined with
   bitwise OR. */
215 enum {                                   /* enumeration for flags for databases */
216   ESTDFPERFNG = 1 << 10,                 /* use perfect N-gram analyzer */
217   ESTDFCHRCAT = 1 << 11,                 /* use character category analyzer */
218   ESTDFZLIB = 1 << 15,                   /* compress records with ZLIB */
219   ESTDFLZO = 1 << 16,                    /* compress records with LZO */
220   ESTDFBZIP = 1 << 17,                   /* compress records with BZIP2 */
221   ESTDFSCVOID = 1 << 20,                 /* store scores as void */
222   ESTDFSCINT = 1 << 21,                  /* store scores as integer */
223   ESTDFSCASIS = 1 << 22                  /* refrain from adjustment of scores */
224 };
225 
/* Phrase formats.  The values are consecutive, starting at 0 for ESTPMUSUAL. */
226 enum {                                   /* enumeration for phrase format */
227   ESTPMUSUAL,                            /* usual phrase */
228   ESTPMSIMPLE,                           /* simplified phrase */
229   ESTPMROUGH,                            /* rough phrase */
230   ESTPMUNION,                            /* union phrase */
231   ESTPMISECT                             /* intersection phrase */
232 };
233 
/* One search hit: a document ID with its score and an optional attribute value used as the
   sorting key. */
234 typedef struct {                         /* type of structure for a hitting object */
235   int id;                                /* ID of a document */
236   int score;                             /* score tuned by TF-IDF */
237   char *value;                           /* value of an attribute for sorting */
238 } ESTSCORE;
239 
/* One attribute search condition: the attribute name (or names), the operator, and the value
   kept in several prepared forms (as given, in small cases, compiled regular expression, and
   numeric). */
240 typedef struct {                         /* type of structure for a conditional attribute */
241   char *name;                            /* name */
242   int nsiz;                              /* size of the name */
243   CBLIST *nlist;                         /* list of plural names */
244   char *oper;                            /* operator */
245   char *val;                             /* value */
246   int vsiz;                              /* size of the value */
247   const char *cop;                       /* canonical operator */
248   int sign;                              /* positive or negative */
249   char *sval;                            /* value of small cases */
250   int ssiz;                              /* size of the small value */
251   void *regex;                           /* compiled regular expressions */
252   time_t num;                            /* numeric value */
253 } ESTCATTR;
254 
/* A keyword paired with its score.  `word' is a const pointer, so the structure refers to the
   keyword text rather than owning a modifiable copy. */
255 typedef struct {                         /* type of structure for a scored keyword */
256   const char *word;                      /* face of keyword */
257   int wsiz;                              /* size of the keyword */
258   int pt;                                /* score tuned by TF-IDF */
259 } ESTKEYSC;
260 
/* One search hit of a meta search: the same fields as ESTSCORE plus the index of the database
   that contains the document. */
261 typedef struct {                         /* type of structure for a meta hitting object */
262   int db;                                /* index of a container database */
263   int id;                                /* ID of a document */
264   int score;                             /* score tuned by TF-IDF */
265   char *value;                           /* value of an attribute for sorting */
266 } ESTMETASCORE;
267 
268 
269 /* private function prototypes
   Forward declarations of the file-local (static) helper functions, so that they can be
   referenced before the point where they are defined. */
270 static void est_set_ecode(int *ecp, int value, int line);
271 static char *est_hex_encode(const char *str);
272 static char *est_hex_decode(const char *str);
273 static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode);
274 static void est_normalize_text(unsigned char *utext, int size, int *sp);
275 static void est_canonicalize_text(unsigned char *utext, int size, int funcspc);
276 static int est_char_category(int c);
277 static int est_char_category_perfng(int c);
278 static int est_char_category_chrcat(int c);
279 static char *est_make_snippet(const char *str, int len, const CBLIST *words,
280                               int wwidth, int hwidth, int awidth);
281 static int est_check_cjk_only(const char *str);
282 static char *est_phrase_from_simple(const char *sphrase);
283 static char *est_phrase_from_rough(const char *rphrase);
284 static char *est_phrase_from_union(const char *uphrase);
285 static char *est_phrase_from_isect(const char *iphrase);
286 static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
287                                  int size, int awsiz, CBDATUM *res, const CBLIST *rwords);
288 static int est_str_fwmatch_wide(const unsigned char *haystack, int hsiz,
289                                 const unsigned char *needle, int nsiz);
290 static char *est_strstr_sparse(const char *haystack, const char *needle);
291 static int est_idx_rec_last_id(const char *vbuf, int vsiz, int smode);
292 static void est_encode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int lid, int smode);
293 static void est_decode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int smode);
294 static ESTIDX *est_idx_open(const char *name, int omode, int dnum);
295 static int est_idx_close(ESTIDX *idx);
296 static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum,
297                                int fbpsiz);
298 static void est_idx_increment(ESTIDX *idx);
299 static int est_idx_dnum(ESTIDX *idx);
300 static int est_idx_add(ESTIDX *idx, const char *word, int wsiz,
301                        const char *vbuf, int vsiz, int smode);
302 static int est_idx_put_one(ESTIDX *idx, int inum, const char *word, int wsiz,
303                            const char *vbuf, int vsiz);
304 static int est_idx_out(ESTIDX *idx, const char *word, int wsiz);
305 static char *est_idx_scan(ESTIDX *idx, const char *word, int wsiz, int *sp, int smode);
306 static const char *est_idx_get_one(ESTIDX *idx, int inum, const char *word, int wsiz, int *sp);
307 static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz);
308 static int est_idx_num(ESTIDX *idx);
309 static double est_idx_size(ESTIDX *idx);
310 static int est_idx_size_current(ESTIDX *idx);
311 static int est_idx_memflush(ESTIDX *idx);
312 static int est_idx_sync(ESTIDX *idx);
313 static int est_idx_optimize(ESTIDX *idx);
314 static void est_idx_set_current(ESTIDX *idx);
315 static int est_crput(CURIA *curia, int zmode, int id, const char *vbuf, int vsiz, int dmode);
316 static int est_crout(CURIA *curia, int id);
317 static char *est_crget(CURIA *curia, int flags, int id, int *sp);
318 static int est_aidx_seq_put(DEPOT *db, int id, const char *vbuf, int vsiz);
319 static int est_aidx_seq_out(DEPOT *db, int id);
320 static char *est_aidx_seq_get(DEPOT *db, int id, int *sp);
321 static int est_aidx_seq_narrow(DEPOT *db, const CBLIST *pdocs, const char *cop, int sign,
322                                const char *oval, int osiz, const char *sval, int ssiz,
323                                const void *regex, int onum, ESTSCORE *scores, int snum,
324                                int limit, int *restp);
325 static int est_aidx_numcmp(const char *aptr, int asiz, const char *bptr, int bsiz);
326 static int est_aidx_attr_put(VILLA *db, int id, const char *vbuf, int vsiz);
327 static int est_aidx_attr_out(VILLA *db, int id, const char *vbuf, int vsiz);
328 static int est_aidx_attr_narrow(VILLA *db, const CBLIST *pdocs, const char *cop, int sign,
329                                 const char *oval, int osiz, const char *sval, int ssiz,
330                                 const void *regex, int onum, ESTSCORE *scores, int snum);
331 static int est_int_compare(const void *ap, const void *bp);
332 static int est_short_compare(const void *ap, const void *bp);
333 static void est_inodes_delete(void *arg);
334 static void est_inodes_delete_informer(const char *msg, void *opaque);
335 static int est_db_write_meta(ESTDB *db);
336 static void est_db_inform(ESTDB *db, const char *info);
337 static void est_db_prepare_meta(ESTDB *db);
338 static int est_db_score_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond, int *scp);
339 static int est_pidx_uri_to_id(ESTDB *db, const char *uri);
340 static CBLIST *est_phrase_terms(const char *phrase);
341 static int est_score_compare_by_id_asc(const void *ap, const void *bp);
342 static int est_score_compare_by_id_desc(const void *ap, const void *bp);
343 static int est_score_compare_by_score_asc(const void *ap, const void *bp);
344 static int est_score_compare_by_score_desc(const void *ap, const void *bp);
345 static int est_score_compare_by_str_asc(const void *ap, const void *bp);
346 static int est_score_compare_by_str_desc(const void *ap, const void *bp);
347 static int est_score_compare_by_num_asc(const void *ap, const void *bp);
348 static int est_score_compare_by_num_desc(const void *ap, const void *bp);
349 static int est_metascore_compare_by_id_asc(const void *ap, const void *bp);
350 static int est_metascore_compare_by_id_desc(const void *ap, const void *bp);
351 static int est_metascore_compare_by_score_asc(const void *ap, const void *bp);
352 static int est_metascore_compare_by_score_desc(const void *ap, const void *bp);
353 static int est_metascore_compare_by_str_asc(const void *ap, const void *bp);
354 static int est_metascore_compare_by_str_desc(const void *ap, const void *bp);
355 static int est_metascore_compare_by_num_asc(const void *ap, const void *bp);
356 static int est_metascore_compare_by_num_desc(const void *ap, const void *bp);
357 static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add);
358 static void est_expand_word_bw(ESTDB *db, const char *word, CBLIST *list);
359 static void est_expand_word_ew(ESTDB *db, const char *word, CBLIST *list);
360 static void est_expand_word_rx(ESTDB *db, const char *word, CBLIST *list);
361 static void est_expand_keyword_bw(ESTDB *db, const char *word, CBLIST *list);
362 static void est_expand_keyword_ew(ESTDB *db, const char *word, CBLIST *list);
363 static void est_expand_keyword_rx(ESTDB *db, const char *word, CBLIST *list);
364 static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
365                                   void (*xpn)(const char *, CBLIST *),
366                                   int *nump, CBMAP *hints, int add, int auxmin, CBMAP *auxwords);
367 static const ESTSCORE *est_rescc_get(ESTDB *db, const char *word, int size, int *nump);
368 static void est_rescc_put(ESTDB *db, const char *word, int size, ESTSCORE *scores, int num);
369 static ESTSCORE *est_search_keywords(ESTDB *db, const char *word, int min, int *nump);
370 static void est_weight_keywords(ESTDB *db, const char *word, ESTSCORE *scores, int snum);
371 static ESTSCORE *est_search_rank(ESTDB *db, const char *name, int top, int *nump);
372 static ESTSCORE *est_search_aidx_attr(ESTDB *db, const char *expr, int *nump);
373 static ESTSCORE *est_search_pidxs(ESTDB *db, ESTCOND *cond, ESTSCORE *scores, int *nump,
374                                   CBMAP *ordattrs);
375 static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, int ign,
376                              const char *order, const char *distinct, ESTSCORE *scores, int snum,
377                              int limit, int *restp, CBMAP *ordattrs);
378 static ESTCATTR *est_make_cattr_list(const CBLIST *attrs, int *nump);
379 static void est_free_cattr_list(ESTCATTR *list, int anum);
380 static int est_eclipse_scores(ESTDB *db, ESTSCORE *scores, int snum, int num,
381                               int vnum, int tfidf, double limit, CBMAP *shadows);
382 static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
383                           const char *oval, int osiz, const char *sval, int ssiz,
384                           const void *regex, int onum);
385 static int est_check_strand(const char *tval, const char *oval);
386 static int est_check_stror(const char *tval, const char *oval);
387 static int est_check_stroreq(const char *tval, const char *oval);
388 static int est_check_numbt(const char *tval, const char *oval);
389 static int est_keysc_compare(const void *ap, const void *bp);
390 static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
391                                     int knum, int unum, int mnum, int tfidf,
392                                     double nmin, int auxmin, CBMAP *auxwords);
393 static CBMAP *est_phrase_vector(const char *phrase);
394 static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf);
395 static int est_url_sameness(const char *aurl, const char *burl);
396 static void est_random_fclose(void);
397 static int est_signal_dispatch(int signum);
398 
399 
400 
401 /*************************************************************************************************
402  * common settings
403  *************************************************************************************************/
404 
405 
406 /* version of Hyper Estraier
   Exported string initialized from the `_EST_VERSION' macro, which is defined outside the part
   of the file shown here. */
407 const char *est_version = _EST_VERSION;
408 
409 
410 
411 /*************************************************************************************************
412  * API for document
413  *************************************************************************************************/
414 
415 
416 /* Create a document object. */
est_doc_new(void)417 ESTDOC *est_doc_new(void){
418   ESTDOC *doc;
419   CB_MALLOC(doc, sizeof(ESTDOC));
420   doc->id = -1;
421   doc->attrs = NULL;
422   doc->dtexts = NULL;
423   doc->kwords = NULL;
424   return doc;
425 }
426 
427 
428 /* Create a document object made from draft data. */
est_doc_new_from_draft(const char * draft)429 ESTDOC *est_doc_new_from_draft(const char *draft){
430   ESTDOC *doc;
431   CBLIST *lines;
432   const char *line;
433   char *pv, *rp, *ep;
434   int i;
435   assert(draft);
436   doc = est_doc_new();
437   lines = cbsplit(draft, -1, "\n");
438   for(i = 0; i < CB_LISTNUM(lines); i++){
439     line = CB_LISTVAL(lines, i);
440     while(*line > '\0' && *line <= ' '){
441       line++;
442     }
443     if(*line == '\0'){
444       i++;
445       break;
446     }
447     if(*line == '%'){
448       if(cbstrfwmatch(line, ESTDCNTLVECTOR)){
449         if(!doc->kwords) doc->kwords = cbmapopenex(ESTMINIBNUM);
450         if((rp = strchr(line, '\t')) != NULL) rp++;
451         while(rp && (pv = strchr(rp, '\t')) != NULL){
452           pv++;
453           if((ep = strchr(pv, '\t')) != NULL){
454             *ep = '\0';
455             ep++;
456           }
457           if(rp[0] != '\0' && pv[0] != '\0') cbmapput(doc->kwords, rp, pv - rp - 1, pv, -1, TRUE);
458           rp = ep;
459         }
460       } else if(cbstrfwmatch(line, ESTDCNTLSCORE)){
461         if((rp = strchr(line, '\t')) != NULL) est_doc_set_score(doc, atoi(rp + 1));
462       }
463     } else if((pv = strchr(line, '=')) != NULL){
464       *(pv++) = '\0';
465       est_doc_add_attr(doc, line, pv);
466     }
467   }
468   for(; i < CB_LISTNUM(lines); i++){
469     line = CB_LISTVAL(lines, i);
470     if(*line == '\t'){
471       est_doc_add_hidden_text(doc, line + 1);
472     } else {
473       est_doc_add_text(doc, line);
474     }
475   }
476   CB_LISTCLOSE(lines);
477   return doc;
478 }
479 
480 
481 /* Destroy a document object. */
est_doc_delete(ESTDOC * doc)482 void est_doc_delete(ESTDOC *doc){
483   assert(doc);
484   if(doc->kwords) cbmapclose(doc->kwords);
485   if(doc->dtexts) CB_LISTCLOSE(doc->dtexts);
486   if(doc->attrs) cbmapclose(doc->attrs);
487   free(doc);
488 }
489 
490 
491 /* Add an attribute to a document object. */
est_doc_add_attr(ESTDOC * doc,const char * name,const char * value)492 void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value){
493   char *rbuf, *wp;
494   int len;
495   assert(doc && name);
496   if(name[0] == '\0' || name[0] == '%') return;
497   if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
498   if(value){
499     rbuf = cbmemdup(value, -1);
500     for(wp = rbuf; *wp != '\0'; wp++){
501       if(*wp > 0 && *wp < ' ') *wp = ' ';
502     }
503     cbstrsqzspc(rbuf);
504     if((len = strlen(name)) > 0) cbmapput(doc->attrs, name, len, rbuf, -1, TRUE);
505     free(rbuf);
506   } else {
507     cbmapout(doc->attrs, name, -1);
508   }
509 }
510 
511 
512 /* Add a sentence of text to a document object. */
est_doc_add_text(ESTDOC * doc,const char * text)513 void est_doc_add_text(ESTDOC *doc, const char *text){
514   unsigned char *utext;
515   char *rtext, *wp;
516   int size;
517   assert(doc && text);
518   while(*text > '\0' && *text <= ' '){
519     text++;
520   }
521   if(text[0] == '\0') return;
522   if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
523   utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
524   est_normalize_text(utext, size, &size);
525   rtext = est_uconv_out((char *)utext, size, NULL);
526   for(wp = rtext; *wp != '\0'; wp++){
527     if(*wp > 0 && *wp < ' ') *wp = ' ';
528   }
529   cbstrsqzspc(rtext);
530   if(rtext[0] != '\0'){
531     CB_LISTPUSHBUF(doc->dtexts, rtext, strlen(rtext));
532   } else {
533     free(rtext);
534   }
535   free(utext);
536 }
537 
538 
539 /* Add a hidden sentence to a document object. */
est_doc_add_hidden_text(ESTDOC * doc,const char * text)540 void est_doc_add_hidden_text(ESTDOC *doc, const char *text){
541   unsigned char *utext;
542   char *rtext, *wp;
543   int size;
544   assert(doc && text);
545   while(*text > '\0' && *text <= ' '){
546     text++;
547   }
548   if(text[0] == '\0') return;
549   utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
550   est_normalize_text(utext, size, &size);
551   rtext = est_uconv_out((char *)utext, size, NULL);
552   for(wp = rtext; *wp != '\0'; wp++){
553     if(*wp > 0 && *wp < ' ') *wp = ' ';
554   }
555   cbstrsqzspc(rtext);
556   if(rtext[0] != '\0'){
557     if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
558     if(cbmapget(doc->attrs, "", 0, NULL)) cbmapputcat(doc->attrs, "", 0, " ", 1);
559     cbmapputcat(doc->attrs, "", 0, rtext, -1);
560   }
561   free(rtext);
562   free(utext);
563 }
564 
565 
566 /* Attach keywords to a document object. */
est_doc_set_keywords(ESTDOC * doc,CBMAP * kwords)567 void est_doc_set_keywords(ESTDOC *doc, CBMAP *kwords){
568   assert(doc && kwords);
569   if(doc->kwords) cbmapclose(doc->kwords);
570   doc->kwords = cbmapdup(kwords);
571 }
572 
573 
574 /* Set the substitute score of a document object. */
est_doc_set_score(ESTDOC * doc,int score)575 void est_doc_set_score(ESTDOC *doc, int score){
576   char numbuf[ESTNUMBUFSIZ];
577   assert(doc);
578   if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
579   if(score >= 0){
580     sprintf(numbuf, "%d", score);
581     cbmapput(doc->attrs, "\t", 1, numbuf, -1, TRUE);
582   } else {
583     cbmapout(doc->attrs, "\t", 1);
584   }
585 }
586 
587 
588 /* Get the ID number of a document object. */
est_doc_id(ESTDOC * doc)589 int est_doc_id(ESTDOC *doc){
590   assert(doc);
591   return doc->id;
592 }
593 
594 
595 /* Get a list of attribute names of a document object. */
est_doc_attr_names(ESTDOC * doc)596 CBLIST *est_doc_attr_names(ESTDOC *doc){
597   CBLIST *names;
598   const char *kbuf;
599   int ksiz;
600   assert(doc);
601   if(!doc->attrs){
602     CB_LISTOPEN(names);
603     return names;
604   }
605   CB_LISTOPEN(names);
606   cbmapiterinit(doc->attrs);
607   while((kbuf = cbmapiternext(doc->attrs, &ksiz)) != NULL){
608     if(ksiz > 0 && kbuf[0] != '\t') CB_LISTPUSH(names, kbuf, ksiz);
609   }
610   cblistsort(names);
611   return names;
612 }
613 
614 
615 /* Get the value of an attribute of a document object. */
est_doc_attr(ESTDOC * doc,const char * name)616 const char *est_doc_attr(ESTDOC *doc, const char *name){
617   assert(doc && name);
618   if(!doc->attrs || name[0] == '\0') return NULL;
619   return cbmapget(doc->attrs, name, -1, NULL);
620 }
621 
622 
623 /* Get a list of sentences of the text of a document object. */
est_doc_texts(ESTDOC * doc)624 const CBLIST *est_doc_texts(ESTDOC *doc){
625   assert(doc);
626   if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
627   return doc->dtexts;
628 }
629 
630 
631 /* Concatenate sentences of the text of a document object. */
est_doc_cat_texts(ESTDOC * doc)632 char *est_doc_cat_texts(ESTDOC *doc){
633   CBDATUM *datum;
634   const char *elem;
635   int i, size;
636   if(!doc->dtexts) return cbmemdup("", 0);
637   CB_DATUMOPEN(datum);
638   for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
639     elem = CB_LISTVAL2(doc->dtexts, i, size);
640     if(i > 0) CB_DATUMCAT(datum, " ", 1);
641     CB_DATUMCAT(datum, elem, size);
642   }
643   return cbdatumtomalloc(datum, NULL);
644 }
645 
646 
647 /* Get attached keywords of a document object. */
est_doc_keywords(ESTDOC * doc)648 CBMAP *est_doc_keywords(ESTDOC *doc){
649   assert(doc);
650   return doc->kwords;
651 }
652 
653 
654 /* Get the substitute score of a document object. */
est_doc_score(ESTDOC * doc)655 int est_doc_score(ESTDOC *doc){
656   const char *vbuf;
657   assert(doc);
658   if(doc->attrs && (vbuf = cbmapget(doc->attrs, "\t", 1, NULL)) != NULL) return atoi(vbuf);
659   return -1;
660 }
661 
662 
663 /* Dump draft data of a document object. */
est_doc_dump_draft(ESTDOC * doc)664 char *est_doc_dump_draft(ESTDOC *doc){
665   CBLIST *list;
666   CBDATUM *datum;
667   const char *kbuf, *vbuf;
668   int i, ksiz, vsiz;
669   assert(doc);
670   CB_DATUMOPEN(datum);
671   if(doc->attrs){
672     list = est_doc_attr_names(doc);
673     for(i = 0; i < CB_LISTNUM(list); i++){
674       kbuf = CB_LISTVAL2(list, i, ksiz);
675       vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz);
676       CB_DATUMCAT(datum, kbuf, ksiz);
677       CB_DATUMCAT(datum, "=", 1);
678       CB_DATUMCAT(datum, vbuf, vsiz);
679       CB_DATUMCAT(datum, "\n", 1);
680     }
681     CB_LISTCLOSE(list);
682   }
683   if(doc->kwords && cbmaprnum(doc->kwords) > 0){
684     CB_DATUMCAT(datum, ESTDCNTLVECTOR, strlen(ESTDCNTLVECTOR));
685     cbmapiterinit(doc->kwords);
686     while((kbuf = cbmapiternext(doc->kwords, &ksiz)) != NULL){
687       CB_MAPITERVAL(vbuf, kbuf, vsiz);
688       CB_DATUMCAT(datum, "\t", 1);
689       CB_DATUMCAT(datum, kbuf, ksiz);
690       CB_DATUMCAT(datum, "\t", 1);
691       CB_DATUMCAT(datum, vbuf, vsiz);
692     }
693     CB_DATUMCAT(datum, "\n", 1);
694   }
695   if(doc->attrs && (vbuf = cbmapget(doc->attrs, "\t", 1, &vsiz)) != NULL){
696     CB_DATUMCAT(datum, ESTDCNTLSCORE, strlen(ESTDCNTLSCORE));
697     CB_DATUMCAT(datum, "\t", 1);
698     CB_DATUMCAT(datum, vbuf, vsiz);
699     CB_DATUMCAT(datum, "\n", 1);
700   }
701   CB_DATUMCAT(datum, "\n", 1);
702   if(doc->dtexts){
703     for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
704       kbuf = CB_LISTVAL2(doc->dtexts, i, ksiz);
705       CB_DATUMCAT(datum, kbuf, ksiz);
706       CB_DATUMCAT(datum, "\n", 1);
707     }
708   }
709   if(doc->attrs && (vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
710     CB_DATUMCAT(datum, "\t", 1);
711     CB_DATUMCAT(datum, vbuf, vsiz);
712     CB_DATUMCAT(datum, "\n", 1);
713   }
714   return cbdatumtomalloc(datum, NULL);
715 }
716 
717 
718 /* Make a snippet of the body text of a document object. */
est_doc_make_snippet(ESTDOC * doc,const CBLIST * words,int wwidth,int hwidth,int awidth)719 char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth){
720   CBDATUM *sbuf;
721   const char *text;
722   char *snippet;
723   int i, size;
724   assert(doc && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
725   if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
726   CB_DATUMOPEN(sbuf);
727   for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
728     text = CB_LISTVAL2(doc->dtexts, i, size);
729     if(i > 0) CB_DATUMCAT(sbuf, " ", 1);
730     CB_DATUMCAT(sbuf, text, size);
731   }
732   snippet = est_make_snippet(CB_DATUMPTR(sbuf), CB_DATUMSIZE(sbuf),
733                              words, wwidth, hwidth, awidth);
734   CB_DATUMCLOSE(sbuf);
735   return snippet;
736 }
737 
738 
739 
740 /*************************************************************************************************
741  * API for search conditions
742  *************************************************************************************************/
743 
744 
745 /* Create a condition object. */
est_cond_new(void)746 ESTCOND *est_cond_new(void){
747   ESTCOND *cond;
748   CB_MALLOC(cond, sizeof(ESTCOND));
749   cond->phrase = NULL;
750   cond->gstep = 2;
751   cond->tfidf = TRUE;
752   cond->pmode = ESTPMUSUAL;
753   cond->cbxpn = NULL;
754   cond->attrs = NULL;
755   cond->order = NULL;
756   cond->max = -1;
757   cond->skip = 0;
758   cond->auxmin = ESTAUXMIN;
759   cond->auxwords = NULL;
760   cond->scfb = FALSE;
761   cond->scores = NULL;
762   cond->snum = 0;
763   cond->nscores = NULL;
764   cond->nsnum = -1;
765   cond->opts = 0;
766   cond->ecllim = -1.0;
767   cond->shadows = NULL;
768   cond->distinct = NULL;
769   cond->mask = 0;
770   return cond;
771 }
772 
773 
774 /* Destroy a condition object. */
est_cond_delete(ESTCOND * cond)775 void est_cond_delete(ESTCOND *cond){
776   assert(cond);
777   if(cond->distinct) free(cond->distinct);
778   if(cond->shadows) cbmapclose(cond->shadows);
779   if(cond->auxwords) cbmapclose(cond->auxwords);
780   if(cond->scores) free(cond->scores);
781   if(cond->order) free(cond->order);
782   if(cond->attrs) CB_LISTCLOSE(cond->attrs);
783   if(cond->phrase) free(cond->phrase);
784   free(cond);
785 }
786 
787 
788 /* Set a search phrase to a condition object. */
est_cond_set_phrase(ESTCOND * cond,const char * phrase)789 void est_cond_set_phrase(ESTCOND *cond, const char *phrase){
790   assert(cond && phrase);
791   if(cond->phrase) free(cond->phrase);
792   while(*phrase > '\0' && *phrase <= ' '){
793     phrase++;
794   }
795   cond->phrase = cbmemdup(phrase, -1);
796 }
797 
798 
799 /* Add a condition of an attribute fo a condition object. */
est_cond_add_attr(ESTCOND * cond,const char * expr)800 void est_cond_add_attr(ESTCOND *cond, const char *expr){
801   assert(cond && expr);
802   while(*expr > '\0' && *expr <= ' '){
803     expr++;
804   }
805   if(*expr == '\0') return;
806   if(!cond->attrs) CB_LISTOPEN(cond->attrs);
807   CB_LISTPUSH(cond->attrs, expr, strlen(expr));
808 }
809 
810 
811 /* Set the order of a condition object. */
est_cond_set_order(ESTCOND * cond,const char * expr)812 void est_cond_set_order(ESTCOND *cond, const char *expr){
813   assert(cond && expr);
814   while(*expr > '\0' && *expr <= ' '){
815     expr++;
816   }
817   if(*expr == '\0') return;
818   if(cond->order) free(cond->order);
819   cond->order = cbmemdup(expr, -1);
820 }
821 
822 
823 /* Set the maximum number of retrieval of a condition object. */
est_cond_set_max(ESTCOND * cond,int max)824 void est_cond_set_max(ESTCOND *cond, int max){
825   assert(cond && max >= 0);
826   cond->max = max;
827 }
828 
829 
830 /* Set the number of skipped documents of a condition object. */
est_cond_set_skip(ESTCOND * cond,int skip)831 void est_cond_set_skip(ESTCOND *cond, int skip){
832   assert(cond && skip >= 0);
833   cond->skip = skip;
834 }
835 
836 
837 /* Set options of retrieval of a condition object. */
est_cond_set_options(ESTCOND * cond,int options)838 void est_cond_set_options(ESTCOND *cond, int options){
839   assert(cond);
840   if(options & ESTCONDSURE) cond->gstep = 1;
841   if(options & ESTCONDUSUAL) cond->gstep = 2;
842   if(options & ESTCONDFAST) cond->gstep = 3;
843   if(options & ESTCONDAGITO) cond->gstep = 4;
844   if(options & ESTCONDNOIDF) cond->tfidf = FALSE;
845   if(options & ESTCONDSIMPLE) cond->pmode = ESTPMSIMPLE;
846   if(options & ESTCONDROUGH) cond->pmode = ESTPMROUGH;
847   if(options & ESTCONDUNION) cond->pmode = ESTPMUNION;
848   if(options & ESTCONDISECT) cond->pmode = ESTPMISECT;
849   if(options & ESTCONDSCFB) cond->scfb = TRUE;
850   cond->opts |= options;
851 }
852 
853 
854 /* Set permission to adopt result of the auxiliary index. */
est_cond_set_auxiliary(ESTCOND * cond,int min)855 void est_cond_set_auxiliary(ESTCOND *cond, int min){
856   assert(cond);
857   cond->auxmin = min;
858 }
859 
860 
861 /* Set the upper limit of similarity for document eclipse. */
est_cond_set_eclipse(ESTCOND * cond,double limit)862 void est_cond_set_eclipse(ESTCOND *cond, double limit){
863   assert(cond);
864   if(limit > 0.0) cond->ecllim = limit;
865 }
866 
867 
868 /* Set the attribute distinction filter. */
est_cond_set_distinct(ESTCOND * cond,const char * name)869 void est_cond_set_distinct(ESTCOND *cond, const char *name){
870   assert(cond && name);
871   while(*name > '\0' && *name <= ' '){
872     name++;
873   }
874   if(*name == '\0') return;
875   if(cond->distinct) free(cond->distinct);
876   cond->distinct = cbmemdup(name, -1);
877 }
878 
879 
880 /* Set the mask of targets of meta search. */
est_cond_set_mask(ESTCOND * cond,int mask)881 void est_cond_set_mask(ESTCOND *cond, int mask){
882   assert(cond);
883   cond->mask = mask & INT_MAX;
884 }
885 
886 
887 
888 /*************************************************************************************************
889  * API for database
890  *************************************************************************************************/
891 
892 
/* Inode map for duplication check.
   The key is the inode number (an int obtained with est_inode) of a database directory and
   the value is the pointer to the ESTDB handle opened on it.  est_db_open creates the map on
   first use and registers it with cbglobalgc, refuses to open a directory whose inode is
   already present unless ESTDBNOLCK is given, and adds an entry on success.  est_db_close
   removes the entry. */
CBMAP *est_inodes = NULL;
895 
896 
897 /* Get the string of an error code. */
est_err_msg(int ecode)898 const char *est_err_msg(int ecode){
899   switch(ecode){
900   case ESTENOERR: return "no error";
901   case ESTEINVAL: return "invalid argument";
902   case ESTEACCES: return "access forbidden";
903   case ESTELOCK: return "lock failure";
904   case ESTEDB: return "database problem";
905   case ESTEIO: return "I/O problem";
906   case ESTENOITEM: return "no such item";
907   default: break;
908   }
909   return "miscellaneous";
910 }
911 
912 
913 /* Open a database. */
est_db_open(const char * name,int omode,int * ecp)914 ESTDB *est_db_open(const char *name, int omode, int *ecp){
915   ESTDB *db;
916   DEPOT *metadb;
917   ESTIDX *idxdb;
918   CURIA *attrdb, *textdb, *kwddb;
919   VILLA *fwmdb, *auxdb, *xfmdb, *listdb;
920   CBMAP *aidxs;
921   CBLIST *list;
922   ESTATTRIDX attridx;
923   void *aidxdb;
924   const char *elem;
925   char path[ESTPATHBUFSIZ], vbuf[ESTNUMBUFSIZ], *dec;
926   int i, inode, domode, comode, vomode, flags, idxnum, dseq, dnum;
927   int amode, zmode, smode, vsiz, type, crdnum;
928   double bdiam, ddiam;
929   assert(name && ecp);
930   if(!est_inodes){
931     est_inodes = cbmapopenex(ESTMINIBNUM);
932     cbglobalgc(est_inodes, est_inodes_delete);
933   }
934   est_set_ecode(ecp, ESTENOERR, __LINE__);
935   if((omode & ESTDBWRITER) && (omode & ESTDBCREAT) && !est_mkdir(name)){
936     switch(errno){
937     case EACCES:
938       est_set_ecode(ecp, ESTEACCES, __LINE__);
939       return NULL;
940     case EEXIST:
941       break;
942     default:
943       est_set_ecode(ecp, ESTEIO, __LINE__);
944       return NULL;
945     }
946   }
947   if((inode = est_inode(name)) < 1){
948     est_set_ecode(ecp, ESTEIO, __LINE__);
949     return NULL;
950   }
951   if(cbmapget(est_inodes, (char *)&inode, sizeof(int), NULL) && !(omode & ESTDBNOLCK)){
952     est_set_ecode(ecp, ESTEACCES, __LINE__);
953     return NULL;
954   }
955   domode = DP_OREADER;
956   comode = CR_OREADER;
957   vomode = VL_OREADER;
958   if(omode & ESTDBWRITER){
959     domode = DP_OWRITER;
960     comode = CR_OWRITER;
961     vomode = VL_OWRITER;
962     if(ESTUSEBZIP){
963       vomode |= VL_OXCOMP;
964     } else if(ESTUSELZO){
965       vomode |= VL_OYCOMP;
966     } else if(ESTUSEZLIB){
967       vomode |= VL_OZCOMP;
968     }
969     if(omode & ESTDBCREAT){
970       domode |= DP_OCREAT;
971       comode |= CR_OCREAT;
972       vomode |= VL_OCREAT;
973     }
974     if(omode & ESTDBTRUNC){
975       domode |= DP_OTRUNC;
976       comode |= CR_OTRUNC;
977       vomode |= VL_OTRUNC;
978     }
979   }
980   if(omode & ESTDBNOLCK){
981     domode |= DP_ONOLCK;
982     comode |= CR_ONOLCK;
983     vomode |= VL_ONOLCK;
984   }
985   if(omode & ESTDBLCKNB){
986     domode |= DP_OLCKNB;
987     comode |= CR_OLCKNB;
988     vomode |= VL_OLCKNB;
989   }
990   flags = 0;
991   idxnum = 0;
992   dseq = 0;
993   dnum = 0;
994   amode = 0;
995   zmode = 0;
996   smode = 0;
997   if(omode & ESTDBSMALL){
998     bdiam = ESTDBSBRAT;
999     ddiam = ESTDBSDRAT;
1000   } else if(omode & ESTDBLARGE){
1001     bdiam = ESTDBLBRAT;
1002     ddiam = ESTDBLDRAT;
1003   } else if(omode & ESTDBHUGE){
1004     bdiam = ESTDBHBRAT;
1005     ddiam = ESTDBHDRAT;
1006   } else if(omode & ESTDBHUGE2){
1007     bdiam = ESTDBHBRAT * ESTDBH2RAT;
1008     ddiam = ESTDBHDRAT * ESTDBH2RAT;
1009   } else if(omode & ESTDBHUGE3){
1010     bdiam = ESTDBHBRAT * ESTDBH3RAT;
1011     ddiam = ESTDBHDRAT * ESTDBH3RAT;
1012   } else {
1013     bdiam = 1.0;
1014     ddiam = 1.0;
1015   }
1016   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
1017   if((metadb = dpopen(path, domode, ESTMINIBNUM)) != NULL){
1018     flags = dpgetflags(metadb);
1019     if(dprnum(metadb) < 1){
1020       if(omode & ESTDBPERFNG){
1021         flags |= ESTDFPERFNG;
1022       } else if(omode & ESTDBCHRCAT){
1023         flags |= ESTDFCHRCAT;
1024       }
1025       if(ESTUSEBZIP){
1026         flags |= ESTDFBZIP;
1027       } else if(ESTUSELZO){
1028         flags |= ESTDFLZO;
1029       } else if(ESTUSEZLIB){
1030         flags |= ESTDFZLIB;
1031       }
1032       if(omode & ESTDBSCVOID){
1033         flags |= ESTDFSCVOID;
1034       } else if(omode & ESTDBSCINT){
1035         flags |= ESTDFSCINT;
1036       } else if(omode & ESTDBSCASIS){
1037         flags |= ESTDFSCASIS;
1038       }
1039       dpsetflags(metadb, flags);
1040     }
1041     if((vsiz = dpgetwb(metadb, ESTKEYIDXNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
1042       vbuf[vsiz] = '\0';
1043       idxnum = atoi(vbuf);
1044     }
1045     if((vsiz = dpgetwb(metadb, ESTKEYDSEQ, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
1046       vbuf[vsiz] = '\0';
1047       dseq = atoi(vbuf);
1048     }
1049     if((vsiz = dpgetwb(metadb, ESTKEYDNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
1050       vbuf[vsiz] = '\0';
1051       dnum = atoi(vbuf);
1052     }
1053     if(flags & ESTDFPERFNG){
1054       amode = ESTDFPERFNG;
1055     } else if(flags & ESTDFCHRCAT){
1056       amode = ESTDFCHRCAT;
1057     }
1058     if(flags & ESTDFZLIB){
1059       zmode = ESTDFZLIB;
1060     } else if(flags & ESTDFLZO){
1061       zmode = ESTDFLZO;
1062     } else if(flags & ESTDFBZIP){
1063       zmode = ESTDFBZIP;
1064     }
1065     if(flags & ESTDFSCVOID){
1066       smode = ESTDFSCVOID;
1067     } else if(flags & ESTDFSCINT){
1068       smode = ESTDFSCINT;
1069     } else if(flags & ESTDFSCASIS){
1070       smode = ESTDFSCASIS;
1071     }
1072   } else {
1073     est_set_ecode(ecp, dpecode == DP_ELOCK ? ESTELOCK : ESTEDB, __LINE__);
1074     return NULL;
1075   }
1076   if(idxnum < 1) idxnum = 1;
1077   if(dseq < 0) dseq = 0;
1078   if(dnum < 0) dnum = 0;
1079   crdnum = vlcrdnum;
1080   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME);
1081   idxdb = est_idx_open(path, vomode, idxnum);
1082   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTFWMDBNAME);
1083   vlcrdnum = ESTVLCRDNUM;
1084   fwmdb = vlopen(path, vomode, VL_CMPLEX);
1085   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTAUXDBNAME);
1086   vlcrdnum = ESTVLCRDNAUX;
1087   auxdb = vlopen(path, vomode, VL_CMPLEX);
1088   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTXFMDBNAME);
1089   vlcrdnum = ESTVLCRDNUM;
1090   xfmdb = vlopen(path, vomode, VL_CMPLEX);
1091   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
1092   attrdb = cropen(path, comode, ESTATTRDBBNUM * bdiam, ESTATTRDBDNUM * ddiam);
1093   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
1094   textdb = cropen(path, comode, ESTTEXTDBBNUM * bdiam, ESTTEXTDBDNUM * ddiam);
1095   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTKWDDBNAME);
1096   kwddb = cropen(path, comode,  ESTKWDDBBNUM * bdiam, ESTKWDDBDNUM * ddiam);
1097   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
1098   vlcrdnum = ESTVLCRDNUM;
1099   listdb = vlopen(path, vomode, VL_CMPLEX);
1100   vlcrdnum = crdnum;
1101   if(!idxdb || !fwmdb || !auxdb || !xfmdb || !attrdb ||!textdb || !kwddb || !listdb){
1102     if(listdb) vlclose(listdb);
1103     if(kwddb) crclose(kwddb);
1104     if(textdb) crclose(textdb);
1105     if(attrdb) crclose(attrdb);
1106     if(xfmdb) vlclose(xfmdb);
1107     if(auxdb) vlclose(auxdb);
1108     if(fwmdb) vlclose(fwmdb);
1109     if(idxdb) est_idx_close(idxdb);
1110     dpclose(metadb);
1111     est_set_ecode(ecp, ESTEDB, __LINE__);
1112     return NULL;
1113   }
1114   if(omode & ESTDBWRITER){
1115     est_idx_set_tuning(idxdb, amode == ESTDFPERFNG ? ESTIDXDBLRMA : ESTIDXDBLRM, ESTIDXDBNIM,
1116                        ESTIDXDBLCN, ESTIDXDBNCN, ESTIDXDBFBP);
1117     est_idx_set_current(idxdb);
1118     vlsettuning(fwmdb, ESTFWMDBLRM, ESTFWMDBNIM, ESTFWMDBLCN, ESTFWMDBNCN);
1119     vlsetfbpsiz(fwmdb, ESTFWMDBFBP);
1120     vlsettuning(auxdb, ESTAUXDBLRM, ESTAUXDBNIM, ESTAUXDBLCN, ESTAUXDBNCN);
1121     vlsetfbpsiz(auxdb, ESTAUXDBFBP);
1122     vlsettuning(xfmdb, ESTXFMDBLRM, ESTXFMDBNIM, ESTXFMDBLCN, ESTXFMDBNCN);
1123     vlsetfbpsiz(xfmdb, ESTXFMDBFBP);
1124     crsetalign(attrdb, ESTATTRDBALN);
1125     crsetfbpsiz(attrdb, ESTATTRDBFBP);
1126     crsetalign(textdb, ESTTEXTDBALN);
1127     crsetfbpsiz(textdb, ESTTEXTDBFBP);
1128     crsetalign(kwddb, ESTKWDDBALN);
1129     crsetfbpsiz(kwddb, ESTKWDDBFBP);
1130     vlsettuning(listdb, ESTLISTDBLRM, ESTLISTDBNIM, ESTLISTDBLCN, ESTLISTDBNCN);
1131     vlsetfbpsiz(listdb, ESTLISTDBFBP);
1132   } else {
1133     est_idx_set_tuning(idxdb, -1, -1,
1134                        amode == ESTDFPERFNG ? ESTIDXDBRLCNA : ESTIDXDBRLCN, ESTIDXDBRNCN, -1);
1135     vlsettuning(fwmdb, -1, -1, ESTFWMDBLCN, ESTFWMDBNCN);
1136     vlsettuning(auxdb, -1, -1, ESTAUXDBRLCN, ESTAUXDBRNCN);
1137     vlsettuning(xfmdb, -1, -1, ESTXFMDBLCN, ESTXFMDBNCN);
1138     vlsettuning(listdb, -1, -1, ESTLISTDBLCN, ESTLISTDBNCN);
1139   }
1140   if((omode & ESTDBWRITER) && (omode & ESTDBTRUNC) && (list = cbdirlist(name)) != NULL){
1141     for(i = 0; i < CB_LISTNUM(list); i++){
1142       elem = CB_LISTVAL(list, i);
1143       if(cbstrfwmatch(elem, ESTAISEQPREF) || cbstrfwmatch(elem, ESTAISTRPREF) ||
1144          cbstrfwmatch(elem, ESTAINUMPREF)){
1145         sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
1146         if(unlink(path) == -1) est_rmdir_rec(path);
1147       }
1148     }
1149     CB_LISTCLOSE(list);
1150   }
1151   aidxs = cbmapopenex(ESTMINIBNUM);
1152   if((list = cbdirlist(name)) != NULL){
1153     for(i = 0; i < CB_LISTNUM(list); i++){
1154       elem = CB_LISTVAL(list, i);
1155       dec = NULL;
1156       type = -1;
1157       if(cbstrfwmatch(elem, ESTAISEQPREF)){
1158         dec = est_hex_decode(elem + strlen(ESTAISEQPREF));
1159         type = ESTIDXATTRSEQ;
1160       } else if(cbstrfwmatch(elem, ESTAISTRPREF)){
1161         dec = est_hex_decode(elem + strlen(ESTAISTRPREF));
1162         type = ESTIDXATTRSTR;
1163       } else if(cbstrfwmatch(elem, ESTAINUMPREF)){
1164         dec = est_hex_decode(elem + strlen(ESTAINUMPREF));
1165         type = ESTIDXATTRNUM;
1166       }
1167       if(dec){
1168         sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
1169         switch(type){
1170         case ESTIDXATTRSTR:
1171           if((aidxdb = vlopen(path, vomode, VL_CMPLEX)) != NULL){
1172             vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1173             vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1174             attridx.db = aidxdb;
1175             attridx.type = type;
1176             cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1177           }
1178           break;
1179         case ESTIDXATTRNUM:
1180           if((aidxdb = vlopen(path, vomode, est_aidx_numcmp)) != NULL){
1181             vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1182             vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1183             attridx.db = aidxdb;
1184             attridx.type = type;
1185             cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1186           }
1187           break;
1188         default:
1189           if((aidxdb = dpopen(path, domode, crbnum(attrdb) / ESTAIBDIAM)) != NULL){
1190             dpsetfbpsiz(aidxdb, ESTAIDXDPFBP);
1191             attridx.db = aidxdb;
1192             attridx.type = type;
1193             cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1194           }
1195           break;
1196         }
1197         free(dec);
1198       }
1199     }
1200     CB_LISTCLOSE(list);
1201   }
1202   CB_MALLOC(db, sizeof(ESTDB));
1203   db->name = cbmemdup(name, -1);
1204   db->inode = inode;
1205   db->metadb = metadb;
1206   db->idxdb = idxdb;
1207   db->fwmdb = fwmdb;
1208   db->auxdb = auxdb;
1209   db->xfmdb = xfmdb;
1210   db->attrdb = attrdb;
1211   db->textdb = textdb;
1212   db->kwddb = kwddb;
1213   db->listdb = listdb;
1214   db->aidxs = aidxs;
1215   CB_LISTOPEN(db->pdocs);
1216   db->puris = NULL;
1217   est_set_ecode(&(db->ecode), ESTENOERR, __LINE__);
1218   db->fatal = FALSE;
1219   db->dseq = dseq;
1220   db->dnum = dnum;
1221   db->amode = amode;
1222   db->zmode = zmode;
1223   db->smode = smode;
1224   if(omode & ESTDBWRITER){
1225     db->idxcc = cbmapopenex(ESTIDXCCBNUM);
1226     db->auxcc = cbmapopenex(ESTAUXCCBNUM);
1227     db->icsiz = 0;
1228     db->icmax = ESTIDXCCMAX;
1229     db->outcc = cbmapopenex(ESTOUTCCBNUM);
1230   } else {
1231     db->idxcc = cbmapopenex(1);
1232     db->auxcc = cbmapopenex(1);
1233     db->icsiz = 0;
1234     db->icmax = 0;
1235     db->outcc = cbmapopenex(1);
1236   }
1237   db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
1238   db->kcmnum = ESTKEYCCMNUM;
1239   db->attrcc = cbmapopenex(ESTATTRCCMNUM + 1);
1240   db->acmnum = ESTATTRCCMNUM;
1241   db->textcc = cbmapopenex(ESTTEXTCCMNUM + 1);
1242   db->tcmnum = ESTTEXTCCMNUM;
1243   db->veccc = cbmapopenex(ESTATTRCCMNUM / 2 + 1);
1244   db->vcmnum = ESTATTRCCMNUM / 2;
1245   db->rescc = cbmapopenex(ESTRESCCMNUM * 2 + 1);
1246   db->rcmnum = ESTRESCCMNUM;
1247   db->spacc = NULL;
1248   db->scmnum = 0;
1249   db->scname = NULL;
1250   db->infocb = NULL;
1251   db->infoop = NULL;
1252   db->dfdb = NULL;
1253   db->metacc = NULL;
1254   db->wildmax = ESTWILDMAX;
1255   db->flsflag = FALSE;
1256   db->intflag = FALSE;
1257   cbmapput(est_inodes, (char *)&inode, sizeof(int), (char *)&db, sizeof(ESTDB *), FALSE);
1258   return db;
1259 }
1260 
1261 
1262 /* Close a database. */
est_db_close(ESTDB * db,int * ecp)1263 int est_db_close(ESTDB *db, int *ecp){
1264   ESTATTRIDX *attridx;
1265   const char *kbuf;
1266   int err;
1267   assert(db && ecp);
1268   est_set_ecode(ecp, ESTENOERR, __LINE__);
1269   err = FALSE;
1270   cbmapout(est_inodes, (char *)&(db->inode), sizeof(int));
1271   if(dpwritable(db->metadb)){
1272     if(!est_db_flush(db, -1)) err = TRUE;
1273     if(!est_db_write_meta(db)) err = TRUE;
1274   }
1275   est_db_inform(db, "closing");
1276   if(db->metacc) cbmapclose(db->metacc);
1277   if(db->spacc){
1278     free(db->scname);
1279     cbmapclose(db->spacc);
1280   }
1281   cbmapclose(db->rescc);
1282   cbmapclose(db->veccc);
1283   cbmapclose(db->textcc);
1284   cbmapclose(db->attrcc);
1285   cbmapclose(db->keycc);
1286   cbmapclose(db->outcc);
1287   cbmapclose(db->auxcc);
1288   cbmapclose(db->idxcc);
1289   if(db->puris) cbmapclose(db->puris);
1290   CB_LISTCLOSE(db->pdocs);
1291   cbmapiterinit(db->aidxs);
1292   while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
1293     attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1294     switch(attridx->type){
1295     case ESTIDXATTRSTR:
1296     case ESTIDXATTRNUM:
1297       if(!vlclose(attridx->db)) err = TRUE;
1298       break;
1299     default:
1300       if(!dpclose(attridx->db)) err = TRUE;
1301       break;
1302     }
1303   }
1304   cbmapclose(db->aidxs);
1305   if(!vlclose(db->listdb)) err = TRUE;
1306   if(!crclose(db->kwddb)) err = TRUE;
1307   if(!crclose(db->textdb)) err = TRUE;
1308   if(!crclose(db->attrdb)) err = TRUE;
1309   if(!vlclose(db->xfmdb)) err = TRUE;
1310   if(!vlclose(db->auxdb)) err = TRUE;
1311   if(!vlclose(db->fwmdb)) err = TRUE;
1312   if(!est_idx_close(db->idxdb)) err = TRUE;
1313   if(!dpclose(db->metadb)) err = TRUE;
1314   free(db->name);
1315   if(db->fatal){
1316     est_set_ecode(ecp, db->ecode, __LINE__);
1317     err = TRUE;
1318   } else if(err){
1319     est_set_ecode(ecp, ESTEDB, __LINE__);
1320   }
1321   free(db);
1322   return err ? FALSE : TRUE;
1323 }
1324 
1325 
1326 /* Get the last happended error code of a database. */
est_db_error(ESTDB * db)1327 int est_db_error(ESTDB *db){
1328   assert(db);
1329   return db->ecode;
1330 }
1331 
1332 
1333 /* Check whether a database has a fatal error. */
est_db_fatal(ESTDB * db)1334 int est_db_fatal(ESTDB *db){
1335   assert(db);
1336   return db->fatal;
1337 }
1338 
1339 
1340 /* Add an index for narrowing or sorting with document attributes. */
est_db_add_attr_index(ESTDB * db,const char * name,int type)1341 int est_db_add_attr_index(ESTDB *db, const char *name, int type){
1342   ESTATTRIDX attridx;
1343   ESTSCORE *scores;
1344   void *aidxdb;
1345   char path[ESTPATHBUFSIZ], *enc, *vbuf;
1346   int i, domode, vomode, crdnum, err, snum;
1347   assert(db && name);
1348   if(!dpwritable(db->metadb)){
1349     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1350     return FALSE;
1351   }
1352   if(cbmapget(db->aidxs, name, -1, NULL)){
1353     est_set_ecode(&(db->ecode), ESTEMISC, __LINE__);
1354     return FALSE;
1355   }
1356   enc = est_hex_encode(name);
1357   switch(type){
1358   case ESTIDXATTRSEQ:
1359     sprintf(path, "%s%c%s%s", db->name, ESTPATHCHR, ESTAISEQPREF, enc);
1360     break;
1361   case ESTIDXATTRSTR:
1362     sprintf(path, "%s%c%s%s", db->name, ESTPATHCHR, ESTAISTRPREF, enc);
1363     break;
1364   case ESTIDXATTRNUM:
1365     sprintf(path, "%s%c%s%s", db->name, ESTPATHCHR, ESTAINUMPREF, enc);
1366     break;
1367   default:
1368     free(enc);
1369     est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
1370     return FALSE;
1371   }
1372   free(enc);
1373   domode = DP_OWRITER | DP_OCREAT | DP_OTRUNC;
1374   vomode = VL_OWRITER | VL_OCREAT | VL_OTRUNC;
1375   if(ESTUSEBZIP){
1376     vomode |= VL_OXCOMP;
1377   } else if(ESTUSELZO){
1378     vomode |= VL_OYCOMP;
1379   } else if(ESTUSEZLIB){
1380     vomode |= VL_OZCOMP;
1381   }
1382   err = FALSE;
1383   crdnum = vlcrdnum;
1384   switch(type){
1385   case ESTIDXATTRSTR:
1386     vlcrdnum = ESTVLCRDNUM;
1387     if(!(aidxdb = vlopen(path, vomode, VL_CMPLEX))){
1388       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1389       vlcrdnum = crdnum;
1390       return FALSE;
1391     }
1392     vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1393     vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1394     if(est_db_doc_num(db) > 0){
1395       scores = est_search_uvset(db, &snum, NULL, TRUE);
1396       for(i = 0; i < snum; i++){
1397         if((vbuf = est_db_get_doc_attr(db, scores[i].id, name)) != NULL){
1398           if(!est_aidx_attr_put(aidxdb, scores[i].id, vbuf, strlen(vbuf))){
1399             est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1400             db->fatal = TRUE;
1401             err = TRUE;
1402           }
1403           free(vbuf);
1404         }
1405         if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "entering existing attributes");
1406       }
1407       free(scores);
1408     }
1409     break;
1410   case ESTIDXATTRNUM:
1411     vlcrdnum = ESTVLCRDNUM;
1412     if(!(aidxdb = vlopen(path, vomode, est_aidx_numcmp))){
1413       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1414       vlcrdnum = crdnum;
1415       return FALSE;
1416     }
1417     vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1418     vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1419     if(est_db_doc_num(db) > 0){
1420       scores = est_search_uvset(db, &snum, NULL, TRUE);
1421       for(i = 0; i < snum; i++){
1422         if((vbuf = est_db_get_doc_attr(db, scores[i].id, name)) != NULL){
1423           if(!est_aidx_attr_put(aidxdb, scores[i].id, vbuf, strlen(vbuf))){
1424             est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1425             db->fatal = TRUE;
1426             err = TRUE;
1427           }
1428           free(vbuf);
1429         }
1430         if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "entering existing attributes");
1431       }
1432       free(scores);
1433     }
1434     break;
1435   default:
1436     if(!(aidxdb = dpopen(path, domode, crbnum(db->attrdb) * ESTAIBDIAM))){
1437       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1438       vlcrdnum = crdnum;
1439       return FALSE;
1440     }
1441     dpsetfbpsiz(aidxdb, ESTAIDXDPFBP);
1442     if(est_db_doc_num(db) > 0){
1443       scores = est_search_uvset(db, &snum, NULL, TRUE);
1444       for(i = 0; i < snum; i++){
1445         if((vbuf = est_db_get_doc_attr(db, scores[i].id, name)) != NULL){
1446           if(!est_aidx_seq_put(aidxdb, scores[i].id, vbuf, strlen(vbuf))){
1447             est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1448             db->fatal = TRUE;
1449             err = TRUE;
1450           }
1451           free(vbuf);
1452         }
1453         if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "entering existing attributes");
1454       }
1455       free(scores);
1456     }
1457     break;
1458   }
1459   vlcrdnum = crdnum;
1460   attridx.db = aidxdb;
1461   attridx.type = type;
1462   cbmapput(db->aidxs, name, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1463   return err ? FALSE : TRUE;
1464 }
1465 
1466 
1467 /* Flush index words in the cache of a database. */
est_db_flush(ESTDB * db,int max)1468 int est_db_flush(ESTDB *db, int max){
1469   ESTATTRIDX *attridx;
1470   CBMAP *ids;
1471   CBLIST *keys;
1472   CBDATUM *nval;
1473   const char *kbuf, *vbuf, *rp, *pv, *ep;
1474   char *tbuf, *wp, numbuf[ESTNUMBUFSIZ];
1475   int i, j, inc, err, ksiz, vsiz, rnum, len, id, sum, cid, vnum, lid, dnum, tsiz, vstep;
1476   assert(db);
1477   if(!dpwritable(db->metadb)){
1478     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1479     return FALSE;
1480   }
1481   if(max < 1 || max >= INT_MAX){
1482     if(!est_db_write_meta(db)) err = TRUE;
1483     if(!dpmemflush(db->metadb)) err = TRUE;
1484     if(!crmemflush(db->attrdb)) err = TRUE;
1485     if(!crmemflush(db->textdb)) err = TRUE;
1486     if(!crmemflush(db->kwddb)) err = TRUE;
1487     if(!vlmemflush(db->listdb)) err = TRUE;
1488     cbmapiterinit(db->aidxs);
1489     while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
1490       attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1491       switch(attridx->type){
1492       case ESTIDXATTRSTR:
1493       case ESTIDXATTRNUM:
1494         if(!vlmemflush(attridx->db)) err = TRUE;
1495         break;
1496       default:
1497         if(!dpmemflush(attridx->db)) err = TRUE;
1498         break;
1499       }
1500     }
1501   }
1502   if(cbmaprnum(db->idxcc) < 1 && cbmaprnum(db->auxcc) < 1 && cbmaprnum(db->outcc) < 1)
1503     return TRUE;
1504   db->flsflag = TRUE;
1505   db->intflag = FALSE;
1506   inc = est_db_used_cache_size(db) > db->icmax;
1507   err = FALSE;
1508   CB_LISTOPEN(keys);
1509   cbmapiterinit(db->idxcc);
1510   while((kbuf = cbmapiternext(db->idxcc, &ksiz)) != NULL){
1511     CB_LISTPUSH(keys, kbuf, ksiz);
1512   }
1513   rnum = CB_LISTNUM(keys);
1514   cblistsort(keys);
1515   if(max > 0){
1516     while(CB_LISTNUM(keys) > max){
1517       CB_LISTDROP(keys);
1518     }
1519   }
1520   for(i = 0; i < CB_LISTNUM(keys); i++){
1521     kbuf = CB_LISTVAL2(keys, i, ksiz);
1522     vbuf = cbmapget(db->idxcc, kbuf, ksiz, &vsiz);
1523     if(!est_idx_add(db->idxdb, kbuf, ksiz, vbuf, vsiz, db->smode) ||
1524        (!vlput(db->fwmdb, kbuf, ksiz, "", 0, VL_DKEEP) && dpecode != DP_EKEEP)){
1525       err = TRUE;
1526       break;
1527     }
1528     cbmapout(db->idxcc, kbuf, ksiz);
1529     db->icsiz -= vsiz;
1530     if(i % ESTCCCBFREQ == 0){
1531       est_db_inform(db, "flushing index words");
1532       if(est_idx_size_current(db->idxdb) >= ESTIDXDBMAX){
1533         est_db_inform(db, "adding a new database file");
1534         est_idx_increment(db->idxdb);
1535         inc = FALSE;
1536       }
1537     }
1538     if(max > 0 && db->intflag && i > 0 && i % ESTCCIRSLOT == 0) break;
1539   }
1540   CB_LISTCLOSE(keys);
1541   if(cbmaprnum(db->idxcc) < 1){
1542     cbmapclose(db->idxcc);
1543     db->idxcc = cbmapopenex(rnum > ESTIDXCCBNUM ? rnum * 1.5 : ESTIDXCCBNUM);
1544     if(cbmaprnum(db->auxcc) > 0){
1545       CB_LISTOPEN(keys);
1546       cbmapiterinit(db->auxcc);
1547       while((kbuf = cbmapiternext(db->auxcc, &ksiz)) != NULL){
1548         CB_LISTPUSH(keys, kbuf, ksiz);
1549       }
1550       cblistsort(keys);
1551       for(i = 0; i < CB_LISTNUM(keys); i++){
1552         kbuf = CB_LISTVAL2(keys, i, ksiz);
1553         vbuf = cbmapget(db->auxcc, kbuf, ksiz, &vsiz);
1554         if(!vlput(db->auxdb, kbuf, ksiz, vbuf, vsiz, VL_DCAT)){
1555           err = TRUE;
1556           break;
1557         }
1558         len = sprintf(numbuf, "%d", vlvsiz(db->auxdb, kbuf, ksiz) / (int)(sizeof(int) * 2));
1559         if(!vlput(db->xfmdb, kbuf, ksiz, numbuf, len, VL_DOVER)){
1560           err = TRUE;
1561           break;
1562         }
1563         cbmapout(db->auxcc, kbuf, ksiz);
1564         db->icsiz -= vsiz;
1565         if(i % ESTCCCBFREQ == 0) est_db_inform(db, "flushing auxiliary keywords");
1566         if(max > 0 && db->intflag && i > 0 && i % ESTCCIRSLOT == 0) break;
1567       }
1568       CB_LISTCLOSE(keys);
1569       if(cbmaprnum(db->auxcc) < 1){
1570         cbmapclose(db->auxcc);
1571         db->auxcc = cbmapopenex(ESTAUXCCBNUM);
1572       }
1573     }
1574   }
1575   if(max < 1 && cbmaprnum(db->outcc) > 0){
1576     ids = cbmapopen();
1577     CB_LISTOPEN(keys);
1578     cbmapiterinit(db->outcc);
1579     while((kbuf = cbmapiternext(db->outcc, &ksiz)) != NULL){
1580       if(*kbuf == '\t'){
1581         id = atoi(kbuf + 1);
1582         cbmapput(ids, (char *)&id, sizeof(int), "", 0, FALSE);
1583       } else {
1584         CB_LISTPUSH(keys, kbuf, ksiz);
1585       }
1586     }
1587     cblistsort(keys);
1588     dnum = est_idx_dnum(db->idxdb);
1589     for(i = 0; i < CB_LISTNUM(keys); i++){
1590       kbuf = CB_LISTVAL2(keys, i, ksiz);
1591       if(kbuf[0] == ' '){
1592         if((tbuf = vlget(db->auxdb, kbuf + 1, ksiz - 1, &tsiz)) != NULL){
1593           rp = tbuf;
1594           wp = tbuf;
1595           ep = tbuf + tsiz;
1596           while(rp < ep){
1597             if(!cbmapget(ids, rp, sizeof(int), NULL)){
1598               memmove(wp, rp, sizeof(int) * 2);
1599               wp += sizeof(int) * 2;
1600             }
1601             rp += sizeof(int) * 2;
1602           }
1603           if(wp > tbuf){
1604             if(!vlput(db->auxdb, kbuf + 1, ksiz - 1, tbuf, wp - tbuf, VL_DOVER)) err = TRUE;
1605             len = sprintf(numbuf, "%d", (int)((wp - tbuf) / (sizeof(int) * 2)));
1606             if(!vlput(db->xfmdb, kbuf + 1, ksiz - 1, numbuf, len, VL_DOVER)) err = TRUE;
1607           } else {
1608             if(!vlout(db->auxdb, kbuf + 1, ksiz - 1)) err = TRUE;
1609             if(!vlout(db->xfmdb, kbuf + 1, ksiz - 1) && dpecode != DP_ENOITEM) err = TRUE;
1610           }
1611           free(tbuf);
1612         }
1613       } else {
1614         sum = 0;
1615         for(j = 0; j < dnum; j++){
1616           if((vbuf = est_idx_get_one(db->idxdb, j, kbuf, ksiz, &tsiz)) != NULL){
1617             CB_DATUMOPEN(nval);
1618             rp = vbuf;
1619             ep = vbuf + tsiz;
1620             lid = 0;
1621             cid = 0;
1622             while(rp < ep){
1623               EST_READ_VNUMBUF(rp, vnum, vstep);
1624               cid += vnum + 1;
1625               rp += vstep;
1626               pv = rp;
1627               switch(db->smode){
1628               case ESTDFSCVOID:
1629                 break;
1630               default:
1631                 rp++;
1632                 break;
1633               case ESTDFSCINT:
1634               case ESTDFSCASIS:
1635                 rp += sizeof(int);
1636                 break;
1637               }
1638               while(*rp != 0x0){
1639                 rp += 2;
1640               }
1641               rp++;
1642               if(!cbmapget(ids, (char *)&cid, sizeof(int), NULL)){
1643                 EST_SET_VNUMBUF(vstep, numbuf, cid - lid - 1);
1644                 CB_DATUMCAT(nval, numbuf, vstep);
1645                 CB_DATUMCAT(nval, pv, rp - pv);
1646                 lid = cid;
1647               }
1648             }
1649             if(!est_idx_put_one(db->idxdb, j, kbuf, ksiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
1650               err = TRUE;
1651             sum += CB_DATUMSIZE(nval);
1652             CB_DATUMCLOSE(nval);
1653           }
1654         }
1655         if(sum < 1 && !vlout(db->fwmdb, kbuf, ksiz) && dpecode != DP_ENOITEM) err = TRUE;
1656       }
1657       cbmapout(db->outcc, kbuf, ksiz);
1658       if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
1659       if(max > 0 && db->intflag && i > 0 && i % ESTCCIRSLOT == 0) break;
1660     }
1661     if(cbmaprnum(db->outcc) <= cbmaprnum(ids)){
1662       cbmapclose(db->outcc);
1663       db->outcc = cbmapopenex(ESTOUTCCBNUM);
1664     }
1665     CB_LISTCLOSE(keys);
1666     cbmapclose(ids);
1667   }
1668   cbmapclose(db->keycc);
1669   db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
1670   db->kcmnum = ESTKEYCCMNUM;
1671   if(!(max > 0 && db->intflag) && inc && est_idx_size_current(db->idxdb) >= ESTIDXDBMIN){
1672     est_db_inform(db, "adding a new database file");
1673     est_idx_increment(db->idxdb);
1674   }
1675   if(max < 1 || max >= INT_MAX){
1676     if(!vlmemflush(db->auxdb)) err = TRUE;
1677     if(!est_idx_memflush(db->idxdb)) err = TRUE;
1678   }
1679   if(max > 0 && db->intflag) est_db_inform(db, "flushing interrupted");
1680   db->flsflag = FALSE;
1681   db->intflag = FALSE;
1682   if(err){
1683     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1684     db->fatal = TRUE;
1685     return FALSE;
1686   }
1687   return TRUE;
1688 }
1689 
1690 
1691 /* Synchronize updating contents of a database. */
est_db_sync(ESTDB * db)1692 int est_db_sync(ESTDB *db){
1693   ESTATTRIDX *attridx;
1694   const char *kbuf;
1695   int err;
1696   assert(db);
1697   if(!dpwritable(db->metadb)){
1698     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1699     return FALSE;
1700   }
1701   err = FALSE;
1702   if(!est_db_flush(db, -1) || !est_db_write_meta(db)) err = TRUE;
1703   est_db_inform(db, "synchronizing the database for meta information");
1704   if(!dpsync(db->metadb)) err = TRUE;
1705   est_db_inform(db, "synchronizing the inverted index");
1706   if(!est_idx_sync(db->idxdb)) err = TRUE;
1707   est_db_inform(db, "synchronizing the database for forward matching");
1708   if(!vlsync(db->fwmdb)) err = TRUE;
1709   est_db_inform(db, "synchronizing the database for attributes");
1710   if(!crsync(db->attrdb)) err = TRUE;
1711   est_db_inform(db, "synchronizing the database for texts");
1712   if(!crsync(db->textdb)) err = TRUE;
1713   est_db_inform(db, "synchronizing the database for keywords");
1714   if(!crsync(db->kwddb)) err = TRUE;
1715   est_db_inform(db, "synchronizing the database for document list");
1716   if(!vlsync(db->listdb)) err = TRUE;
1717   if(cbmaprnum(db->aidxs) > 0){
1718     est_db_inform(db, "synchronizing the databases for attribute narrowing");
1719     cbmapiterinit(db->aidxs);
1720     while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
1721       attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1722       switch(attridx->type){
1723       case ESTIDXATTRSTR:
1724       case ESTIDXATTRNUM:
1725         if(!vlsync(attridx->db)) err = TRUE;
1726         break;
1727       default:
1728         if(!dpsync(attridx->db)) err = TRUE;
1729         break;
1730       }
1731     }
1732   }
1733   if(err){
1734     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1735     db->fatal = TRUE;
1736   }
1737   return err ? FALSE : TRUE;
1738 }
1739 
1740 
1741 /* Optimize a database. */
est_db_optimize(ESTDB * db,int options)1742 int est_db_optimize(ESTDB *db, int options){
1743   CBMAP *dmap;
1744   CBLIST *words;
1745   CBDATUM *nval;
1746   ESTATTRIDX *attridx;
1747   const char *word, *rp, *pv, *ep;
1748   char *kbuf, *vbuf, *wp, numbuf[ESTNUMBUFSIZ];
1749   int i, err, id, ksiz, vsiz, wsiz, len, vstep;
1750   assert(db);
1751   if(!dpwritable(db->metadb)){
1752     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1753     return FALSE;
1754   }
1755   if(!est_db_flush(db, -1)) return FALSE;
1756   err = FALSE;
1757   if(!(options & ESTOPTNOPURGE)){
1758     dmap = cbmapopenex(vlrnum(db->listdb) + 1);
1759     vlcurfirst(db->listdb);
1760     while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
1761       id = atoi(vbuf);
1762       cbmapput(dmap, (char *)&id, sizeof(int), "", 0, FALSE);
1763       free(vbuf);
1764       vlcurnext(db->listdb);
1765     }
1766     CB_LISTOPEN(words);
1767     vlcurfirst(db->fwmdb);
1768     while((kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL){
1769       CB_LISTPUSHBUF(words, kbuf, ksiz);
1770       vlcurnext(db->fwmdb);
1771     }
1772     for(i = 0; i < CB_LISTNUM(words); i++){
1773       if(i % (ESTIDXDBLRM * 4) == 0) est_idx_set_current(db->idxdb);
1774       word = CB_LISTVAL2(words, i, wsiz);
1775       vbuf = est_idx_scan(db->idxdb, word, wsiz, &vsiz, db->smode);
1776       CB_DATUMOPEN(nval);
1777       rp = vbuf;
1778       ep = vbuf + vsiz;
1779       while(rp < ep){
1780         pv = rp;
1781         EST_READ_VNUMBUF(rp, id, vstep);
1782         rp += vstep;
1783         switch(db->smode){
1784         case ESTDFSCVOID:
1785           break;
1786         default:
1787           rp++;
1788           break;
1789         case ESTDFSCINT:
1790         case ESTDFSCASIS:
1791           rp += sizeof(int);
1792           break;
1793         }
1794         while(*rp != 0x00){
1795           rp += 2;
1796         }
1797         rp++;
1798         if(cbmapget(dmap, (char *)&id, sizeof(int), NULL)) CB_DATUMCAT(nval, pv, rp - pv);
1799       }
1800       if(!est_idx_out(db->idxdb, word, wsiz)) err = TRUE;
1801       if(CB_DATUMSIZE(nval) > 0){
1802         if(!est_idx_add(db->idxdb, word, wsiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval), db->smode))
1803           err = TRUE;
1804       } else {
1805         if(!vlout(db->fwmdb, word, wsiz)) err = TRUE;
1806       }
1807       CB_DATUMCLOSE(nval);
1808       free(vbuf);
1809       free(kbuf);
1810       if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
1811     }
1812     CB_LISTCLOSE(words);
1813     CB_LISTOPEN(words);
1814     vlcurfirst(db->auxdb);
1815     while((kbuf = vlcurkey(db->auxdb, &ksiz)) != NULL){
1816       CB_LISTPUSHBUF(words, kbuf, ksiz);
1817       vlcurnext(db->auxdb);
1818     }
1819     for(i = 0; i < CB_LISTNUM(words); i++){
1820       word = CB_LISTVAL2(words, i, wsiz);
1821       if(!(vbuf = vlget(db->auxdb, word, wsiz, &vsiz))) continue;
1822       rp = vbuf;
1823       wp = vbuf;
1824       ep = vbuf + vsiz;
1825       while(rp < ep){
1826         if(cbmapget(dmap, rp, sizeof(int), NULL)){
1827           memmove(wp, rp, sizeof(int) * 2);
1828           wp += sizeof(int) * 2;
1829         }
1830         rp += sizeof(int) * 2;
1831       }
1832       if(wp > vbuf){
1833         if(!vlput(db->auxdb, word, wsiz, vbuf, wp - vbuf, VL_DOVER)) err = TRUE;
1834         len = sprintf(numbuf, "%d", (int)((wp - vbuf) / (sizeof(int) * 2)));
1835         if(!vlput(db->xfmdb, word, wsiz, numbuf, len, VL_DOVER)) err = TRUE;
1836       } else {
1837         if(!vlout(db->auxdb, word, wsiz)) err = TRUE;
1838         if(!vlout(db->xfmdb, word, wsiz) && dpecode != DP_ENOITEM) err = TRUE;
1839       }
1840       free(vbuf);
1841       if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable auxiliary keys");
1842     }
1843     CB_LISTCLOSE(words);
1844     cbmapclose(dmap);
1845   }
1846   if(!(options & ESTOPTNODBOPT)){
1847     est_db_inform(db, "optimizing the inverted index");
1848     if(!est_idx_optimize(db->idxdb)) err = TRUE;
1849     est_db_inform(db, "optimizing the database for forward matching");
1850     if(!vloptimize(db->fwmdb)) err = TRUE;
1851     est_db_inform(db, "optimizing the auxiliary index");
1852     if(!vloptimize(db->auxdb)) err = TRUE;
1853     est_db_inform(db, "optimizing the database for auxiliary forward matching");
1854     if(!vloptimize(db->xfmdb)) err = TRUE;
1855     est_db_inform(db, "optimizing the database for attributes");
1856     if(!croptimize(db->attrdb, -1)) err = TRUE;
1857     est_db_inform(db, "optimizing the database for texts");
1858     if(!croptimize(db->textdb, -1)) err = TRUE;
1859     est_db_inform(db, "optimizing the database for keywords");
1860     if(!croptimize(db->kwddb, -1)) err = TRUE;
1861     est_db_inform(db, "optimizing the database for document list");
1862     if(!vloptimize(db->listdb)) err = TRUE;
1863     if(cbmaprnum(db->aidxs) > 0){
1864       est_db_inform(db, "optimizing the databases for attribute narrowing");
1865       cbmapiterinit(db->aidxs);
1866       while((rp = cbmapiternext(db->aidxs, NULL)) != NULL){
1867         attridx = (ESTATTRIDX *)cbmapiterval(rp, NULL);
1868         switch(attridx->type){
1869         case ESTIDXATTRSTR:
1870         case ESTIDXATTRNUM:
1871           if(!vloptimize(attridx->db)) err = TRUE;
1872           break;
1873         default:
1874           if(!dpoptimize(attridx->db, -1)) err = TRUE;
1875           break;
1876         }
1877       }
1878     }
1879   }
1880   cbmapclose(db->rescc);
1881   db->rescc = cbmapopenex(db->rcmnum * 2 + 1);
1882   if(err){
1883     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1884     db->fatal = TRUE;
1885   }
1886   return err ? FALSE : TRUE;
1887 }
1888 
1889 
1890 /* Merge another database. */
est_db_merge(ESTDB * db,const char * name,int options)1891 int est_db_merge(ESTDB *db, const char *name, int options){
1892   ESTDB *tgdb;
1893   ESTATTRIDX *attridx;
1894   CBMAP *idmap, *seqmap, *attrs;
1895   CBLIST *words;
1896   CBDATUM *rbuf;
1897   const char *kbuf, *vbuf, *rp, *ep, *sp;
1898   char *tbuf, numbuf[ESTNUMBUFSIZ];
1899   int i, j, ecode, err, ksiz, vsiz, tsiz, oid, nid, len, vstep, anum, *ary;
1900   assert(db && name);
1901   if(!dpwritable(db->metadb)){
1902     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1903     return FALSE;
1904   }
1905   est_db_inform(db, "opening the target database");
1906   if(!(tgdb = est_db_open(name, ESTDBREADER, &ecode))){
1907     est_set_ecode(&(db->ecode), ecode, __LINE__);
1908     return FALSE;
1909   }
1910   if(dpgetflags(db->metadb) != dpgetflags(tgdb->metadb)){
1911     est_db_close(tgdb, &ecode);
1912     est_set_ecode(&(db->ecode), ESTEMISC, __LINE__);
1913     return FALSE;
1914   }
1915   err = FALSE;
1916   idmap = cbmapopenex(est_db_doc_num(tgdb) + 1);
1917   vlcurfirst(tgdb->listdb);
1918   for(i = 0; (kbuf = vlcurkeycache(tgdb->listdb, &ksiz)) != NULL; i++){
1919     if((vbuf = vlgetcache(db->listdb, kbuf, ksiz, NULL)) != NULL &&
1920        !est_db_out_doc(db, atoi(vbuf), options & ESTMGCLEAN ? ESTODCLEAN : 0)) err = TRUE;
1921     oid = atoi(vlcurvalcache(tgdb->listdb, NULL));
1922     db->dseq++;
1923     db->dnum++;
1924     cbmapput(idmap, (char *)&oid, sizeof(int), (char *)&(db->dseq), sizeof(int), FALSE);
1925     vlcurnext(tgdb->listdb);
1926     if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "calculating ID mapping");
1927   }
1928   if(!est_db_flush(db, -1)){
1929     cbmapclose(idmap);
1930     est_db_close(tgdb, &ecode);
1931     return FALSE;
1932   }
1933   cbmapiterinit(idmap);
1934   for(i = 0; (kbuf = cbmapiternext(idmap, &ksiz)) != NULL; i++){
1935     CB_MAPITERVAL(vbuf, kbuf, vsiz);
1936     oid = *(int *)kbuf;
1937     nid = *(int *)vbuf;
1938     if((tbuf = est_crget(tgdb->attrdb, tgdb->zmode, oid, &tsiz)) != NULL){
1939       attrs = cbmapload(tbuf, tsiz);
1940       len = sprintf(numbuf, "%d", nid);
1941       cbmapput(attrs, ESTDATTRID, -1, numbuf, len, TRUE);
1942       free(tbuf);
1943       tbuf = cbmapdump(attrs, &tsiz);
1944       if((vbuf = cbmapget(attrs, ESTDATTRURI, -1, &vsiz)) != NULL){
1945         if(!vlput(db->listdb, vbuf, vsiz, numbuf, len, VL_DKEEP)){
1946           est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1947           db->fatal = TRUE;
1948           err = TRUE;
1949         }
1950       } else {
1951         est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1952         db->fatal = TRUE;
1953         err = TRUE;
1954       }
1955       if(!est_crput(db->attrdb, db->zmode, nid, tbuf, tsiz, CR_DKEEP)){
1956         est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1957         db->fatal = TRUE;
1958         err = TRUE;
1959       }
1960       if(cbmaprnum(db->aidxs) > 0){
1961         cbmapiterinit(db->aidxs);
1962         while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
1963           if(!(vbuf = cbmapget(attrs, kbuf, ksiz, &vsiz))) continue;
1964           attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1965           switch(attridx->type){
1966           case ESTIDXATTRSTR:
1967           case ESTIDXATTRNUM:
1968             if(!est_aidx_attr_put(attridx->db, nid, vbuf, vsiz)){
1969               est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1970               db->fatal = TRUE;
1971               err = TRUE;
1972             }
1973             break;
1974           default:
1975             if(!est_aidx_seq_put(attridx->db, nid, vbuf, vsiz)){
1976               est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1977               db->fatal = TRUE;
1978               err = TRUE;
1979             }
1980             break;
1981           }
1982         }
1983       }
1984       cbmapclose(attrs);
1985       free(tbuf);
1986     } else {
1987       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1988       err = TRUE;
1989     }
1990     if((tbuf = est_crget(tgdb->textdb, tgdb->zmode, oid, &tsiz)) != NULL){
1991       if(!est_crput(db->textdb, db->zmode, nid, tbuf, tsiz, CR_DKEEP)){
1992         est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1993         db->fatal = TRUE;
1994         err = TRUE;
1995       }
1996       free(tbuf);
1997     } else {
1998       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1999       err = TRUE;
2000     }
2001     if((tbuf = est_crget(tgdb->kwddb, tgdb->zmode, oid, &tsiz)) != NULL){
2002       if(!est_crput(db->kwddb, db->zmode, nid, tbuf, tsiz, CR_DKEEP)){
2003         est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2004         db->fatal = TRUE;
2005         err = TRUE;
2006       }
2007       free(tbuf);
2008     } else if(dpecode != DP_ENOITEM){
2009       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2010       db->fatal = TRUE;
2011       err = TRUE;
2012     }
2013     if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "importing documents");
2014   }
2015   CB_LISTOPEN(words);
2016   vlcurfirst(tgdb->fwmdb);
2017   while((kbuf = vlcurkeycache(tgdb->fwmdb, &ksiz)) != NULL){
2018     CB_LISTPUSH(words, kbuf, ksiz);
2019     vlcurnext(tgdb->fwmdb);
2020   }
2021   for(i = 0; i < CB_LISTNUM(words); i++){
2022     kbuf = CB_LISTVAL2(words, i, ksiz);
2023     seqmap = cbmapopenex(tsiz / sizeof(int) + 1);
2024     tbuf = est_idx_scan(tgdb->idxdb, kbuf, ksiz, &tsiz, tgdb->smode);
2025     rp = tbuf;
2026     ep = tbuf + tsiz;
2027     while(rp < ep){
2028       EST_READ_VNUMBUF(rp, oid, vstep);
2029       rp += vstep;
2030       vbuf = cbmapget(idmap, (char *)&oid, sizeof(int), NULL);
2031       nid = vbuf ? *(int *)vbuf : -1;
2032       sp = rp;
2033       switch(tgdb->smode){
2034       case ESTDFSCVOID:
2035         break;
2036       default:
2037         rp++;
2038         break;
2039       case ESTDFSCINT:
2040       case ESTDFSCASIS:
2041         rp += sizeof(int);
2042         break;
2043       }
2044       while(*rp != 0x00){
2045         rp += 2;
2046       }
2047       rp++;
2048       if(nid > 0) cbmapputcat(seqmap, (char *)&nid, sizeof(int), sp, rp - sp);
2049     }
2050     anum = cbmaprnum(seqmap);
2051     CB_MALLOC(ary, anum * sizeof(int) + 1);
2052     cbmapiterinit(seqmap);
2053     for(j = 0; (rp = cbmapiternext(seqmap, NULL)) != NULL; j++){
2054       ary[j] = *(int *)rp;
2055     }
2056     qsort(ary, anum, sizeof(int), est_int_compare);
2057     CB_DATUMOPEN(rbuf);
2058     for(j = 0; j < anum; j++){
2059       EST_SET_VNUMBUF(vstep, numbuf, ary[j]);
2060       CB_DATUMCAT(rbuf, numbuf, vstep);
2061       vbuf = cbmapget(seqmap, (char *)(ary + j), sizeof(int), &vsiz);
2062       CB_DATUMCAT(rbuf, vbuf, vsiz);
2063     }
2064     if(!est_idx_add(db->idxdb, kbuf, ksiz, CB_DATUMPTR(rbuf), CB_DATUMSIZE(rbuf), db->smode)){
2065       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2066       db->fatal = TRUE;
2067       err = TRUE;
2068     }
2069     CB_DATUMCLOSE(rbuf);
2070     free(ary);
2071     cbmapclose(seqmap);
2072     free(tbuf);
2073     vlput(db->fwmdb, kbuf, ksiz, "", 0, VL_DKEEP);
2074     if(i % ESTCCCBFREQ == 0){
2075       est_db_inform(db, "importing words");
2076       if(est_idx_size_current(db->idxdb) >= ESTIDXDBMAX){
2077         est_db_inform(db, "adding a new database file");
2078         est_idx_increment(db->idxdb);
2079       }
2080     }
2081   }
2082   CB_LISTCLOSE(words);
2083   CB_LISTOPEN(words);
2084   vlcurfirst(tgdb->auxdb);
2085   while((kbuf = vlcurkeycache(tgdb->auxdb, &ksiz)) != NULL){
2086     CB_LISTPUSH(words, kbuf, ksiz);
2087     vlcurnext(tgdb->auxdb);
2088   }
2089   for(i = 0; i < CB_LISTNUM(words); i++){
2090     kbuf = CB_LISTVAL2(words, i, ksiz);
2091     vbuf = vlgetcache(tgdb->auxdb, kbuf, ksiz, &vsiz);
2092     CB_DATUMOPEN(rbuf);
2093     rp = vbuf;
2094     ep = vbuf + vsiz;
2095     while(rp < ep){
2096       oid = *(int *)rp;
2097       vbuf = cbmapget(idmap, rp, sizeof(int), NULL);
2098       nid = vbuf ? *(int *)vbuf : -1;
2099       if(nid > 0){
2100         CB_DATUMCAT(rbuf, (char *)&nid, sizeof(int));
2101         CB_DATUMCAT(rbuf, rp + sizeof(int), sizeof(int));
2102       }
2103       rp += sizeof(int) * 2;
2104     }
2105     if(!vlput(db->auxdb, kbuf, ksiz, CB_DATUMPTR(rbuf), CB_DATUMSIZE(rbuf), VL_DCAT)){
2106       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2107       db->fatal = TRUE;
2108       err = TRUE;
2109     }
2110     CB_DATUMCLOSE(rbuf);
2111     anum = 0;
2112     if((vbuf = vlgetcache(tgdb->xfmdb, kbuf, ksiz, NULL)) != NULL) anum += atoi(vbuf);
2113     len = sprintf(numbuf, "%d", anum);
2114     vlput(db->xfmdb, kbuf, ksiz, numbuf, len, VL_DOVER);
2115     if(i % ESTCCCBFREQ == 0) est_db_inform(db, "importing auxiliary words");
2116   }
2117   CB_LISTCLOSE(words);
2118   cbmapclose(idmap);
2119   est_db_inform(db, "closing the target database");
2120   if(!est_db_close(tgdb, &ecode)){
2121     est_set_ecode(&(db->ecode), ecode, __LINE__);
2122     return FALSE;
2123   }
2124   if(!est_db_flush(db, -1)) err = TRUE;
2125   return err ? FALSE : TRUE;
2126 }
2127 
2128 
/* Add a document to a database.
   `db' specifies a database object whose meta database must be writable.
   `doc' specifies a document object.  It must have attributes including a non-empty URI
   attribute.  Its ID, its ID attribute and, if absent, its digest attribute are set here.
   `options' specifies options: ESTPDCLEAN makes a replaced document be removed with ESTODCLEAN,
   and ESTPDWEIGHT makes the weight attribute be applied to the scores.
   If a document with the same URI and the same digest is already registered, the call is
   delegated to `est_db_edit_doc'.  If the digest differs, the old document is removed first.
   The return value is true if success, else it is false.  ESTEACCES is set if the database is
   not writable and ESTEINVAL is set if the URI is missing or empty. */
int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options){
  CBMAP *ocmap, *fmap, *qmap;
  CBLIST *words;
  CBDATUM *ocbuf;
  ESTATTRIDX *attridx;
  md5_state_t ms;
  const char *uri, *ndig, *text, *word, *fnext, *snext, *kbuf, *vbuf;
  unsigned char junc[2], c;
  char dobuf[32], dsbuf[64], *wp, *odig, wbuf[ESTWORDMAXLEN+3], *sbuf, nbuf[ESTNUMBUFSIZ];
  int i, j, id, err, wnum, wsiz, fnsiz, snsiz, *np, score, num, ksiz, vsiz, ssiz;
  double tune, weight;
  assert(db && doc);
  if(!dpwritable(db->metadb)){
    est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
    return FALSE;
  }
  if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL)) || uri[0] == '\0'){
    est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
    return FALSE;
  }
  if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
  if(!(ndig = cbmapget(doc->attrs, ESTDATTRDIGEST, -1, NULL))){
    /* no digest is given: calculate the MD5 of each text followed by a line feed, plus the
       value of the attribute whose name is empty, enclosed by a tab and a line feed */
    md5_init(&ms);
    for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
      vbuf = CB_LISTVAL2(doc->dtexts, i, vsiz);
      md5_append(&ms, (md5_byte_t *)vbuf, vsiz);
      md5_append(&ms, (md5_byte_t *)"\n", 1);
    }
    if((vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
      md5_append(&ms, (md5_byte_t *)"\t", 1);
      md5_append(&ms, (md5_byte_t *)vbuf, vsiz);
      md5_append(&ms, (md5_byte_t *)"\n", 1);
    }
    md5_finish(&ms, (md5_byte_t *)dobuf);
    /* express the first 16 bytes as 32 hexadecimal characters */
    wp = dsbuf;
    for(i = 0; i < 16; i++){
      wp += sprintf(wp, "%02x", ((unsigned char *)dobuf)[i]);
    }
    ndig = dsbuf;
    cbmapput(doc->attrs, ESTDATTRDIGEST, -1, ndig, -1, FALSE);
  }
  if((id = est_db_uri_to_id(db, uri)) > 0){
    /* the URI is already registered: compare the digests to choose between edit and replace */
    if((odig = est_db_get_doc_attr(db, id, ESTDATTRDIGEST)) != NULL){
      if(!strcmp(odig, ndig)){
        free(odig);
        doc->id = id;
        sprintf(nbuf, "%d", id);
        cbmapput(doc->attrs, ESTDATTRID, -1, nbuf, -1, TRUE);
        return est_db_edit_doc(db, doc);
      }
      free(odig);
    }
    if(!est_db_out_doc(db, id, (options & ESTPDCLEAN) ? ESTODCLEAN : 0)) return FALSE;
  }
  /* assign the next ID of the sequence and record it as the ID attribute */
  doc->id = ++(db->dseq);
  sprintf(nbuf, "%d", doc->id);
  cbmapput(doc->attrs, ESTDATTRID, -1, nbuf, -1, TRUE);
  /* `ocmap' gathers junction bytes per word, `fmap' gathers a frequency score per word, and
     `qmap' holds the word and junction combinations which were already put */
  ocmap = cbmapopen();
  fmap = cbmapopen();
  qmap = cbmapopen();
  wnum = 0;
  /* the index -1 stands for the value of the attribute whose name is empty */
  for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
    if(i < 0){
      if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
    } else {
      text = CB_LISTVAL(doc->dtexts, i);
    }
    CB_LISTOPEN(words);
    /* the way to break the text into words depends on the analysis mode */
    switch(db->amode){
    case ESTDFPERFNG:
      est_break_text_perfng(text, words, FALSE, TRUE);
      break;
    case ESTDFCHRCAT:
      est_break_text_chrcat(text, words, FALSE);
      break;
    default:
      est_break_text(text, words, FALSE, TRUE);
      break;
    }
    wnum += CB_LISTNUM(words);
    for(j = 0; j < CB_LISTNUM(words); j++){
      word = CB_LISTVAL2(words, j, wsiz);
      /* longer words would not fit in `wbuf' together with the three trailing bytes */
      if(wsiz > ESTWORDMAXLEN) continue;
      fnext = cblistval(words, j + 1, &fnsiz);
      snext = cblistval(words, j + 2, &snsiz);
      /* hash the next two words into one byte each; 0xff marks an absent word */
      junc[0] = fnext ? dpinnerhash(fnext, fnsiz) % ESTJHASHNUM + 1: 0xff;
      junc[1] = snext ? dpouterhash(snext, snsiz) % ESTJHASHNUM + 1: 0xff;
      /* the key of `qmap' is the word, a tab and the two junction bytes */
      memcpy(wbuf, word, wsiz);
      memcpy(wbuf + wsiz, "\t", 1);
      memcpy(wbuf + wsiz + 1, junc, 2);
      np = (int *)cbmapget(fmap, word, wsiz, NULL);
      num = np ? *(int *)np : 0;
      num += ESTOCPOINT;
      cbmapput(fmap, word, wsiz, (char *)&num, sizeof(int), TRUE);
      /* the junction is appended only if putting the combination into `qmap' succeeds,
         presumably that is when it is seen for the first time; nothing but the word itself is
         recorded when there is no next word */
      if(cbmapput(qmap, wbuf, wsiz + 3, "", 0, FALSE))
        cbmapputcat(ocmap, word, wsiz, (char *)junc, fnext ? 2 : 0);
    }
    CB_LISTCLOSE(words);
  }
  /* a score given by the attribute whose name is a tab overrides the calculated scores */
  score = (vbuf = cbmapget(doc->attrs, "\t", 1, NULL)) ? atoi(vbuf) : -1;
  weight = 1.0;
  if(score < 0 && (options & ESTPDWEIGHT) &&
     (vbuf = cbmapget(doc->attrs, ESTDATTRWEIGHT, -1, NULL)) != NULL){
    weight = strtod(vbuf, NULL);
    /* the weight is not allowed to be less than 0.01 */
    weight = weight >= 0.01 ? weight : 0.01;
  }
  /* divisor of the frequency scores; it grows with the number of words of the document */
  tune = sqrt(wnum + 128) / 16.0 / weight;
  cbmapiterinit(ocmap);
  while((kbuf = cbmapiternext(ocmap, &ksiz)) != NULL){
    CB_MAPITERVAL(vbuf, kbuf, vsiz);
    /* sort the two-byte junctions in place */
    if(vsiz > 2) qsort((void *)vbuf, vsiz / 2, 2, est_short_compare);
    /* build the entry: the ID, the score field, the junctions and a terminating zero byte;
       `wbuf' and `wsiz' are reused here for the encoded ID */
    CB_DATUMOPEN(ocbuf);
    EST_SET_VNUMBUF(wsiz, wbuf, doc->id);
    CB_DATUMCAT(ocbuf, wbuf, wsiz);
    switch(db->smode){
    case ESTDFSCVOID:
      break;
    default:
      /* one-byte score: the part over 0x80 is compressed to a quarter, the part over 0xc0 is
         compressed again, and the result is limited to 0xff */
      num = score < 0 ? *(int *)cbmapget(fmap, kbuf, ksiz, NULL) / tune : score;
      if(num >= 0x80) num += (0x80 - num) * 0.75;
      if(num >= 0xc0) num += (0xc0 - num) * 0.75;
      c = num < 0xff ? num : 0xff;
      CB_DATUMCAT(ocbuf, (char *)&c, 1);
      break;
    case ESTDFSCINT:
    case ESTDFSCASIS:
      /* integer score: ten times the tuned frequency score, or the given score as it is */
      num = score < 0 ? *(int *)cbmapget(fmap, kbuf, ksiz, NULL) * 10 / tune : score;
      CB_DATUMCAT(ocbuf, (char *)&num, sizeof(int));
      break;
    }
    CB_DATUMCAT(ocbuf, vbuf, vsiz);
    c = 0x00;
    CB_DATUMCAT(ocbuf, (char *)&c, 1);
    /* the entry goes to the index cache and its size is added to the cache size */
    cbmapputcat(db->idxcc, kbuf, ksiz, CB_DATUMPTR(ocbuf), CB_DATUMSIZE(ocbuf));
    db->icsiz += CB_DATUMSIZE(ocbuf);
    CB_DATUMCLOSE(ocbuf);
  }
  cbmapclose(qmap);
  cbmapclose(fmap);
  cbmapclose(ocmap);
  err = FALSE;
  /* store the serialized attributes and texts under the new ID */
  sbuf = cbmapdump(doc->attrs, &ssiz);
  if(!est_crput(db->attrdb, db->zmode, doc->id, sbuf, ssiz, CR_DKEEP)){
    est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
    db->fatal = TRUE;
    err = TRUE;
  }
  free(sbuf);
  sbuf = cblistdump(doc->dtexts, &ssiz);
  if(!est_crput(db->textdb, db->zmode, doc->id, sbuf, ssiz, CR_DKEEP)){
    est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
    db->fatal = TRUE;
    err = TRUE;
  }
  free(sbuf);
  if(doc->kwords && !est_db_put_keywords(db, doc->id, doc->kwords, weight)) err = TRUE;
  /* register the URI with the ID in the document list */
  sprintf(nbuf, "%d", doc->id);
  if(!vlput(db->listdb, uri, -1, nbuf, -1, VL_DKEEP)){
    est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
    db->fatal = TRUE;
    err = TRUE;
  }
  if(cbmaprnum(db->aidxs) > 0){
    /* feed every attribute index whose name matches an attribute of the document */
    cbmapiterinit(db->aidxs);
    while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
      if(!(vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz))) continue;
      attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
      switch(attridx->type){
      case ESTIDXATTRSTR:
      case ESTIDXATTRNUM:
        if(!est_aidx_attr_put(attridx->db, doc->id, vbuf, vsiz)){
          est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
          db->fatal = TRUE;
          err = TRUE;
        }
        break;
      default:
        if(!est_aidx_seq_put(attridx->db, doc->id, vbuf, vsiz)){
          est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
          db->fatal = TRUE;
          err = TRUE;
        }
        break;
      }
    }
  }
  db->dnum++;
  /* flush the caches when the used cache size exceeds the limit */
  if(est_db_used_cache_size(db) > db->icmax && !est_db_flush(db, INT_MAX)) err = TRUE;
  return err ? FALSE : TRUE;
}
2320 
2321 
2322 /* Remove a document from a database. */
est_db_out_doc(ESTDB * db,int id,int options)2323 int est_db_out_doc(ESTDB *db, int id, int options){
2324   ESTDOC *doc;
2325   CBLIST *words;
2326   ESTATTRIDX *attridx;
2327   const char *uri, *kbuf, *vbuf, *text, *word;
2328   char numbuf[ESTNUMBUFSIZ];
2329   int i, j, ksiz, vsiz, len, wsiz;
2330   assert(db && id > 0);
2331   if(!dpwritable(db->metadb)){
2332     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
2333     return FALSE;
2334   }
2335   if(id >= ESTPDOCIDMIN){
2336     est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
2337     return FALSE;
2338   }
2339   if(!(doc = est_db_get_doc(db, id, ESTGDNOKWD))) return FALSE;
2340   if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL))){
2341     est_doc_delete(doc);
2342     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2343     db->fatal = TRUE;
2344     return FALSE;
2345   }
2346   if(!est_crout(db->attrdb, id) || !est_crout(db->textdb, id) || !vlout(db->listdb, uri, -1)){
2347     est_doc_delete(doc);
2348     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2349     db->fatal = TRUE;
2350     return FALSE;
2351   }
2352   cbmapout(db->attrcc, (char *)&id, sizeof(int));
2353   cbmapout(db->textcc, (char *)&id, sizeof(int));
2354   if(db->spacc) cbmapout(db->spacc, (char *)&id, sizeof(int));
2355   if(cbmaprnum(db->aidxs) > 0){
2356     cbmapiterinit(db->aidxs);
2357     while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
2358       if(!(vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz))) continue;
2359       attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2360       switch(attridx->type){
2361       case ESTIDXATTRSTR:
2362       case ESTIDXATTRNUM:
2363         if(!est_aidx_attr_out(attridx->db, doc->id, vbuf, vsiz)){
2364           est_doc_delete(doc);
2365           est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2366           db->fatal = TRUE;
2367           return FALSE;
2368         }
2369         break;
2370       default:
2371         if(!est_aidx_seq_out(attridx->db, doc->id)){
2372           est_doc_delete(doc);
2373           est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2374           db->fatal = TRUE;
2375           return FALSE;
2376         }
2377         break;
2378       }
2379     }
2380   }
2381   if(options & ESTODCLEAN){
2382     len = sprintf(numbuf, "\t%d", doc->id);
2383     cbmapput(db->outcc, numbuf, len, "", 0, FALSE);
2384     for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
2385       if(i < 0){
2386         if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
2387       } else {
2388         text = CB_LISTVAL(doc->dtexts, i);
2389       }
2390       CB_LISTOPEN(words);
2391       switch(db->amode){
2392       case ESTDFPERFNG:
2393         est_break_text_perfng(text, words, FALSE, TRUE);
2394         break;
2395       case ESTDFCHRCAT:
2396         est_break_text_chrcat(text, words, FALSE);
2397         break;
2398       default:
2399         est_break_text(text, words, FALSE, TRUE);
2400         break;
2401       }
2402       for(j = 0; j < CB_LISTNUM(words); j++){
2403         word = CB_LISTVAL2(words, j, wsiz);
2404         cbmapput(db->outcc, word, wsiz, "", 0, FALSE);
2405       }
2406       CB_LISTCLOSE(words);
2407     }
2408     if(!est_db_out_keywords(db, id) && db->ecode != ESTENOITEM){
2409       est_doc_delete(doc);
2410       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2411       db->fatal = TRUE;
2412       return FALSE;
2413     }
2414   } else {
2415     if(!est_crout(db->kwddb, id) && dpecode != DP_ENOITEM){
2416       est_doc_delete(doc);
2417       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2418       db->fatal = TRUE;
2419       return FALSE;
2420     }
2421     cbmapout(db->veccc, (char *)&id, sizeof(int));
2422   }
2423   est_doc_delete(doc);
2424   if(!est_db_set_doc_entity(db, id, NULL, -1) && db->ecode != ESTENOITEM) return FALSE;
2425   db->dnum--;
2426   return TRUE;
2427 }
2428 
2429 
2430 /* Edit attributes of a document object in a database. */
est_db_edit_doc(ESTDB * db,ESTDOC * doc)2431 int est_db_edit_doc(ESTDB *db, ESTDOC *doc){
2432   ESTDOC *odoc;
2433   ESTATTRIDX *attridx;
2434   const char *uri, *tmp, *kbuf, *vbuf;
2435   char *ouri, numbuf[ESTNUMBUFSIZ], *text, *sbuf;
2436   int err, id, oid, ksiz, vsiz, ssiz;
2437   assert(db && doc);
2438   if(!dpwritable(db->metadb)){
2439     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
2440     return FALSE;
2441   }
2442   id = -1;
2443   uri = NULL;
2444   if(doc->attrs){
2445     if((tmp = cbmapget(doc->attrs, ESTDATTRID, -1, NULL)) != NULL) id = atoi(tmp);
2446     if((tmp = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL)) != NULL) uri = tmp;
2447   }
2448   if(id < 1 || id >= ESTPDOCIDMIN || (doc->id > 0 && doc->id != id) || !uri || uri[0] == '\0'){
2449     est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
2450     return FALSE;
2451   }
2452   err = FALSE;
2453   if((oid = est_db_uri_to_id(db, uri)) == -1){
2454     if(!(ouri = est_db_get_doc_attr(db, id, ESTDATTRURI))){
2455       est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
2456       return FALSE;
2457     }
2458     sprintf(numbuf, "%d", id);
2459     if(!vlout(db->listdb, ouri, -1) || !vlput(db->listdb, uri, -1, numbuf, -1, VL_DKEEP))
2460       err = TRUE;
2461     free(ouri);
2462   } else if(oid != id){
2463     est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
2464     return FALSE;
2465   }
2466   doc->id = id;
2467   if(cbmaprnum(db->aidxs) > 0 && (odoc = est_db_get_doc(db, id, ESTGDNOTEXT))){
2468     if(!odoc->attrs) odoc->attrs = cbmapopenex(ESTMINIBNUM);
2469     cbmapiterinit(db->aidxs);
2470     while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
2471       if(!(vbuf = cbmapget(odoc->attrs, kbuf, ksiz, &vsiz))) continue;
2472       attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2473       switch(attridx->type){
2474       case ESTIDXATTRSTR:
2475       case ESTIDXATTRNUM:
2476         if(!est_aidx_attr_out(attridx->db, id, vbuf, vsiz)){
2477           est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2478           db->fatal = TRUE;
2479           err = TRUE;
2480         }
2481         break;
2482       default:
2483         if(!est_aidx_seq_out(attridx->db, id)){
2484           est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2485           db->fatal = TRUE;
2486           err = TRUE;
2487         }
2488         break;
2489       }
2490     }
2491     cbmapiterinit(db->aidxs);
2492     while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
2493       if(!(vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz))) continue;
2494       attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2495       switch(attridx->type){
2496       case ESTIDXATTRSTR:
2497       case ESTIDXATTRNUM:
2498         if(!est_aidx_attr_put(attridx->db, id, vbuf, vsiz)){
2499           est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2500           db->fatal = TRUE;
2501           err = TRUE;
2502         }
2503         break;
2504       default:
2505         if(!est_aidx_seq_put(attridx->db, id, vbuf, vsiz)){
2506           est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2507           db->fatal = TRUE;
2508           err = TRUE;
2509         }
2510         break;
2511       }
2512     }
2513     est_doc_delete(odoc);
2514   }
2515   if((text = est_db_get_doc_attr(db, id, "")) != NULL){
2516     cbmapput(doc->attrs, "", 0, text, -1, TRUE);
2517     free(text);
2518   }
2519   sbuf = cbmapdump(doc->attrs, &ssiz);
2520   if(!est_crput(db->attrdb, db->zmode, id, sbuf, ssiz, CR_DOVER)){
2521     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2522     db->fatal = TRUE;
2523     err = TRUE;
2524   }
2525   free(sbuf);
2526   cbmapout(db->attrcc, (char *)&id, sizeof(int));
2527   if(db->spacc) cbmapout(db->spacc, (char *)&id, sizeof(int));
2528   return err ? FALSE : TRUE;
2529 }
2530 
2531 
2532 /* Retrieve a document in a database. */
est_db_get_doc(ESTDB * db,int id,int options)2533 ESTDOC *est_db_get_doc(ESTDB *db, int id, int options){
2534   ESTDOC *doc;
2535   const char *cbuf;
2536   char *vbuf, numbuf[ESTNUMBUFSIZ];
2537   int i, csiz, vsiz, num;
2538   assert(db && id > 0);
2539   if(id >= ESTPDOCIDMIN){
2540     if((num = id - ESTPDOCIDMIN) >= CB_LISTNUM(db->pdocs)){
2541       est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2542       return NULL;
2543     }
2544     if((vbuf = cbreadfile(CB_LISTVAL(db->pdocs, num), NULL)) != NULL){
2545       doc = est_doc_new_from_draft(vbuf);
2546       free(vbuf);
2547     } else {
2548       doc = est_doc_new();
2549     }
2550     doc->id = id;
2551     sprintf(numbuf, "%d", id);
2552     est_doc_add_attr(doc, ESTDATTRID, numbuf);
2553     if(!est_doc_attr(doc, ESTDATTRURI))
2554       est_doc_add_attr(doc, ESTDATTRURI, CB_LISTVAL(db->pdocs, num));
2555     return doc;
2556   }
2557   cbuf = NULL;
2558   if(options & ESTGDNOATTR){
2559     if(crvsiz(db->attrdb, (char *)&id, sizeof(int)) == -1){
2560       if(dpecode == DP_ENOITEM){
2561         est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2562         return NULL;
2563       } else {
2564         est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2565         db->fatal = TRUE;
2566         return NULL;
2567       }
2568     }
2569     vbuf = NULL;
2570   } else if((cbuf = cbmapget(db->attrcc, (char *)&id, sizeof(int), &csiz)) != NULL){
2571     cbmapmove(db->attrcc, (char *)&id, sizeof(int), FALSE);
2572     vbuf = NULL;
2573   } else if(!(vbuf = est_crget(db->attrdb, db->zmode, id, &vsiz))){
2574     if(dpecode == DP_ENOITEM){
2575       est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2576       return NULL;
2577     } else {
2578       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2579       db->fatal = TRUE;
2580       return NULL;
2581     }
2582   }
2583   doc = est_doc_new();
2584   doc->id = id;
2585   if(cbuf){
2586     doc->attrs = cbmapload(cbuf, csiz);
2587   } else if(vbuf){
2588     doc->attrs = cbmapload(vbuf, vsiz);
2589     if(db->acmnum > 0) cbmapput(db->attrcc, (char *)&id, sizeof(int), vbuf, vsiz, TRUE);
2590     free(vbuf);
2591     if(cbmaprnum(db->attrcc) > db->acmnum){
2592       num = cbmaprnum(db->attrcc) * 0.1 + 1;
2593       cbmapiterinit(db->attrcc);
2594       for(i = 0; i < num && (cbuf = cbmapiternext(db->attrcc, NULL)) != NULL; i++){
2595         cbmapout(db->attrcc, cbuf, sizeof(int));
2596       }
2597     }
2598   } else {
2599     doc->attrs = NULL;
2600   }
2601   if(!(options & ESTGDNOTEXT)){
2602     if((cbuf = cbmapget(db->textcc, (char *)&id, sizeof(int), &csiz)) != NULL){
2603       cbmapmove(db->textcc, (char *)&id, sizeof(int), FALSE);
2604       doc->dtexts = cblistload(cbuf, csiz);
2605     } else {
2606       if(!(vbuf = est_crget(db->textdb, db->zmode, id, &vsiz))){
2607         est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2608         db->fatal = TRUE;
2609         est_doc_delete(doc);
2610         return NULL;
2611       }
2612       doc->dtexts = cblistload(vbuf, vsiz);
2613       if(db->tcmnum > 0) cbmapput(db->textcc, (char *)&id, sizeof(int), vbuf, vsiz, TRUE);
2614       free(vbuf);
2615       if(cbmaprnum(db->textcc) > db->tcmnum){
2616         num = cbmaprnum(db->textcc) * 0.1 + 1;
2617         cbmapiterinit(db->textcc);
2618         for(i = 0; i < num &&(cbuf = cbmapiternext(db->textcc, NULL)) != NULL; i++){
2619           cbmapout(db->textcc, cbuf, sizeof(int));
2620         }
2621       }
2622     }
2623   }
2624   if(!(options & ESTGDNOKWD)) doc->kwords = est_db_get_keywords(db, id);
2625   return doc;
2626 }
2627 
2628 
2629 /* Retrieve the value of an attribute of a document in a database. */
est_db_get_doc_attr(ESTDB * db,int id,const char * name)2630 char *est_db_get_doc_attr(ESTDB *db, int id, const char *name){
2631   ESTATTRIDX *attridx;
2632   ESTDOC *doc;
2633   const char *cbuf;
2634   char *mbuf, *vbuf;
2635   int cb, csiz, msiz, vsiz;
2636   assert(db && id > 0 && name);
2637   if(id >= ESTPDOCIDMIN){
2638     if(!(doc = est_db_get_doc(db, id, 0))){
2639       est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2640       return NULL;
2641     }
2642     if(!(cbuf = est_doc_attr(doc, name))){
2643       est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2644       est_doc_delete(doc);
2645       return NULL;
2646     }
2647     vbuf = cbmemdup(cbuf, -1);
2648     est_doc_delete(doc);
2649     return vbuf;
2650   }
2651   cb = db->spacc && !strcmp(name, db->scname);
2652   if(cb && (cbuf = cbmapget(db->spacc, (char *)&id, sizeof(int), &csiz)) != NULL){
2653     cbmapmove(db->spacc, (char *)&id, sizeof(int), FALSE);
2654     return cbmemdup(cbuf, csiz);
2655   }
2656   if((attridx = (ESTATTRIDX *)cbmapget(db->aidxs, name, -1, NULL)) != NULL &&
2657      attridx->type ==  ESTIDXATTRSEQ){
2658     if(!(vbuf = est_aidx_seq_get(attridx->db, id, &vsiz))){
2659       est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2660       return NULL;
2661     }
2662     if(cb) cbmapput(db->spacc, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
2663     return vbuf;
2664   }
2665   if(!(mbuf = est_crget(db->attrdb, db->zmode, id, &msiz))){
2666     est_set_ecode(&(db->ecode), dpecode == DP_ENOITEM ? ESTENOITEM : ESTEDB, __LINE__);
2667     return NULL;
2668   }
2669   if(!(vbuf = cbmaploadone(mbuf, msiz, name, -1, &vsiz))){
2670     est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2671     free(mbuf);
2672     return NULL;
2673   }
2674   if(cb) cbmapput(db->spacc, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
2675   free(mbuf);
2676   return vbuf;
2677 }
2678 
2679 
2680 /* Get the ID of a document spacified by URI. */
est_db_uri_to_id(ESTDB * db,const char * uri)2681 int est_db_uri_to_id(ESTDB *db, const char *uri){
2682   const char *vbuf;
2683   int id;
2684   assert(db && uri);
2685   if(!(vbuf = vlgetcache(db->listdb, uri, -1, NULL))){
2686     if(CB_LISTNUM(db->pdocs) > 0 && (id = est_pidx_uri_to_id(db, uri)) > 0) return id;
2687     est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2688     return -1;
2689   }
2690   return atoi(vbuf);
2691 }
2692 
2693 
2694 /* Get the name of a database. */
est_db_name(ESTDB * db)2695 const char *est_db_name(ESTDB *db){
2696   assert(db);
2697   return db->name;
2698 }
2699 
2700 
2701 /* Get the number of documents in a database. */
est_db_doc_num(ESTDB * db)2702 int est_db_doc_num(ESTDB *db){
2703   assert(db);
2704   return db->dnum;
2705 }
2706 
2707 
2708 /* Get the number of words in a database. */
est_db_word_num(ESTDB * db)2709 int est_db_word_num(ESTDB *db){
2710   int wnum;
2711   assert(db);
2712   wnum = vlrnum(db->fwmdb);
2713   return wnum > 0 ? wnum : 0;
2714 }
2715 
2716 
2717 /* Get the size of a database. */
est_db_size(ESTDB * db)2718 double est_db_size(ESTDB *db){
2719   ESTATTRIDX *attridx;
2720   const char *kbuf;
2721   double size;
2722   assert(db);
2723   size = (double)dpfsiz(db->metadb) + est_idx_size(db->idxdb) + vlfsiz(db->fwmdb) +
2724     vlfsiz(db->auxdb) + vlfsiz(db->xfmdb) + crfsizd(db->attrdb) + crfsizd(db->textdb) +
2725     crfsizd(db->kwddb) + vlfsiz(db->listdb);
2726   if(cbmaprnum(db->aidxs) > 0){
2727     cbmapiterinit(db->aidxs);
2728     while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
2729       attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2730       switch(attridx->type){
2731       case ESTIDXATTRSTR:
2732       case ESTIDXATTRNUM:
2733         size += vlfsiz(attridx->db);
2734         break;
2735       default:
2736         size += dpfsiz(attridx->db);
2737         break;
2738       }
2739     }
2740   }
2741   return size;
2742 }
2743 
2744 
2745 /* Search documents corresponding a condition for a database. */
est_db_search(ESTDB * db,ESTCOND * cond,int * nump,CBMAP * hints)2746 int *est_db_search(ESTDB *db, ESTCOND *cond, int *nump, CBMAP *hints){
2747   ESTSCORE *scores, *tscores;
2748   CBMAP *svmap, *ordattrs;
2749   CBLIST *terms;
2750   const char *term, *rp;
2751   char *tmp, numbuf[ESTNUMBUFSIZ];
2752   const int *nscores;
2753   int i, j, snum, ign, nsnum, unum, knum, mnum, top, pcnum, ncnum, tsnum, add;
2754   int nnum, id, score, hnum, len, rest, *rval, rnum;
2755   double tune;
2756   assert(db && cond && nump);
2757   if(cond->auxwords) cbmapclose(cond->auxwords);
2758   cond->auxwords = cbmapopenex(ESTMINIBNUM);
2759   scores = NULL;
2760   snum = 0;
2761   ign = -1;
2762   nscores = cond->nscores;
2763   nsnum = cond->nsnum;
2764   ordattrs = cbmapopenex(cond->order ? (CB_LISTNUM(db->pdocs) + ESTMINIBNUM) : 1);
2765   if(cond->phrase){
2766     if(cbstrfwmatch(cond->phrase, ESTOPID)){
2767       if((id = atoi(cond->phrase + strlen(ESTOPID))) > 0){
2768         CB_MALLOC(scores, sizeof(ESTSCORE));
2769         scores[0].id = id;
2770         scores[0].score = 0;
2771         scores[0].value = NULL;
2772         snum = 1;
2773       } else {
2774         CB_MALLOC(scores, 1);
2775         snum = 0;
2776       }
2777     } else if(cbstrfwmatch(cond->phrase, ESTOPURI)){
2778       rp = cond->phrase + strlen(ESTOPURI);
2779       while(*rp > '\0' && *rp <= ' '){
2780         rp++;
2781       }
2782       if((id = est_db_uri_to_id(db, rp)) > 0){
2783         CB_MALLOC(scores, sizeof(ESTSCORE));
2784         scores[0].id = id;
2785         scores[0].score = 0;
2786         scores[0].value = NULL;
2787         snum = 1;
2788       } else {
2789         CB_MALLOC(scores, 1);
2790         snum = 0;
2791       }
2792     } else if(cbstrfwmatch(cond->phrase, ESTOPSIMILAR)){
2793       rp = cond->phrase + strlen(ESTOPSIMILAR);
2794       while(*rp > '\0' && *rp <= ' '){
2795         rp++;
2796       }
2797       knum = -1;
2798       unum = -1;
2799       mnum = -1;
2800       if(*rp >= '0' && *rp <= '9'){
2801         knum = atoi(rp);
2802         while(*rp >= '0' && *rp <= '9'){
2803           rp++;
2804         }
2805         while(*rp > '\0' && *rp <= ' '){
2806           rp++;
2807         }
2808         if(*rp >= '0' && *rp <= '9'){
2809           unum = atoi(rp);
2810           while(*rp >= '0' && *rp <= '9'){
2811             rp++;
2812           }
2813           while(*rp > '\0' && *rp <= ' '){
2814             rp++;
2815           }
2816           if(*rp >= '0' && *rp <= '9'){
2817             mnum = atoi(rp);
2818             while(*rp >= '0' && *rp <= '9'){
2819               rp++;
2820             }
2821             while(*rp > '\0' && *rp <= ' '){
2822               rp++;
2823             }
2824           }
2825         }
2826       }
2827       if(knum < 1) knum = ESTSMLRKNUM;
2828       if(unum < 1) unum = ESTSMLRUNUM;
2829       if(mnum < 1) mnum = ESTSMLRMNUM;
2830       svmap = est_phrase_vector(rp);
2831       scores = est_search_similar(db, svmap, &snum, knum, unum, mnum, cond->tfidf,
2832                                   cond->order ? ESTSMLRNMIN : 0.0, cond->auxmin, cond->auxwords);
2833       cbmapclose(svmap);
2834     } else if(cbstrfwmatch(cond->phrase, ESTOPRANK)){
2835       rp = cond->phrase + strlen(ESTOPRANK);
2836       while(*rp > '\0' && *rp <= ' '){
2837         rp++;
2838       }
2839       top = atoi(rp);
2840       while((*rp >= '0' && *rp <= '9') || *rp == '-'){
2841         rp++;
2842       }
2843       while(*rp > '\0' && *rp <= ' '){
2844         rp++;
2845       }
2846       scores = est_search_rank(db, rp, top, &snum);
2847     } else {
2848       switch(cond->pmode){
2849       default:
2850         terms = est_phrase_terms(cond->phrase);
2851         break;
2852       case ESTPMSIMPLE:
2853         tmp = est_phrase_from_simple(cond->phrase);
2854         terms = est_phrase_terms(tmp);
2855         free(tmp);
2856         break;
2857       case ESTPMROUGH:
2858         tmp = est_phrase_from_rough(cond->phrase);
2859         terms = est_phrase_terms(tmp);
2860         free(tmp);
2861         break;
2862       case ESTPMUNION:
2863         tmp = est_phrase_from_union(cond->phrase);
2864         terms = est_phrase_terms(tmp);
2865         free(tmp);
2866         break;
2867       case ESTPMISECT:
2868         tmp = est_phrase_from_isect(cond->phrase);
2869         terms = est_phrase_terms(tmp);
2870         free(tmp);
2871         break;
2872       }
2873       pcnum = 0;
2874       ncnum = 0;
2875       add = TRUE;
2876       for(i = 0; i < CB_LISTNUM(terms); i++){
2877         term = CB_LISTVAL(terms, i);
2878         if(!strcmp(term, ESTOPISECT)){
2879           add = TRUE;
2880         } else if(!strcmp(term, ESTOPDIFF)){
2881           add = FALSE;
2882         } else {
2883           if(!strcmp(term, ESTOPUVSET)){
2884             tscores = est_search_uvset(db, &tsnum, hints, add);
2885           } else {
2886             tscores = est_search_union(db, term, cond->gstep, cond->cbxpn, &tsnum, hints, add,
2887                                        add && !cond->order ? cond->auxmin : -1, cond->auxwords);
2888           }
2889           if(add){
2890             if(db->smode != ESTDFSCASIS){
2891               if(cond->tfidf){
2892                 tune = pow(tsnum + 64, 0.4);
2893                 for(j = 0; j < tsnum; j++){
2894                   tscores[j].score *= 100.0 / tune;
2895                 }
2896               } else {
2897                 for(j = 0; j < tsnum; j++){
2898                   tscores[j].score *= 10;
2899                 }
2900               }
2901             }
2902             pcnum++;
2903           } else {
2904             ncnum++;
2905           }
2906           if(scores){
2907             CB_REALLOC(scores, (snum + tsnum) * sizeof(ESTSCORE) + 1);
2908             for(j = 0; j < tsnum; j++){
2909               scores[snum+j].id = tscores[j].id;
2910               scores[snum+j].score = add ? tscores[j].score : -1;
2911               scores[snum+j].value = NULL;
2912             }
2913             snum += tsnum;
2914             free(tscores);
2915           } else {
2916             scores = tscores;
2917             snum = tsnum;
2918           }
2919         }
2920       }
2921       if(scores){
2922         if(pcnum > 1 || ncnum > 0){
2923           qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
2924           nnum = 0;
2925           for(i = 0; i < snum; i++){
2926             id = scores[i].id;
2927             score = scores[i].score;
2928             hnum = score >= 0 ? 1 : 0;
2929             for(j = i + 1; j < snum && scores[j].id == id; j++){
2930               if(score >= 0 && scores[j].score >= 0){
2931                 if(db->smode != ESTDFSCASIS) score += scores[j].score;
2932                 hnum++;
2933               } else {
2934                 score = -1;
2935               }
2936             }
2937             if(score >= 0 && hnum >= pcnum){
2938               scores[nnum].id = id;
2939               scores[nnum].score = score;
2940               scores[nnum].value = NULL;
2941               nnum++;
2942             }
2943             i = j - 1;
2944           }
2945           snum = nnum;
2946         }
2947       } else {
2948         CB_MALLOC(scores, 1);
2949         snum = 0;
2950       }
2951       CB_LISTCLOSE(terms);
2952     }
2953   } else if(cond->attrs){
2954     if(nscores && nsnum < ESTAISNUMMIN * 4){
2955       CB_MALLOC(scores, nsnum * sizeof(ESTSCORE) + 1);
2956       nnum = 0;
2957       for(i = 0; i < nsnum; i++){
2958         sprintf(numbuf, "%d", nscores[i]);
2959         if((id = est_db_uri_to_id(db, numbuf)) > 0){
2960           scores[nnum].id = id;
2961           scores[nnum].score = nscores[i];
2962           scores[nnum].value = NULL;
2963           nnum++;
2964         }
2965       }
2966       snum = nnum;
2967       nscores = NULL;
2968       nsnum = -1;
2969     } else {
2970       scores = NULL;
2971       for(i = 0; i < CB_LISTNUM(cond->attrs); i++){
2972         if((scores = est_search_aidx_attr(db, CB_LISTVAL(cond->attrs, i), &snum)) != NULL){
2973           ign = i;
2974           break;
2975         }
2976       }
2977       if(!scores) scores = est_search_uvset(db, &snum, hints, TRUE);
2978     }
2979   } else {
2980     CB_MALLOC(scores, 1);
2981     snum = 0;
2982   }
2983   if(CB_LISTNUM(db->pdocs) > 0) scores = est_search_pidxs(db, cond, scores, &snum, ordattrs);
2984   if(nscores && cond->phrase && cond->phrase[0] != '\0'){
2985     qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
2986     nnum = 0;
2987     j = 0;
2988     for(i = 0; i < snum; i++){
2989       id = scores[i].id;
2990       score = scores[i].score;
2991       while(j < nsnum && nscores[j] > score){
2992         j++;
2993       }
2994       if(j < nsnum && nscores[j] == score){
2995         scores[nnum].id = id;
2996         scores[nnum].score = score;
2997         scores[nnum].value = NULL;
2998         nnum++;
2999         j++;
3000       }
3001     }
3002     snum = nnum;
3003   }
3004   if(cbmaprnum(db->outcc) > 0){
3005     tsnum = 0;
3006     for(i = 0; i < snum; i++){
3007       len = sprintf(numbuf, "\t%d", scores[i].id);
3008       if(cbmapget(db->outcc, numbuf, len, NULL)) continue;
3009       scores[tsnum++] = scores[i];
3010     }
3011     snum = tsnum;
3012   }
3013   if(cond->max > 0 && cond->max * ESTATTRALW + 1 < snum && cond->attrs &&
3014      !cond->order && !cond->distinct){
3015     qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
3016     nnum = est_narrow_scores(db, cond->attrs, ign, cond->order, cond->distinct, scores, snum,
3017                              cond->max * ESTATTRALW + 1, &rest, ordattrs);
3018     if(hints){
3019       sprintf(numbuf, "%d",
3020               rest > cond->max / 2 ? (int)(snum * (nnum / (double)(snum - rest))) : nnum);
3021       cbmapput(hints, "", 0, numbuf, -1, TRUE);
3022     }
3023     snum = nnum;
3024   } else {
3025     if(cond->attrs || cond->order || cond->distinct)
3026       snum = est_narrow_scores(db, cond->attrs, ign, cond->order, cond->distinct, scores, snum,
3027                                INT_MAX, &rest, ordattrs);
3028     if(!cond->order) qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
3029     if(hints){
3030       sprintf(numbuf, "%d", snum);
3031       cbmapput(hints, "", 0, numbuf, -1, TRUE);
3032     }
3033   }
3034   if(cond->shadows) cbmapclose(cond->shadows);
3035   if(cond->ecllim >= 0.0){
3036     cond->shadows = cbmapopenex(snum + 1);
3037     snum = est_eclipse_scores(db, scores, snum, cond->max > 0 ? cond->max : snum,
3038                               ESTECLKNUM, cond->tfidf, cond->ecllim, cond->shadows);
3039   } else {
3040     cond->shadows = NULL;
3041   }
3042   rnum = snum - cond->skip;
3043   if(rnum < 0) rnum = 0;
3044   if(cond->max >= 0 && cond->max < rnum) rnum = cond->max;
3045   CB_MALLOC(rval, rnum * sizeof(int) + 1);
3046   tscores = scores + cond->skip;
3047   for(i = 0; i < rnum; i++){
3048     rval[i] = tscores[i].id;
3049   }
3050   if(cond->scfb){
3051     if(rnum > 0){
3052       CB_REALLOC(cond->scores, rnum * sizeof(int) + 1);
3053       for(i = 0; i < rnum; i++){
3054         cond->scores[i] = tscores[i].score;
3055       }
3056       cond->snum = rnum;
3057     } else {
3058       free(cond->scores);
3059       cond->scores = NULL;
3060       cond->snum = 0;
3061     }
3062   }
3063   *nump = rnum;
3064   if(*nump < 1) est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
3065   cbmapclose(ordattrs);
3066   free(scores);
3067   return rval;
3068 }
3069 
3070 
3071 /* Search documents of plural databases. */
est_db_search_meta(ESTDB ** dbs,int dbnum,ESTCOND * cond,int * nump,CBMAP * hints)3072 int *est_db_search_meta(ESTDB **dbs, int dbnum, ESTCOND *cond, int *nump, CBMAP *hints){
3073   ESTMETASCORE *scores, *tscores;
3074   ESTCOND *tcond;
3075   CBMAP *thints, *umap;
3076   const char *kbuf, *otype, *rp;
3077   char *distinct, numbuf[ESTNUMBUFSIZ], *oname, *wp, *vbuf;
3078   int i, j, max, skip, smax, snum, *res, rnum, ksiz, num;
3079   time_t tval;
3080   assert(dbs && dbnum >= 0 && cond && nump);
3081   max = cond->max;
3082   if(cond->distinct) cond->max = -1;
3083   skip = cond->skip;
3084   cond->skip = 0;
3085   distinct = cond->distinct;
3086   cond->distinct = NULL;
3087   smax = ESTALLOCUNIT;
3088   CB_MALLOC(scores, smax * sizeof(ESTMETASCORE));
3089   snum = 0;
3090   for(i = 0; i < dbnum; i++){
3091     if(cond->mask & (1 << i)) continue;
3092     tcond = est_cond_dup(cond);
3093     est_cond_set_options(tcond, ESTCONDSCFB);
3094     thints = cbmapopenex(ESTMINIBNUM);
3095     res = est_db_search(dbs[i], tcond, &rnum, thints);
3096     for(j = 0; j < rnum; j++){
3097       if(snum >= smax){
3098         smax *= 2;
3099         CB_REALLOC(scores, smax * sizeof(ESTMETASCORE));
3100       }
3101       scores[snum].db = i;
3102       scores[snum].id = res[j];
3103       scores[snum].score = est_cond_score(tcond, j);
3104       scores[snum].value = NULL;
3105       snum++;
3106     }
3107     if(hints){
3108       cbmapiterinit(thints);
3109       while((kbuf = cbmapiternext(thints, &ksiz)) != NULL){
3110         num = atoi(cbmapiterval(kbuf, NULL));
3111         if((rp = cbmapget(hints, kbuf, ksiz, NULL)) != NULL) num += atoi(rp);
3112         sprintf(numbuf, "%d", num);
3113         cbmapput(hints, kbuf, ksiz, numbuf, -1, TRUE);
3114       }
3115     }
3116     free(res);
3117     cbmapclose(thints);
3118     est_cond_delete(tcond);
3119   }
3120   oname = NULL;
3121   otype = NULL;
3122   if(cond->order){
3123     oname = cbmemdup(cond->order, -1);
3124     cbstrtrim(oname);
3125     otype = ESTORDSTRA;
3126     if((wp = strchr(oname, ' ')) != NULL){
3127       *wp = '\0';
3128       rp = wp + 1;
3129       while(*rp == ' '){
3130         rp++;
3131       }
3132       otype = rp;
3133     }
3134   }
3135   if(oname){
3136     if(!cbstricmp(oname, ESTORDIDA)){
3137       qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_id_asc);
3138     } else if(!cbstricmp(oname, ESTORDIDD)){
3139       qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_id_desc);
3140     } else if(!cbstricmp(oname, ESTORDSCA)){
3141       qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_score_asc);
3142     } else if(!cbstricmp(oname, ESTORDSCD)){
3143       qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_score_desc);
3144     } else {
3145       for(i = 0; i < snum; i++){
3146         scores[i].value = est_db_get_doc_attr(dbs[scores[i].db], scores[i].id, oname);
3147         if(!scores[i].value) scores[i].value = cbmemdup("", 0);
3148       }
3149       if(!cbstricmp(otype, ESTORDSTRA)){
3150         qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_str_asc);
3151       } else if(!cbstricmp(otype, ESTORDSTRD)){
3152         qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_str_desc);
3153       } else if(!cbstricmp(otype, ESTORDNUMA)){
3154         for(i = 0; i < snum; i++){
3155           tval = cbstrmktime(scores[i].value);
3156           free(scores[i].value);
3157           scores[i].value = (void *)tval;
3158         }
3159         qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_num_asc);
3160         for(i = 0; i < snum; i++){
3161           scores[i].value = NULL;
3162         }
3163       } else if(!cbstricmp(otype, ESTORDNUMD)){
3164         for(i = 0; i < snum; i++){
3165           tval = cbstrmktime(scores[i].value);
3166           free(scores[i].value);
3167           scores[i].value = (void *)tval;
3168         }
3169         qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_num_desc);
3170         for(i = 0; i < snum; i++){
3171           scores[i].value = NULL;
3172         }
3173       }
3174       for(i = 0; i < snum; i++){
3175         free(scores[i].value);
3176       }
3177     }
3178     free(oname);
3179   } else {
3180     qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_score_desc);
3181   }
3182   if(distinct){
3183     umap = cbmapopenex(snum + 1);
3184     rnum = 0;
3185     for(i = 0; i < snum; i++){
3186       vbuf = est_db_get_doc_attr(dbs[scores[i].db], scores[i].id, distinct);
3187       if(!vbuf) vbuf = cbmemdup("", 0);
3188       if(cbmapput(umap, vbuf, -1, "", 0, FALSE)) scores[rnum++] = scores[i];
3189       free(vbuf);
3190     }
3191     snum = rnum;
3192     cbmapclose(umap);
3193   }
3194   rnum = snum - skip;
3195   if(rnum < 0) rnum = 0;
3196   if(cond->max >= 0 && cond->max < rnum) rnum = cond->max;
3197   CB_MALLOC(res, rnum * sizeof(int) * 2 + 1);
3198   tscores = scores + skip;
3199   for(i = 0; i < rnum; i++){
3200     res[i*2] = tscores[i].db;
3201     res[i*2+1] = tscores[i].id;
3202   }
3203   if(cond->scfb){
3204     if(rnum > 0){
3205       CB_REALLOC(cond->scores, rnum * sizeof(int) + 1);
3206       for(i = 0; i < rnum; i++){
3207         cond->scores[i] = tscores[i].score;
3208       }
3209       cond->snum = rnum;
3210     } else {
3211       free(cond->scores);
3212       cond->scores = NULL;
3213       cond->snum = 0;
3214     }
3215   }
3216   *nump = rnum * 2;
3217   free(scores);
3218   cond->max = max;
3219   cond->skip = skip;
3220   cond->distinct = distinct;
3221   return res;
3222 }
3223 
3224 
3225 /* Check whether a document object matches the phrase of a search condition object definitely. */
est_db_scan_doc(ESTDB * db,ESTDOC * doc,ESTCOND * cond)3226 int est_db_scan_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond){
3227   struct { char *word; int num; } wsets[ESTSCANWNUM];
3228   CBLIST *terms, *words;
3229   const char *term, *text;
3230   unsigned char *rbuf;
3231   char *tmp;
3232   int i, j, k, wsnum, add, rsiz, hit;
3233   assert(db && doc && cond);
3234   if(!cond->phrase || cbstrfwmatch(cond->phrase, ESTOPSIMILAR) ||
3235      cbstrfwmatch(cond->phrase, ESTOPID) || cbstrfwmatch(cond->phrase, ESTOPURI)) return FALSE;
3236   if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
3237   switch(cond->pmode){
3238   default:
3239     terms = est_phrase_terms(cond->phrase);
3240     break;
3241   case ESTPMSIMPLE:
3242     tmp = est_phrase_from_simple(cond->phrase);
3243     terms = est_phrase_terms(tmp);
3244     free(tmp);
3245     break;
3246   case ESTPMROUGH:
3247     tmp = est_phrase_from_rough(cond->phrase);
3248     terms = est_phrase_terms(tmp);
3249     free(tmp);
3250     break;
3251   case ESTPMUNION:
3252     tmp = est_phrase_from_union(cond->phrase);
3253     terms = est_phrase_terms(tmp);
3254     free(tmp);
3255     break;
3256   case ESTPMISECT:
3257     tmp = est_phrase_from_isect(cond->phrase);
3258     terms = est_phrase_terms(tmp);
3259     free(tmp);
3260     break;
3261   }
3262   wsnum = 0;
3263   add = TRUE;
3264   for(i = 0; i < CB_LISTNUM(terms); i++){
3265     term = CB_LISTVAL(terms, i);
3266     if(!strcmp(term, ESTOPISECT)){
3267       add = TRUE;
3268     } else if(!strcmp(term, ESTOPDIFF)){
3269       add = FALSE;
3270     } else if(add && strcmp(term, ESTOPUVSET)){
3271       if(term[0] == ' '){
3272         term++;
3273         if(term[0] == 'b'){
3274           term++;
3275         } else  if(term[0] == 'e'){
3276           term++;
3277         }
3278       }
3279       words = cbsplit(term, -1, "\t");
3280       while(wsnum < ESTSCANWNUM && CB_LISTNUM(words) > 0){
3281         wsets[wsnum].word = cblistshift(words, NULL);
3282         wsets[wsnum].num = i;
3283         wsnum++;
3284       }
3285       CB_LISTCLOSE(words);
3286     }
3287   }
3288   for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
3289     if(i < 0){
3290       if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
3291     } else {
3292       text = CB_LISTVAL(doc->dtexts, i);
3293     }
3294     rbuf = (unsigned char *)est_uconv_in(text, strlen(text), &rsiz);
3295     est_canonicalize_text(rbuf, rsiz, FALSE);
3296     tmp = est_uconv_out((char *)rbuf, rsiz, &rsiz);
3297     for(j = 0; j < wsnum; j++){
3298       if(!wsets[j].word) continue;
3299       if(est_strstr_sparse(tmp, wsets[j].word)){
3300         for(k = 0; k < wsnum; k++){
3301           if(!wsets[k].word) continue;
3302           if(wsets[k].num == wsets[j].num){
3303             free(wsets[k].word);
3304             wsets[k].word = NULL;
3305           }
3306         }
3307       }
3308     }
3309     free(tmp);
3310     free(rbuf);
3311   }
3312   hit = TRUE;
3313   for(i = 0; i < wsnum; i++){
3314     if(!wsets[i].word) continue;
3315     free(wsets[i].word);
3316     hit = FALSE;
3317   }
3318   CB_LISTCLOSE(terms);
3319   return hit;
3320 }
3321 
3322 
3323 /* Set the maximum size of the cache memory of a database. */
est_db_set_cache_size(ESTDB * db,size_t size,int anum,int tnum,int rnum)3324 void est_db_set_cache_size(ESTDB *db, size_t size, int anum, int tnum, int rnum){
3325   assert(db);
3326   if(dpwritable(db->metadb) && size >= 0) db->icmax = size;
3327   if(anum >= 0) db->acmnum = anum;
3328   if(tnum >= 0) db->tcmnum = tnum;
3329   if(rnum >= 0) db->rcmnum = rnum;
3330   db->vcmnum = db->acmnum / 2;
3331 }
3332 
3333 
3334 /* Add a pseudo index directory to a database. */
est_db_add_pseudo_index(ESTDB * db,const char * path)3335 int est_db_add_pseudo_index(ESTDB *db, const char *path){
3336   CBLIST *files;
3337   const char *file;
3338   char pbuf[ESTPATHBUFSIZ];
3339   int i, len;
3340   assert(db && path);
3341   if(!(files = cbdirlist(path))){
3342     est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
3343     return FALSE;
3344   }
3345   cblistsort(files);
3346   for(i = 0; i < CB_LISTNUM(files); i++){
3347     file = CB_LISTVAL(files, i);
3348     if(!strcmp(file, ESTCDIRSTR) || !strcmp(file, ESTPDIRSTR)) continue;
3349     len = sprintf(pbuf, "%s%c%s", path, ESTPATHCHR, file);
3350     CB_LISTPUSH(db->pdocs, pbuf, len);
3351   }
3352   CB_LISTCLOSE(files);
3353   if(db->puris){
3354     cbmapclose(db->puris);
3355     db->puris = NULL;
3356   }
3357   return TRUE;
3358 }
3359 
3360 
3361 
3362 /*************************************************************************************************
3363  * features for experts
3364  *************************************************************************************************/
3365 
3366 
/* Handle to the file of random number generator.
   While it is `NULL', the first call of `est_random' tries to open "/dev/urandom" into it. */
FILE *est_random_ifp = NULL;


/* POSIX signal handlers.
   Table of `ESTSIGNUM' pointers to functions taking the `int' argument.  Presumably indexed by
   the signal number -- confirm against the code which installs the handlers. */
void (*est_signal_handlers[ESTSIGNUM])(int);
3373 
3374 
/* Break a sentence of text and extract words.
   `text' specifies a sentence of text.  It is converted with `est_uconv_in' and handled as a
   sequence of two-byte units.
   `list' specifies a list object to which extracted words are added.
   `norm' specifies whether to normalize the text with `est_normalize_text'.
   `tail' specifies whether to keep the last word when it is a single character of the
   `ESTEASTALPH' category. */
void est_break_text(const char *text, CBLIST *list, int norm, int tail){
  CBLIST *words;
  const unsigned char *word, *next;
  unsigned char *utext;
  char *tmp;
  int i, j, k, size, cc, wsiz, nsiz, tsiz;
  assert(text && list);
  utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
  if(norm) est_normalize_text(utext, size, &size);
  est_canonicalize_text(utext, size, FALSE);
  CB_LISTOPEN(words);
  /* first pass: cut the text into runs of characters of the same category */
  for(i = 0; i < size; i += 2){
    cc = est_char_category(utext[i] * 0x100 + utext[i+1]);
    /* `j' stops at the first character whose category differs, or at the end of the text */
    for(j = i + 2; j < size; j += 2){
      if(est_char_category(utext[j] * 0x100 + utext[j+1]) != cc) break;
    }
    switch(cc){
    case ESTDELIMCHR:
    case ESTWESTALPH:
      /* the whole run becomes one word */
      CB_LISTPUSH(words, (char *)(utext + i), j - i);
      break;
    case ESTEASTALPH:
      /* overlapping windows of two characters; the last character of the run stands alone */
      for(k = i; k < j; k += 2){
        if(j - k >= 4){
          CB_LISTPUSH(words, (char *)(utext + k), 4);
        } else {
          CB_LISTPUSH(words, (char *)(utext + k), 2);
        }
      }
      break;
    default:
      /* runs of the other categories are dropped */
      break;
    }
    /* the loop increment moves `i' to `j', the head of the next run */
    i = j - 2;
  }
  /* second pass: extend a lone `ESTEASTALPH' character with the head of the following word */
  for(i = 0; i < CB_LISTNUM(words); i++){
    word = (unsigned char *)CB_LISTVAL2(words, i, wsiz);
    if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
       i < CB_LISTNUM(words) - 1){
      next = (unsigned char *)cblistval(words, i + 1, &nsiz);
      /* borrow at most two characters, or only one if the next word also begins with an
         `ESTEASTALPH' character */
      if(nsiz > 4) nsiz = 4;
      if(est_char_category(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
      CB_MALLOC(tmp, wsiz + nsiz + 1);
      memcpy(tmp, word, wsiz);
      memcpy(tmp + wsiz, next, nsiz);
      cblistover(words, i, tmp, wsiz + nsiz);
      free(tmp);
    }
  }
  /* third pass: convert each word with `est_uconv_out' and hand it over to the result list */
  for(i = 0; i < CB_LISTNUM(words); i++){
    word = (unsigned char *)CB_LISTVAL2(words, i, wsiz);
    /* without `tail', a final word of one `ESTEASTALPH' character is not emitted */
    if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
      if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
    }
    tmp = est_uconv_out((char *)word, wsiz, &tsiz);
    CB_LISTPUSHBUF(list, tmp, tsiz);
  }
  CB_LISTCLOSE(words);
  free(utext);
}
3436 
3437 
/* Break a sentence of text and extract words using perfect N-gram analyzer.
   `text' specifies a sentence of text.  It is converted with `est_uconv_in' and handled as a
   sequence of two-byte units.
   `list' specifies a list object to which extracted words are added.
   `norm' specifies whether to normalize the text with `est_normalize_text'.
   `tail' specifies whether to keep the last word when it is a single character of the
   `ESTEASTALPH' category.
   Only runs which `est_char_category_perfng' classifies as `ESTEASTALPH' yield words. */
void est_break_text_perfng(const char *text, CBLIST *list, int norm, int tail){
  CBLIST *words;
  const unsigned char *word, *next;
  unsigned char *utext;
  char *tmp;
  int i, j, k, size, cc, wsiz, nsiz, tsiz;
  assert(text && list);
  utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
  if(norm) est_normalize_text(utext, size, &size);
  est_canonicalize_text(utext, size, FALSE);
  CB_LISTOPEN(words);
  /* first pass: cut the text into runs of characters of the same category */
  for(i = 0; i < size; i += 2){
    cc = est_char_category_perfng(utext[i] * 0x100 + utext[i+1]);
    /* `j' stops at the first character whose category differs, or at the end of the text */
    for(j = i + 2; j < size; j += 2){
      if(est_char_category_perfng(utext[j] * 0x100 + utext[j+1]) != cc) break;
    }
    switch(cc){
    case ESTEASTALPH:
      /* overlapping windows of two characters; the last character of the run stands alone */
      for(k = i; k < j; k += 2){
        if(j - k >= 4){
          CB_LISTPUSH(words, (char *)(utext + k), 4);
        } else {
          CB_LISTPUSH(words, (char *)(utext + k), 2);
        }
      }
      break;
    default:
      /* runs of the other categories are dropped */
      break;
    }
    /* the loop increment moves `i' to `j', the head of the next run */
    i = j - 2;
  }
  /* second pass: extend a lone `ESTEASTALPH' character with the head of the following word */
  for(i = 0; i < CB_LISTNUM(words); i++){
    word = (unsigned char *)CB_LISTVAL2(words, i, wsiz);
    if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
       i < CB_LISTNUM(words) - 1){
      next = (unsigned char *)cblistval(words, i + 1, &nsiz);
      /* borrow at most two characters, or only one if the next word also begins with an
         `ESTEASTALPH' character */
      if(nsiz > 4) nsiz = 4;
      if(est_char_category_perfng(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
      CB_MALLOC(tmp, wsiz + nsiz + 1);
      memcpy(tmp, word, wsiz);
      memcpy(tmp + wsiz, next, nsiz);
      cblistover(words, i, tmp, wsiz + nsiz);
      free(tmp);
    }
  }
  /* third pass: convert each word with `est_uconv_out' and hand it over to the result list */
  for(i = 0; i < CB_LISTNUM(words); i++){
    word = (unsigned char *)CB_LISTVAL2(words, i, wsiz);
    /* without `tail', a final word of one `ESTEASTALPH' character is not emitted */
    if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
      if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
    }
    tmp = est_uconv_out((char *)word, wsiz, &tsiz);
    CB_LISTPUSHBUF(list, tmp, tsiz);
  }
  CB_LISTCLOSE(words);
  free(utext);
}
3495 
3496 
3497 /* Make a snippet of an arbitrary string. */
est_str_make_snippet(const char * str,const CBLIST * words,int wwidth,int hwidth,int awidth)3498 char *est_str_make_snippet(const char *str, const CBLIST *words,
3499                            int wwidth, int hwidth, int awidth){
3500   assert(str && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
3501   return est_make_snippet(str, strlen(str), words, wwidth, hwidth, awidth);
3502 }
3503 
3504 
3505 /* Break a sentence of text and extract words, using character category analyzer.
3506    `text' specifies a sentence of text.
3507    `list' specifies a list object to which extract words are added.
3508    `norm' specifies whether to normalize the text. */
est_break_text_chrcat(const char * text,CBLIST * list,int norm)3509 void est_break_text_chrcat(const char *text, CBLIST *list, int norm){
3510   unsigned char *utext;
3511   char *tmp;
3512   int i, j, size, cc, tsiz;
3513   assert(text && list);
3514   utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
3515   if(norm) est_normalize_text(utext, size, &size);
3516   est_canonicalize_text(utext, size, FALSE);
3517   for(i = 0; i < size; i += 2){
3518     cc = est_char_category_chrcat(utext[i] * 0x100 + utext[i+1]);
3519     for(j = i + 2; j < size; j += 2){
3520       if(est_char_category_chrcat(utext[j] * 0x100 + utext[j+1]) != cc &&
3521          (cc != ESTWESTALPH || utext[j] != 0x00 || utext[j+1] != 0x2d) &&
3522          (cc != ESTHIRAGANA || utext[j] != 0x30 || utext[j+1] != 0xfc)) break;
3523     }
3524     if(cc != ESTSPACECHR){
3525       tmp = est_uconv_out((char *)(utext + i), j - i, &tsiz);
3526       CB_LISTPUSHBUF(list, tmp, tsiz);
3527     }
3528     i = j - 2;
3529   }
3530   free(utext);
3531 }
3532 
3533 
3534 /* Convert the character encoding of a string. */
est_iconv(const char * ptr,int size,const char * icode,const char * ocode,int * sp,int * mp)3535 char *est_iconv(const char *ptr, int size,
3536                 const char *icode, const char *ocode, int *sp, int *mp){
3537   iconv_t ic;
3538   char *obuf, *wp, *rp;
3539   size_t isiz, osiz;
3540   int miss;
3541   assert(ptr && icode && ocode);
3542   if(size < 0) size = strlen(ptr);
3543   if(icode[0] == 'x' && icode[1] == '-'){
3544     if(!cbstricmp(icode, "x-sjis")){
3545       icode = "Shift_JIS";
3546     } else if(!cbstricmp(icode, "x-ujis")){
3547       icode = "EUC-JP";
3548     } else if(!cbstricmp(icode, "x-euc-jp")){
3549       icode = "EUC-JP";
3550     }
3551   } else if(icode[0] == 'w' || icode[0] == 'W'){
3552     if(!cbstricmp(icode, "windows-31j")){
3553       icode = "CP932";
3554     }
3555   }
3556   if(ocode[0] == 'x' && ocode[1] == '-'){
3557     if(!cbstricmp(ocode, "x-sjis")){
3558       ocode = "Shift_JIS";
3559     } else if(!cbstricmp(ocode, "x-ujis")){
3560       ocode = "EUC-JP";
3561     } else if(!cbstricmp(ocode, "x-euc-jp")){
3562       ocode = "EUC-JP";
3563     }
3564   } else if(ocode[0] == 'w' || ocode[0] == 'W'){
3565     if(!cbstricmp(ocode, "windows-31j")){
3566       ocode = "CP932";
3567     }
3568   }
3569   if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return NULL;
3570   isiz = size;
3571   osiz = isiz * 5;
3572   CB_MALLOC(obuf, osiz + 1);
3573   wp = obuf;
3574   rp = (char *)ptr;
3575   miss = 0;
3576   while(isiz > 0){
3577     if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == -1){
3578       if(errno == EILSEQ && (*rp == 0x5c || *rp == 0x7e)){
3579         *wp = *rp;
3580         wp++;
3581         rp++;
3582         isiz--;
3583       } else if(errno == EILSEQ || errno == EINVAL){
3584         rp++;
3585         isiz--;
3586         miss++;
3587       } else {
3588         break;
3589       }
3590     }
3591   }
3592   *wp = '\0';
3593   if(sp) *sp = wp - obuf;
3594   if(mp) *mp = miss;
3595   if(iconv_close(ic) == -1){
3596     free(obuf);
3597     return NULL;
3598   }
3599   return obuf;
3600 }
3601 
3602 
3603 /* Detect the encoding of a string automatically. */
est_enc_name(const char * ptr,int size,int plang)3604 const char *est_enc_name(const char *ptr, int size, int plang){
3605   const char *hypo;
3606   int i, lim, miss, ascii, cr;
3607   assert(ptr);
3608   if(size < 0) size = strlen(ptr);
3609   if(size > ESTICCHECKSIZ) size = ESTICCHECKSIZ;
3610   if(size >= 2 && (!memcmp(ptr, "\xfe\xff", 2) || !memcmp(ptr, "\xff\xfe", 2))) return "UTF-16";
3611   ascii = TRUE;
3612   cr = FALSE;
3613   lim = size - 1;
3614   for(i = 0; i < lim; i += 2){
3615     if(ptr[i] == 0x0) return "UTF-16BE";
3616     if(ptr[i+1] == 0x0) return "UTF-16LE";
3617     if(ptr[i] < 0x0 || ptr[i] == 0x1b){
3618       ascii = FALSE;
3619     } else if(ptr[i] == 0xd){
3620       cr = TRUE;
3621     }
3622   }
3623   if(ascii) return "US-ASCII";
3624   switch(plang){
3625   case ESTLANGEN:
3626     if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
3627     return "ISO-8859-1";
3628   case ESTLANGJA:
3629     lim = size - 3;
3630     for(i = 0; i < lim; i++){
3631       if(ptr[i] == 0x1b){
3632         i++;
3633         if(ptr[i] == '(' && strchr("BJHI", ptr[i+1])) return "ISO-2022-JP";
3634         if(ptr[i] == '$' && strchr("@B(", ptr[i+1])) return "ISO-2022-JP";
3635       }
3636     }
3637     if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
3638     hypo = NULL;
3639     if(cr){
3640       if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
3641       if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
3642       if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
3643       if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
3644     } else {
3645       if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
3646       if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
3647       if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
3648       if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
3649     }
3650     if((miss = est_enc_miss(ptr, size, "UTF-8", "UTF-16BE")) < 1) return "UTF-8";
3651     if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "UTF-8";
3652     if((miss = est_enc_miss(ptr, size, "CP932", "UTF-16BE")) < 1) return "CP932";
3653     if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "CP932";
3654     return hypo ? hypo : "ISO-8859-1";
3655   case ESTLANGZH:
3656     if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
3657     if(est_enc_miss(ptr, size, "EUC-CN", "UTF-16BE") < 1) return "EUC-CN";
3658     if(est_enc_miss(ptr, size, "BIG5", "UTF-16BE") < 1) return "BIG5";
3659     return "ISO-8859-1";
3660   case ESTLANGKO:
3661     if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
3662     if(est_enc_miss(ptr, size, "EUC-KR", "UTF-16BE") < 1) return "EUC-KR";
3663     return "ISO-8859-1";
3664   default:
3665     break;
3666   }
3667   return "ISO-8859-1";
3668 }
3669 
3670 
/* Convert a UTF-8 string into UTF-16BE.
   `ptr' specifies the pointer to a region of UTF-8.
   `size' specifies the size of the region.  It should not be negative.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   The return value is an allocated region of two-byte units followed by one null byte.  The
   caller should release it with `free'.
   Sequences of four or more bytes are replaced by `?'.  The conversion stops at a sequence
   which is cut off by the end of the region and at the bytes 0xfe and 0xff.  Continuation
   bytes are not validated. */
char *est_uconv_in(const char *ptr, int size, int *sp){
  const unsigned char *rp, *ep;
  char *rbuf, *wp;
  int len;
  assert(ptr && size >= 0 && sp);
  rp = (const unsigned char *)ptr;
  ep = rp + size;
  /* every sequence consumes at least one byte and yields exactly two bytes */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < ep){
    /* length of the sequence by its leading byte; each bound is the first value of the next
       class (0x80, 0xe0, 0xf0, 0xf8, 0xfc, 0xfe) so that 0x7f, 0xdf, 0xfb, and 0xfd are not
       pushed into the wrong class */
    if(*rp < 0x80){
      len = 1;
    } else if(*rp < 0xe0){
      len = 2;
    } else if(*rp < 0xf0){
      len = 3;
    } else if(*rp < 0xf8){
      len = 4;
    } else if(*rp < 0xfc){
      len = 5;
    } else if(*rp < 0xfe){
      len = 6;
    } else {
      break;
    }
    if(ep - rp < len) break;
    switch(len){
    case 1:
      *(wp++) = 0x00;
      *(wp++) = rp[0];
      break;
    case 2:
      *(wp++) = (rp[0] & 0x1f) >> 2;
      *(wp++) = (rp[0] << 6) | (rp[1] & 0x3f);
      break;
    case 3:
      *(wp++) = (rp[0] << 4) | ((rp[1] & 0x3f) >> 2);
      *(wp++) = (rp[1] << 6) | (rp[2] & 0x3f);
      break;
    default:
      /* not representable in one two-byte unit */
      *(wp++) = 0x00;
      *(wp++) = '?';
      break;
    }
    rp += len;
  }
  *wp = '\0';
  *sp = wp - rbuf;
  return rbuf;
}
3717 
3718 
/* Convert a UTF-16BE string into UTF-8.
   `ptr' specifies the pointer to a region of two-byte units.
   `size' specifies the size of the region.  It should not be negative.  An odd trailing byte
   is ignored.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  If
   it is `NULL', it is not used.
   The return value is an allocated string terminated by the null character.  The caller
   should release it with `free'. */
char *est_uconv_out(const char *ptr, int size, int *sp){
  const unsigned char *rp;
  char *rbuf, *wp;
  int c;
  assert(ptr && size >= 0);
  if(size % 2 != 0) size--;
  rp = (unsigned char *)ptr;
  /* one unit of two bytes yields at most three bytes */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < (unsigned char *)ptr + size){
    c = rp[0] * 0x100 + rp[1];
    if(c < 0x0080){
      *(wp++) = rp[1];
    } else if(c < 0x0800){
      /* the two-byte form carries 11 bits only, so it ends at 0x07ff; a larger code would
         spill into the prefix bits of the leading byte */
      *(wp++) = 0xc0 | (rp[0] << 2) | ((rp[1] >> 6) & 0x03);
      *(wp++) = 0x80 | (rp[1] & 0x3f);
    } else {
      *(wp++) = 0xe0 | ((rp[0] >> 4) & 0x0f);
      *(wp++) = 0x80 | ((rp[0] & 0x0f) << 2) | ((rp[1] >> 6) & 0x03);
      *(wp++) = 0x80 | (rp[1] & 0x3f);
    }
    rp += 2;
  }
  *wp = '\0';
  if(sp) *sp = wp - rbuf;
  return rbuf;
}
3747 
3748 
/* Compress a serial object with ZLIB.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region.  If it is negative, the size is assigned with
   `strlen(ptr)'.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   `mode' selects the parameters of `deflateInit2': -1 uses the window bits -15, 1 uses
   15 + 16, and the others use 15.
   The return value is an allocated region followed by a null byte, or `NULL' on failure.  In
   the mode -1, the reported size includes that null byte.
   If ZLIB is not enabled, a plain copy of the region is returned. */
char *est_deflate(const char *ptr, int size, int *sp, int mode){
#if ESTUSEZLIB
  z_stream strm;
  unsigned char chunk[ESTIOBUFSIZ];
  char *out;
  int level, wbits, mlevel, status, cap, used, got;
  if(size < 0) size = strlen(ptr);
  strm.zalloc = Z_NULL;
  strm.zfree = Z_NULL;
  strm.opaque = Z_NULL;
  switch(mode){
  case -1:
    level = 5;
    wbits = -15;
    mlevel = 7;
    break;
  case 1:
    level = 6;
    wbits = 15 + 16;
    mlevel = 9;
    break;
  default:
    level = 6;
    wbits = 15;
    mlevel = 8;
    break;
  }
  if(deflateInit2(&strm, level, Z_DEFLATED, wbits, mlevel, Z_DEFAULT_STRATEGY) != Z_OK)
    return NULL;
  cap = size + 16 < ESTIOBUFSIZ ? ESTIOBUFSIZ : size + 16;
  CB_MALLOC(out, cap);
  used = 0;
  strm.next_in = (unsigned char *)ptr;
  strm.avail_in = size;
  strm.next_out = chunk;
  strm.avail_out = ESTIOBUFSIZ;
  /* drain the chunk buffer into the result while the stream reports more output */
  for(;;){
    status = deflate(&strm, Z_FINISH);
    if(status != Z_OK) break;
    got = ESTIOBUFSIZ - strm.avail_out;
    if(used + got > cap){
      cap = cap * 2 + got;
      CB_REALLOC(out, cap);
    }
    memcpy(out + used, chunk, got);
    used += got;
    strm.next_out = chunk;
    strm.avail_out = ESTIOBUFSIZ;
  }
  if(status != Z_STREAM_END){
    free(out);
    deflateEnd(&strm);
    return NULL;
  }
  /* the last chunk, with room for the trailing null byte */
  got = ESTIOBUFSIZ - strm.avail_out;
  if(used + got + 1 > cap){
    cap = cap * 2 + got;
    CB_REALLOC(out, cap);
  }
  memcpy(out + used, chunk, got);
  used += got;
  out[used] = '\0';
  if(mode == -1) used++;
  *sp = used;
  deflateEnd(&strm);
  return out;
#else
  if(size < 0) size = strlen(ptr);
  *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3816 
3817 
/* Decompress a serial object compressed with ZLIB.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  If
   it is `NULL', it is not used.
   `mode' selects the window bits of `inflateInit2': -1 uses -15, 1 uses 15 + 16, and the
   others use 15.
   The return value is an allocated region followed by a null byte, or `NULL' on failure.
   If ZLIB is not enabled, a plain copy of the region is returned. */
char *est_inflate(const char *ptr, int size, int *sp, int mode){
#if ESTUSEZLIB
  z_stream strm;
  unsigned char chunk[ESTIOBUFSIZ];
  char *out;
  int wbits, status, cap, used, got;
  strm.zalloc = Z_NULL;
  strm.zfree = Z_NULL;
  strm.opaque = Z_NULL;
  switch(mode){
  case -1:
    wbits = -15;
    break;
  case 1:
    wbits = 15 + 16;
    break;
  default:
    wbits = 15;
    break;
  }
  if(inflateInit2(&strm, wbits) != Z_OK) return NULL;
  cap = size * 2 + 16;
  if(cap < ESTIOBUFSIZ) cap = ESTIOBUFSIZ;
  CB_MALLOC(out, cap);
  used = 0;
  strm.next_in = (unsigned char *)ptr;
  strm.avail_in = size;
  strm.next_out = chunk;
  strm.avail_out = ESTIOBUFSIZ;
  /* drain the chunk buffer into the result while the stream reports more output */
  for(;;){
    status = inflate(&strm, Z_NO_FLUSH);
    if(status != Z_OK) break;
    got = ESTIOBUFSIZ - strm.avail_out;
    if(used + got >= cap){
      cap = cap * 2 + got;
      CB_REALLOC(out, cap);
    }
    memcpy(out + used, chunk, got);
    used += got;
    strm.next_out = chunk;
    strm.avail_out = ESTIOBUFSIZ;
  }
  if(status != Z_STREAM_END){
    free(out);
    inflateEnd(&strm);
    return NULL;
  }
  /* the last chunk, with room for the trailing null byte */
  got = ESTIOBUFSIZ - strm.avail_out;
  if(used + got >= cap){
    cap = cap * 2 + got;
    CB_REALLOC(out, cap);
  }
  memcpy(out + used, chunk, got);
  used += got;
  out[used] = '\0';
  if(sp) *sp = used;
  inflateEnd(&strm);
  return out;
#else
  if(sp) *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3879 
3880 
/* Compress a serial object with LZO.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region.  If it is negative, the size is assigned with
   `strlen(ptr)'.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   The return value is an allocated region followed by a null byte, or `NULL' on failure.
   If LZO is not enabled, a plain copy of the region is returned. */
char *est_lzoencode(const char *ptr, int size, int *sp){
#if ESTUSELZO
  char work[LZO1X_1_MEM_COMPRESS];
  lzo_bytep dest;
  lzo_uint dlen;
  if(size < 0) size = strlen(ptr);
  CB_MALLOC(dest, size + size / 16 + 80);
  if(lzo1x_1_compress((lzo_bytep)ptr, size, dest, &dlen, work) == LZO_E_OK){
    dest[dlen] = '\0';
    *sp = dlen;
    return (char *)dest;
  }
  free(dest);
  return NULL;
#else
  if(size < 0) size = strlen(ptr);
  *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3902 
3903 
/* Decompress a serial object compressed with LZO.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  If
   it is `NULL', it is not used.
   The return value is an allocated region followed by a null byte, or `NULL' on failure.
   If LZO is not enabled, a plain copy of the region is returned. */
char *est_lzodecode(const char *ptr, int size, int *sp){
#if ESTUSELZO
  lzo_bytep dest;
  lzo_uint dlen;
  int ratio, status;
  /* guess the size of the output and double the ratio until the output fits */
  for(ratio = 4; ; ratio *= 2){
    dlen = (size + 256) * ratio + 3;
    CB_MALLOC(dest, dlen + 1);
    status = lzo1x_decompress_safe((lzo_bytep)(ptr), size, dest, &dlen, NULL);
    if(status == LZO_E_OK) break;
    free(dest);
    if(status != LZO_E_OUTPUT_OVERRUN) return NULL;
  }
  dest[dlen] = '\0';
  if(sp) *sp = dlen;
  return (char *)dest;
#else
  if(sp) *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3933 
3934 
/* Compress a serial object with BZIP2.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region.  If it is negative, the size is assigned with
   `strlen(ptr)'.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   The return value is an allocated region followed by a null byte, or `NULL' on failure.
   If BZIP2 is not enabled, a plain copy of the region is returned. */
char *est_bzencode(const char *ptr, int size, int *sp){
#if ESTUSEBZIP
  bz_stream strm;
  char chunk[ESTIOBUFSIZ], *out;
  int status, cap, used, got;
  if(size < 0) size = strlen(ptr);
  strm.bzalloc = NULL;
  strm.bzfree = NULL;
  strm.opaque = NULL;
  if(BZ2_bzCompressInit(&strm, 9, 0, 30) != BZ_OK) return NULL;
  cap = size + 16 < ESTIOBUFSIZ ? ESTIOBUFSIZ : size + 16;
  CB_MALLOC(out, cap);
  used = 0;
  strm.next_in = (char *)ptr;
  strm.avail_in = size;
  strm.next_out = chunk;
  strm.avail_out = ESTIOBUFSIZ;
  /* drain the chunk buffer into the result while the stream reports more output */
  for(;;){
    status = BZ2_bzCompress(&strm, BZ_FINISH);
    if(status != BZ_FINISH_OK) break;
    got = ESTIOBUFSIZ - strm.avail_out;
    if(used + got > cap){
      cap = cap * 2 + got;
      CB_REALLOC(out, cap);
    }
    memcpy(out + used, chunk, got);
    used += got;
    strm.next_out = chunk;
    strm.avail_out = ESTIOBUFSIZ;
  }
  if(status != BZ_STREAM_END){
    free(out);
    BZ2_bzCompressEnd(&strm);
    return NULL;
  }
  /* the last chunk, with room for the trailing null byte */
  got = ESTIOBUFSIZ - strm.avail_out;
  if(used + got + 1 > cap){
    cap = cap * 2 + got;
    CB_REALLOC(out, cap);
  }
  memcpy(out + used, chunk, got);
  used += got;
  out[used] = '\0';
  *sp = used;
  BZ2_bzCompressEnd(&strm);
  return out;
#else
  if(size < 0) size = strlen(ptr);
  *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3987 
3988 
/* Decompress a serial object compressed with BZIP2.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.  If
   it is `NULL', it is not used.
   The return value is an allocated region followed by a null byte, or `NULL' on failure,
   which includes input that ends before the end of the compressed stream.
   If BZIP2 is not enabled, a plain copy of the region is returned. */
char *est_bzdecode(const char *ptr, int size, int *sp){
#if ESTUSEBZIP
  bz_stream zs;
  char *buf, obuf[ESTIOBUFSIZ];
  int rv, asiz, bsiz, osiz;
  zs.bzalloc = NULL;
  zs.bzfree = NULL;
  zs.opaque = NULL;
  if(BZ2_bzDecompressInit(&zs, 0, 0) != BZ_OK) return NULL;
  asiz = size * 2 + 16;
  if(asiz < ESTIOBUFSIZ) asiz = ESTIOBUFSIZ;
  CB_MALLOC(buf, asiz);
  bsiz = 0;
  zs.next_in = (char *)ptr;
  zs.avail_in = size;
  zs.next_out = obuf;
  zs.avail_out = ESTIOBUFSIZ;
  while((rv = BZ2_bzDecompress(&zs)) == BZ_OK){
    osiz = ESTIOBUFSIZ - zs.avail_out;
    /* the library keeps returning `BZ_OK' when it has run out of input before the end of the
       stream; without any output and any remaining input no further call can make progress,
       so give up instead of looping forever on truncated data */
    if(osiz < 1 && zs.avail_in < 1){
      free(buf);
      BZ2_bzDecompressEnd(&zs);
      return NULL;
    }
    if(bsiz + osiz >= asiz){
      asiz = asiz * 2 + osiz;
      CB_REALLOC(buf, asiz);
    }
    memcpy(buf + bsiz, obuf, osiz);
    bsiz += osiz;
    zs.next_out = obuf;
    zs.avail_out = ESTIOBUFSIZ;
  }
  if(rv != BZ_STREAM_END){
    free(buf);
    BZ2_bzDecompressEnd(&zs);
    return NULL;
  }
  /* the last chunk, with room for the trailing null byte */
  osiz = ESTIOBUFSIZ - zs.avail_out;
  if(bsiz + osiz >= asiz){
    asiz = asiz * 2 + osiz;
    CB_REALLOC(buf, asiz);
  }
  memcpy(buf + bsiz, obuf, osiz);
  bsiz += osiz;
  buf[bsiz] = '\0';
  if(sp) *sp = bsiz;
  BZ2_bzDecompressEnd(&zs);
  return buf;
#else
  if(sp) *sp = size;
  return cbmemdup(ptr, size);
#endif
}
4039 
4040 
4041 /* Get the border string for draft data of documents. */
est_border_str(void)4042 const char *est_border_str(void){
4043   static int first = TRUE;
4044   static char border[ESTPATHBUFSIZ];
4045   int t, p;
4046   if(first){
4047     t = (int)(time(NULL) + est_random() * INT_MAX);
4048     p = (int)(getpid() + est_random() * INT_MAX);
4049     sprintf(border, "--------[%08X%08X]--------",
4050             dpouterhash((char *)&t, sizeof(int)), dpouterhash((char *)&p, sizeof(int)));
4051     first = FALSE;
4052   }
4053   return border;
4054 }
4055 
4056 
4057 /* Get the real random number. */
est_random(void)4058 double est_random(void){
4059   static int first = TRUE;
4060   int num;
4061   if(first && !est_random_ifp){
4062     if((est_random_ifp = fopen("/dev/urandom", "rb")) != NULL){
4063       atexit(est_random_fclose);
4064     } else {
4065       srand(getpid());
4066     }
4067     first = FALSE;
4068   }
4069   if(est_random_ifp){
4070     fread(&num, sizeof(int), 1, est_random_ifp);
4071     return (num & INT_MAX) / (double)INT_MAX;
4072   }
4073   return rand() / (double)RAND_MAX;
4074 }
4075 
4076 
/* Get the random number in normal distribution.
   The return value is built from two results of `est_random', shifted by 6.0, divided by
   12.0, and clamped into the range from 0.0 to 1.0. */
double est_random_nd(void){
  double radius, angle, value;
  radius = sqrt(-2 * log(1.0 - est_random()));
  angle = 3.1415926535 * 2 * est_random();
  value = (radius * cos(angle) + 6.0) / 12.0;
  if(value > 1.0) return 1.0;
  if(value < 0.0) return 0.0;
  return value;
}
4085 
4086 
4087 /* Get an MD5 hash string of a key string. */
est_make_crypt(const char * key)4088 char *est_make_crypt(const char *key){
4089   md5_state_t ms;
4090   char digest[32], str[64], *wp;
4091   int i;
4092   assert(key);
4093   md5_init(&ms);
4094   md5_append(&ms, (md5_byte_t *)key, strlen(key));
4095   md5_finish(&ms, (md5_byte_t *)digest);
4096   wp = str;
4097   for(i = 0; i < 16; i++){
4098     wp += sprintf(wp, "%02x", ((unsigned char *)digest)[i]);
4099   }
4100   return cbmemdup(str, -1);
4101 }
4102 
4103 
/* Check whether a key matches an MD5 hash string.
   `key' specifies a string to be checked.
   `hash' specifies a hash string to compare with the result of `est_make_crypt' for the key.
   The return value is true if both strings are equal, else it is false. */
int est_match_crypt(const char *key, const char *hash){
  char *expected;
  int matched;
  assert(key && hash);
  expected = est_make_crypt(key);
  matched = strcmp(expected, hash) == 0;
  free(expected);
  return matched;
}
4114 
4115 
/* Create a regular expression object.
   `str' specifies the pattern, compiled as an extended regular expression without substring
   reporting.  If it begins with "*I:", the prefix is skipped and matching ignores case.
   The return value is a `cbmemdup' copy of the compiled `regex_t', or `NULL' if the pattern
   cannot be compiled.  `est_regex_delete' releases it. */
void *est_regex_new(const char *str){
  regex_t compiled;
  const char *pattern;
  int flags;
  assert(str);
  pattern = str;
  flags = REG_EXTENDED | REG_NOSUB;
  if(strncmp(pattern, "*I:", 3) == 0){
    flags |= REG_ICASE;
    pattern += 3;
  }
  if(regcomp(&compiled, pattern, flags) != 0) return NULL;
  return cbmemdup((char *)&compiled, sizeof(regex_t));
}
4129 
4130 
4131 /* Delete a regular expression object. */
est_regex_delete(void * regex)4132 void est_regex_delete(void *regex){
4133   assert(regex);
4134   regfree(regex);
4135   free(regex);
4136 }
4137 
4138 
4139 /* Check whether a regular expression matches a string. */
est_regex_match(const void * regex,const char * str)4140 int est_regex_match(const void *regex, const char *str){
4141   assert(regex && str);
4142   return regexec(regex, str, 0, NULL, 0) == 0;
4143 }
4144 
4145 
4146 /* Check whether a regular expression matches a string. */
est_regex_match_str(const char * rstr,const char * tstr)4147 int est_regex_match_str(const char *rstr, const char *tstr){
4148   void *regex;
4149   int rv;
4150   assert(rstr && tstr);
4151   if(!(regex = est_regex_new(rstr))) return FALSE;
4152   rv = est_regex_match(regex, tstr);
4153   est_regex_delete(regex);
4154   return rv;
4155 }
4156 
4157 
4158 /* Replace each substring matching a regular expression string. */
est_regex_replace(const char * str,const char * bef,const char * aft)4159 char *est_regex_replace(const char *str, const char *bef, const char *aft){
4160   regex_t regex;
4161   regmatch_t subs[256];
4162   CBDATUM *datum;
4163   const char *sp, *rp;
4164   int options, first, num;
4165   assert(str && bef && aft);
4166   options = REG_EXTENDED;
4167   if(bef[0] == '*' && bef[1] == 'I' && bef[2] == ':'){
4168     options |= REG_ICASE;
4169     bef += 3;
4170   }
4171   if(bef[0] == '\0' || regcomp(&regex, bef, options) != 0) return cbmemdup(str, -1);
4172   if(regexec(&regex, str, ESTREGSUBMAX, subs, 0) != 0){
4173     regfree(&regex);
4174     return cbmemdup(str, -1);
4175   }
4176   sp = str;
4177   CB_DATUMOPEN(datum);
4178   first = TRUE;
4179   while(sp[0] != '\0' && regexec(&regex, sp, 10, subs, first ? 0 : REG_NOTBOL) == 0){
4180     first = FALSE;
4181     if(subs[0].rm_so == -1) break;
4182     CB_DATUMCAT(datum, sp, subs[0].rm_so);
4183     for(rp = aft; *rp != '\0'; rp++){
4184       if(*rp == '\\'){
4185         if(rp[1] >= '0' && rp[1] <= '9'){
4186           num = rp[1] - '0';
4187           if(subs[num].rm_so != -1 && subs[num].rm_eo != -1)
4188             CB_DATUMCAT(datum, sp + subs[num].rm_so, subs[num].rm_eo - subs[num].rm_so);
4189           ++rp;
4190         } else if(rp[1] != '\0'){
4191           CB_DATUMCAT(datum, ++rp, 1);
4192         }
4193       } else if(*rp == '&'){
4194         CB_DATUMCAT(datum, sp + subs[0].rm_so, subs[0].rm_eo - subs[0].rm_so);
4195       } else {
4196         CB_DATUMCAT(datum, rp, 1);
4197       }
4198     }
4199     sp += subs[0].rm_eo;
4200     if(subs[0].rm_eo < 1) break;
4201   }
4202   CB_DATUMCAT(datum, sp, strlen(sp));
4203   regfree(&regex);
4204   return cbdatumtomalloc(datum, NULL);
4205 }
4206 
4207 
4208 /* Duplicate a document object. */
est_doc_dup(ESTDOC * doc)4209 ESTDOC *est_doc_dup(ESTDOC *doc){
4210   ESTDOC *ndoc;
4211   assert(doc);
4212   CB_MALLOC(ndoc, sizeof(ESTDOC));
4213   ndoc->id = doc->id;
4214   ndoc->attrs = doc->attrs ? cbmapdup(doc->attrs) : NULL;
4215   ndoc->dtexts = doc->dtexts ? cblistdup(doc->dtexts) : NULL;
4216   ndoc->kwords = doc->kwords ? cbmapdup(doc->kwords) : NULL;
4217   return ndoc;
4218 }
4219 
4220 
4221 /* Set the ID number of a document object. */
est_doc_set_id(ESTDOC * doc,int id)4222 void est_doc_set_id(ESTDOC *doc, int id){
4223   assert(doc);
4224   doc->id = id;
4225 }
4226 
4227 
4228 /* Get the hidden texts of a document object. */
est_doc_hidden_texts(ESTDOC * doc)4229 const char *est_doc_hidden_texts(ESTDOC *doc){
4230   const char *rv;
4231   assert(doc);
4232   rv = doc->attrs ? cbmapget(doc->attrs, "", 0, NULL) : NULL;
4233   return rv ? rv : "";
4234 }
4235 
4236 
4237 /* Reduce the texts to fit to the specified size. */
est_doc_slim(ESTDOC * doc,int len)4238 void est_doc_slim(ESTDOC *doc, int len){
4239   const char *vbuf;
4240   unsigned char *tbuf;
4241   int i, vsiz, tsiz;
4242   assert(doc && len >= 0);
4243   if(!doc->dtexts) return;
4244   if(doc->attrs && cbmapget(doc->attrs, "", 0, &vsiz)) len -= vsiz;
4245   for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
4246     vbuf = CB_LISTVAL2(doc->dtexts, i, vsiz);
4247     len -= vsiz;
4248     if(len < 0){
4249       tbuf = (unsigned char *)cbmemdup(vbuf, vsiz);
4250       tsiz = vsiz > -len ? vsiz + len : 0;
4251       if(tsiz > 0){
4252         while(tsiz < vsiz){
4253           if(tbuf[tsiz] <= ' ' || (tbuf[tsiz] & 0xf0) == 0xe0) break;
4254           tsiz++;
4255         }
4256       }
4257       while(CB_LISTNUM(doc->dtexts) > i){
4258         CB_LISTDROP(doc->dtexts);
4259       }
4260       CB_LISTPUSHBUF(doc->dtexts, (char *)tbuf, tsiz);
4261       break;
4262     }
4263   }
4264 }
4265 
4266 
4267 /* Check whether a docuemnt object is empty. */
est_doc_is_empty(ESTDOC * doc)4268 int est_doc_is_empty(ESTDOC *doc){
4269   assert(doc);
4270   if((!doc->dtexts || CB_LISTNUM(doc->dtexts) < 1) &&
4271      (!doc->attrs || !cbmapget(doc->attrs, "", 0, NULL))) return TRUE;
4272   return FALSE;
4273 }
4274 
4275 
4276 /* Duplicate a condition object. */
est_cond_dup(ESTCOND * cond)4277 ESTCOND *est_cond_dup(ESTCOND *cond){
4278   ESTCOND *ncond;
4279   assert(cond);
4280   CB_MALLOC(ncond, sizeof(ESTCOND));
4281   ncond->phrase = cond->phrase ? cbmemdup(cond->phrase, -1) : NULL;
4282   ncond->gstep = cond->gstep;
4283   ncond->tfidf = cond->tfidf;
4284   ncond->pmode = cond->pmode;
4285   ncond->cbxpn = cond->cbxpn;
4286   ncond->attrs = cond->attrs ? cblistdup(cond->attrs) : NULL;
4287   ncond->order = cond->order ? cbmemdup(cond->order, -1) : NULL;
4288   ncond->max = cond->max;
4289   ncond->skip = cond->skip;
4290   ncond->auxmin = cond->auxmin;
4291   ncond->auxwords = cond->auxwords ? cbmapdup(cond->auxwords) : NULL;
4292   ncond->scfb = cond->scfb;
4293   ncond->scores = cond->scores ?
4294     (int *)cbmemdup((char *)cond->scores, cond->snum * sizeof(int)) : NULL;
4295   ncond->snum = cond->snum;
4296   ncond->nscores = cond->nscores;
4297   ncond->nsnum = cond->nsnum;
4298   ncond->opts = cond->opts;
4299   ncond->ecllim = cond->ecllim;
4300   ncond->shadows = cond->shadows ? cbmapdup(cond->shadows) : NULL;
4301   ncond->distinct = cond->distinct ? cbmemdup(cond->distinct, -1) : NULL;
4302   ncond->mask = cond->mask;
4303   return ncond;
4304 }
4305 
4306 
4307 /* Get the phrase of a condition object. */
est_cond_phrase(ESTCOND * cond)4308 const char *est_cond_phrase(ESTCOND *cond){
4309   assert(cond);
4310   return cond->phrase;
4311 }
4312 
4313 
4314 /* Get a list object of attribute expressions of a condition object. */
est_cond_attrs(ESTCOND * cond)4315 const CBLIST *est_cond_attrs(ESTCOND *cond){
4316   assert(cond);
4317   return cond->attrs;
4318 }
4319 
4320 
4321 /* Get the order expression of a condition object. */
est_cond_order(ESTCOND * cond)4322 const char *est_cond_order(ESTCOND *cond){
4323   assert(cond);
4324   return cond->order;
4325 }
4326 
4327 
4328 /* Get the maximum number of retrieval of a condition object. */
est_cond_max(ESTCOND * cond)4329 int est_cond_max(ESTCOND *cond){
4330   assert(cond);
4331   return cond->max;
4332 }
4333 
4334 
4335 /* Get the number of skipped documents of a condition object. */
est_cond_skip(ESTCOND * cond)4336 int est_cond_skip(ESTCOND *cond){
4337   assert(cond);
4338   return cond->skip;
4339 }
4340 
4341 
4342 /* Get the options of a condition object. */
est_cond_options(ESTCOND * cond)4343 int est_cond_options(ESTCOND *cond){
4344   assert(cond);
4345   return cond->opts;
4346 }
4347 
4348 
4349 /* Get permission to adopt result of the auxiliary index. */
est_cond_auxiliary(ESTCOND * cond)4350 int est_cond_auxiliary(ESTCOND *cond){
4351   assert(cond);
4352   return cond->auxmin;
4353 }
4354 
4355 
4356 /* Get the attribute distinction filter. */
est_cond_distinct(ESTCOND * cond)4357 const char *est_cond_distinct(ESTCOND *cond){
4358   assert(cond);
4359   return cond->distinct;
4360 }
4361 
4362 
4363 /* Get the mask of targets of meta search. */
est_cond_mask(ESTCOND * cond)4364 int est_cond_mask(ESTCOND *cond){
4365   assert(cond);
4366   return cond->mask;
4367 }
4368 
4369 
4370 /* Get the score of a document corresponding to a condition object. */
est_cond_score(ESTCOND * cond,int index)4371 int est_cond_score(ESTCOND *cond, int index){
4372   assert(cond);
4373   if(!cond->scores || index < 0 || index >= cond->snum) return -1;
4374   return cond->scores[index];
4375 }
4376 
4377 
4378 /* Get the score array of corresponding documents of a condition object. */
est_cond_scores(ESTCOND * cond,int * nump)4379 const int *est_cond_scores(ESTCOND *cond, int *nump){
4380   assert(cond && nump);
4381   *nump = cond->snum;
4382   return cond->scores;
4383 }
4384 
4385 
4386 /* Set the narrowing scores of a condition object. */
est_cond_set_narrowing_scores(ESTCOND * cond,const int * scores,int num)4387 void est_cond_set_narrowing_scores(ESTCOND *cond, const int *scores, int num){
4388   assert(cond && scores && num >= 0);
4389   cond->nscores = scores;
4390   cond->nsnum = num;
4391 }
4392 
4393 
4394 /* Check whether a condition object has used the auxiliary index. */
est_cond_auxiliary_word(ESTCOND * cond,const char * word)4395 int est_cond_auxiliary_word(ESTCOND *cond, const char *word){
4396   assert(cond && word);
4397   if(!cond->auxwords) return FALSE;
4398   if(word[0] != '\0') return cbmapget(cond->auxwords, word, -1, NULL) != NULL;
4399   return cbmaprnum(cond->auxwords) > 0;
4400 }
4401 
4402 
4403 /* Get an array of ID numbers of eclipsed docuemnts of a document in a condition object. */
est_cond_shadows(ESTCOND * cond,int id,int * np)4404 const int *est_cond_shadows(ESTCOND *cond, int id, int *np){
4405   const char *vbuf;
4406   int vsiz;
4407   assert(cond && id > 0 && np);
4408   if(!cond->shadows || !(vbuf = cbmapget(cond->shadows, (char *)&id, sizeof(int), &vsiz))){
4409     *np = 0;
4410     return (int *)"";
4411   }
4412   *np = vsiz / sizeof(int);
4413   return (int *)vbuf;
4414 }
4415 
4416 
4417 /* Set the callback function for query expansion. */
est_cond_set_expander(ESTCOND * cond,void (* func)(const char *,CBLIST *))4418 void est_cond_set_expander(ESTCOND *cond, void (*func)(const char *, CBLIST *)){
4419   assert(cond && func);
4420   cond->cbxpn = func;
4421 }
4422 
4423 
4424 /* Set the error code of a database. */
est_db_set_ecode(ESTDB * db,int ecode)4425 void est_db_set_ecode(ESTDB *db, int ecode){
4426   assert(db);
4427   est_set_ecode(&(db->ecode), ecode, __LINE__);
4428 }
4429 
4430 
4431 /* Check whether an option is set. */
est_db_check_option(ESTDB * db,int option)4432 int est_db_check_option(ESTDB *db, int option){
4433   assert(db);
4434   switch(option){
4435   case ESTDBREADER:
4436     return !dpwritable(db->metadb);
4437   case ESTDBWRITER:
4438     return dpwritable(db->metadb);
4439   case ESTDBCREAT:
4440     return -1;
4441   case ESTDBTRUNC:
4442     return -1;
4443   case ESTDBNOLCK:
4444     return -1;
4445   case ESTDBLCKNB:
4446     return -1;
4447   case ESTDBPERFNG:
4448     return db->amode == ESTDFPERFNG;
4449   case ESTDBCHRCAT:
4450     return db->amode == ESTDFCHRCAT;
4451   case ESTDBSMALL:
4452     return -1;
4453   case ESTDBLARGE:
4454     return -1;
4455   case ESTDBHUGE:
4456     return -1;
4457   case ESTDBHUGE2:
4458     return -1;
4459   case ESTDBHUGE3:
4460     return -1;
4461   case ESTDBSCVOID:
4462     return db->smode == ESTDFSCVOID;
4463   case ESTDBSCINT:
4464     return db->smode == ESTDFSCINT;
4465   case ESTDBSCASIS:
4466     return db->smode == ESTDFSCASIS;
4467   default:
4468     break;
4469   }
4470   return -1;
4471 }
4472 
4473 
4474 /* Get the inode number of a database. */
est_db_inode(ESTDB * db)4475 int est_db_inode(ESTDB *db){
4476   assert(db);
4477   return db->inode;
4478 }
4479 
4480 
4481 /* Set the entity data of a document in a database. */
est_db_set_doc_entity(ESTDB * db,int id,const char * ptr,int size)4482 int est_db_set_doc_entity(ESTDB *db, int id, const char *ptr, int size){
4483   int err;
4484   assert(db && id > 0);
4485   if(!dpwritable(db->metadb)){
4486     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
4487     return FALSE;
4488   }
4489   err = FALSE;
4490   if(ptr){
4491     if(!crputlob(db->textdb, (char *)&id, sizeof(int), ptr, size, CR_DOVER)){
4492       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4493       err = TRUE;
4494     }
4495   } else {
4496     if(!croutlob(db->textdb, (char *)&id, sizeof(int))){
4497       est_set_ecode(&(db->ecode), dpecode == DP_ENOITEM ? ESTENOITEM : ESTEDB, __LINE__);
4498       err = TRUE;
4499     }
4500   }
4501   return err ? FALSE : TRUE;
4502 }
4503 
4504 
4505 /* Set the maximum number of expansion of wild cards. */
est_db_set_wildmax(ESTDB * db,int num)4506 void est_db_set_wildmax(ESTDB *db, int num){
4507   assert(db && num >= 0);
4508   db->wildmax = num;
4509 }
4510 
4511 
4512 /* Get the entity data of a document in a database. */
est_db_get_doc_entity(ESTDB * db,int id,int * sp)4513 char *est_db_get_doc_entity(ESTDB *db, int id, int *sp){
4514   char *ptr;
4515   assert(db && id > 0 && sp);
4516   if(!(ptr = crgetlob(db->textdb, (char *)&id, sizeof(int), 0, -1, sp))){
4517     est_set_ecode(&(db->ecode), dpecode == DP_ENOITEM ? ESTENOITEM : ESTEDB, __LINE__);
4518     return NULL;
4519   }
4520   return ptr;
4521 }
4522 
4523 
4524 /* Add a piece of meta data to a database. */
est_db_add_meta(ESTDB * db,const char * name,const char * value)4525 void est_db_add_meta(ESTDB *db, const char *name, const char *value){
4526   assert(db && name);
4527   if(!dpwritable(db->metadb)){
4528     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
4529     return;
4530   }
4531   if(!db->metacc) est_db_prepare_meta(db);
4532   if(value){
4533     cbmapput(db->metacc, name, -1, value, -1, TRUE);
4534   } else {
4535     cbmapout(db->metacc, name, -1);
4536   }
4537 }
4538 
4539 
4540 /* Get a list of names of meta data of a database. */
est_db_meta_names(ESTDB * db)4541 CBLIST *est_db_meta_names(ESTDB *db){
4542   assert(db);
4543   if(!db->metacc) est_db_prepare_meta(db);
4544   return cbmapkeys(db->metacc);
4545 }
4546 
4547 
4548 /* Get the value of a piece of meta data of a database. */
est_db_meta(ESTDB * db,const char * name)4549 char *est_db_meta(ESTDB *db, const char *name){
4550   const char *vbuf;
4551   int vsiz;
4552   assert(db && name);
4553   if(!db->metacc) est_db_prepare_meta(db);
4554   if(!(vbuf = cbmapget(db->metacc, name, -1, &vsiz))) return NULL;
4555   return cbmemdup(vbuf, vsiz);
4556 }
4557 
4558 
4559 /* Extract keywords of a document object. */
est_db_etch_doc(ESTDB * db,ESTDOC * doc,int max)4560 CBMAP *est_db_etch_doc(ESTDB *db, ESTDOC *doc, int max){
4561   ESTKEYSC *scores;
4562   CBMAP *keys, *umap;
4563   CBLIST *words;
4564   const char *text, *word, *vbuf;
4565   const unsigned char *uword;
4566   char numbuf[ESTNUMBUFSIZ];
4567   int i, wsiz, num, smax, snum, vsiz;
4568   assert(doc && max >= 0);
4569   if(!doc->dtexts) return cbmapopenex(1);
4570   keys = cbmapopenex(max * 2 + 1);
4571   CB_LISTOPEN(words);
4572   for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
4573     if(i < 0){
4574       if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
4575     } else {
4576       text = CB_LISTVAL(doc->dtexts, i);
4577     }
4578     if(db){
4579       switch(db->amode){
4580       case ESTDFPERFNG:
4581         est_break_text_perfng(text, words, FALSE, FALSE);
4582         break;
4583       case ESTDFCHRCAT:
4584         est_break_text_chrcat(text, words, FALSE);
4585         break;
4586       default:
4587         est_break_text(text, words, FALSE, FALSE);
4588         break;
4589       }
4590     } else {
4591       est_break_text(text, words, FALSE, FALSE);
4592     }
4593   }
4594   umap = cbmapopenex(CB_LISTNUM(words) + 1);
4595   for(i = 0; i < CB_LISTNUM(words); i++){
4596     word = CB_LISTVAL2(words, i, wsiz);
4597     if(wsiz > ESTWORDMAXLEN) continue;
4598     num = (vbuf = cbmapget(umap, word, wsiz, NULL)) ? *(int *)vbuf + 1 : 1;
4599     cbmapput(umap, word, wsiz, (char *)&num, sizeof(int), TRUE);
4600   }
4601   CB_MALLOC(scores, cbmaprnum(umap) * sizeof(ESTKEYSC) + 1);
4602   snum = 0;
4603   cbmapiterinit(umap);
4604   while((uword = (unsigned char *)cbmapiternext(umap, &wsiz)) != NULL){
4605     scores[snum].word = (char *)uword;
4606     scores[snum].wsiz = wsiz;
4607     scores[snum].pt = (vbuf = cbmapiterval((char *)uword, NULL)) ? *(int *)vbuf : 0;
4608     if(uword[0] >= 0xe3){
4609       if(wsiz <= 3){
4610         scores[snum].pt /= 2;
4611         if((uword[0] == 0xe3 && (uword[1] == 0x80 || uword[1] == 0x81 ||
4612                                  (uword[1] == 0x82 && uword[2] <= 0x9f))) ||
4613            (uword[0] == 0xef && uword[1] >= 0xbc)) scores[snum].pt /= 2;
4614       } else {
4615         if((uword[0] == 0xe3 && (uword[1] == 0x80 || uword[1] == 0x81 ||
4616                                  (uword[1] == 0x82 && uword[2] <= 0x9f))) ||
4617            (uword[0] == 0xef && uword[1] >= 0xbc)) scores[snum].pt /= 2;
4618         if((uword[3] == 0xe3 && (uword[4] == 0x80 || uword[4] == 0x81 ||
4619                                  (uword[4] == 0x82 && uword[5] <= 0x9f))) ||
4620            (uword[3] == 0xef && uword[4] >= 0xbc)) scores[snum].pt /= 2;
4621       }
4622     } else if((uword[0] > '\0' && uword[0] <= '/') || (uword[0] >= ':' && uword[0] <= '@') ||
4623               (uword[0] >= '[' && uword[0] <= '`') || (uword[0] >= '{' && uword[0] <= '~')){
4624       scores[snum].pt /= 25;
4625       if(wsiz <= 1) scores[snum].pt /= 2;
4626     } else {
4627       switch(wsiz){
4628       case 1:
4629         scores[snum].pt /= 9;
4630         break;
4631       case 2:
4632         scores[snum].pt /= 5;
4633         break;
4634       case 3:
4635         scores[snum].pt /= 3;
4636         break;
4637       case 4:
4638         scores[snum].pt /= 2;
4639         break;
4640       case 5:
4641         scores[snum].pt /= 1.5;
4642         break;
4643       case 6:
4644         scores[snum].pt /= 1.25;
4645         break;
4646       }
4647     }
4648     snum++;
4649   }
4650   qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
4651   smax = max * (db ? ESTKEYSCALW : 1) + 1;
4652   snum = snum > smax ? smax : snum;
4653   if(db){
4654     for(i = 0; i < snum; i++){
4655       if((vbuf = cbmapget(db->keycc, scores[i].word, scores[i].wsiz, NULL)) != NULL){
4656         cbmapmove(db->keycc, scores[i].word, scores[i].wsiz, FALSE);
4657         vsiz = *(int *)vbuf;
4658       } else {
4659         if(db->dfdb){
4660           if((vsiz = dpgetwb(db->dfdb, scores[i].word, scores[i].wsiz,
4661                              0, ESTNUMBUFSIZ - 1, numbuf)) > 0){
4662             numbuf[vsiz] = '\0';
4663             vsiz = atoi(numbuf);
4664           } else {
4665             vsiz = 0;
4666           }
4667         } else {
4668           vsiz = est_idx_vsiz(db->idxdb, scores[i].word, scores[i].wsiz);
4669           if(cbmapget(db->idxcc, scores[i].word, scores[i].wsiz, &num)) vsiz += num;
4670         }
4671         cbmapput(db->keycc, scores[i].word, scores[i].wsiz, (char *)&vsiz, sizeof(int), FALSE);
4672       }
4673       scores[i].pt *= 100000.0 / pow(vsiz + 64, 0.6);
4674     }
4675     if(db->kcmnum >= 0 && cbmaprnum(db->keycc) > db->kcmnum){
4676       num = db->kcmnum * 0.1 + 1;
4677       cbmapiterinit(db->keycc);
4678       for(i = 0; i < num && (word = cbmapiternext(db->keycc, &wsiz)) != NULL; i++){
4679         cbmapout(db->keycc, word, wsiz);
4680       }
4681     }
4682     qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
4683   }
4684   for(i = 0; i < snum && i < max; i++){
4685     vsiz = sprintf(numbuf, "%d", scores[i].pt > 0 ? scores[i].pt : 1);
4686     cbmapput(keys, scores[i].word, scores[i].wsiz, numbuf, vsiz, FALSE);
4687   }
4688   free(scores);
4689   cbmapclose(umap);
4690   CB_LISTCLOSE(words);
4691   return keys;
4692 }
4693 
4694 
4695 /* Retrieve a map object of keywords. */
est_db_put_keywords(ESTDB * db,int id,CBMAP * kwords,double weight)4696 int est_db_put_keywords(ESTDB *db, int id, CBMAP *kwords, double weight){
4697   const char *kbuf;
4698   char *mbuf;
4699   int err, ksiz, pair[2], msiz;
4700   assert(db && id > 0 && kwords && weight >= 0.0);
4701   if(!dpwritable(db->metadb)){
4702     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
4703     return FALSE;
4704   }
4705   if(crvsiz(db->attrdb, (char *)&id, sizeof(int)) == -1){
4706     est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4707     return FALSE;
4708   }
4709   err = FALSE;
4710   if(!est_db_out_keywords(db, id) && db->ecode != ESTENOITEM) err = TRUE;
4711   pair[0] = id;
4712   cbmapiterinit(kwords);
4713   while((kbuf = cbmapiternext(kwords, &ksiz)) != NULL){
4714     if(ksiz < 1 || (kbuf[0] >= '\0' && kbuf[0] <= ' ')) continue;
4715     pair[1] = (int)(atoi(cbmapiterval(kbuf, NULL)) * weight);
4716     cbmapputcat(db->auxcc, kbuf, ksiz, (char *)pair, sizeof(pair));
4717     db->icsiz += sizeof(pair);
4718   }
4719   mbuf = cbmapdump(kwords, &msiz);
4720   if(!est_crput(db->kwddb, db->zmode, id, mbuf, msiz, CR_DOVER)){
4721     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4722     db->fatal = TRUE;
4723     err = TRUE;
4724   }
4725   free(mbuf);
4726   return err ? FALSE : TRUE;
4727 }
4728 
4729 
4730 /* Remove keywords of a document. */
est_db_out_keywords(ESTDB * db,int id)4731 int est_db_out_keywords(ESTDB *db, int id){
4732   CBMAP *kwords;
4733   const char *word;
4734   char wbuf[ESTWORDMAXLEN+3], *tbuf;
4735   int wsiz;
4736   assert(db && id > 0);
4737   if(!dpwritable(db->metadb)){
4738     est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
4739     return FALSE;
4740   }
4741   if(id >= ESTPDOCIDMIN){
4742     est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
4743     return FALSE;
4744   }
4745   if((kwords = est_db_get_keywords(db, id)) != NULL){
4746     cbmapiterinit(kwords);
4747     while((word = cbmapiternext(kwords, &wsiz)) != NULL){
4748       if(wsiz > ESTWORDMAXLEN){
4749         tbuf = cbsprintf(" %s", word);
4750         cbmapput(db->outcc, tbuf, wsiz + 1, "", 0, FALSE);
4751         free(tbuf);
4752       } else {
4753         sprintf(wbuf, " %s", word);
4754         cbmapput(db->outcc, wbuf, wsiz + 1, "", 0, FALSE);
4755       }
4756     }
4757     cbmapclose(kwords);
4758   }
4759   if(!est_crout(db->kwddb, id)){
4760     if(dpecode == DP_ENOITEM){
4761       est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4762     } else {
4763       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4764       db->fatal = TRUE;
4765     }
4766     return FALSE;
4767   }
4768   cbmapout(db->veccc, (char *)&id, sizeof(int));
4769   return TRUE;
4770 }
4771 
4772 
4773 /* Retrieve a map object of keywords. */
est_db_get_keywords(ESTDB * db,int id)4774 CBMAP *est_db_get_keywords(ESTDB *db, int id){
4775   CBMAP *kwords;
4776   const char *cbuf;
4777   char *mbuf;
4778   int i, csiz, msiz, num;
4779   assert(db && id > 0);
4780   if((cbuf = cbmapget(db->veccc, (char *)&id, sizeof(int), &csiz)) != NULL){
4781     cbmapmove(db->veccc, (char *)&id, sizeof(int), FALSE);
4782     return cbmapload(cbuf, csiz);
4783   }
4784   if(!(mbuf = est_crget(db->kwddb, db->zmode, id, &msiz))){
4785     est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4786     return NULL;
4787   }
4788   kwords = cbmapload(mbuf, msiz);
4789   if(db->vcmnum > 0) cbmapput(db->veccc, (char *)&id, sizeof(int), mbuf, msiz, TRUE);
4790   free(mbuf);
4791   if(cbmaprnum(db->veccc) > db->vcmnum){
4792     num = cbmaprnum(db->veccc) * 0.1 + 1;
4793     cbmapiterinit(db->veccc);
4794     for(i = 0; i < num && (cbuf = cbmapiternext(db->veccc, NULL)) != NULL; i++){
4795       cbmapout(db->veccc, cbuf, sizeof(int));
4796     }
4797   }
4798   return kwords;
4799 }
4800 
4801 
4802 /* Mesure the total size of each inner records of a stored document. */
est_db_measure_doc(ESTDB * db,int id,int parts)4803 int est_db_measure_doc(ESTDB *db, int id, int parts){
4804   int sum, num;
4805   assert(db && id > 0);
4806   sum = 0;
4807   if((parts & ESTMDATTR) && (num = crvsiz(db->attrdb, (char *)&id, sizeof(int))) > 0) sum += num;
4808   if((parts & ESTMDTEXT) && (num = crvsiz(db->textdb, (char *)&id, sizeof(int))) > 0) sum += num;
4809   if((parts & ESTMDKWD) && (num = crvsiz(db->kwddb, (char *)&id, sizeof(int))) > 0) sum += num;
4810   return sum;
4811 }
4812 
4813 
4814 /* Initialize the iterator of a database. */
est_db_iter_init(ESTDB * db,const char * prev)4815 int est_db_iter_init(ESTDB *db, const char *prev){
4816   char *vbuf;
4817   assert(db);
4818   if(prev){
4819     if(!vlcurjump(db->listdb, prev, -1, VL_JFORWARD)) return dpecode == DP_ENOITEM;
4820     if((vbuf = vlcurkey(db->listdb, NULL)) != NULL){
4821       if(strcmp(prev, vbuf) >= 0) vlcurnext(db->listdb);
4822       free(vbuf);
4823     }
4824     return TRUE;
4825   }
4826   return vlcurfirst(db->listdb) || dpecode == DP_ENOITEM;
4827 }
4828 
4829 
4830 /* Get the next ID of the iterator of a database. */
est_db_iter_next(ESTDB * db)4831 int est_db_iter_next(ESTDB *db){
4832   char *vbuf;
4833   int id;
4834   assert(db);
4835   if(!(vbuf = vlcurval(db->listdb, NULL))){
4836     if(dpecode == DP_ENOITEM){
4837       est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4838       return 0;
4839     } else {
4840       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4841       db->fatal = TRUE;
4842       return -1;
4843     }
4844   }
4845   id = atoi(vbuf);
4846   free(vbuf);
4847   vlcurnext(db->listdb);
4848   return id;
4849 }
4850 
4851 
4852 /* Initialize the word iterator of a database. */
est_db_word_iter_init(ESTDB * db)4853 int est_db_word_iter_init(ESTDB *db){
4854   assert(db);
4855   if(!vlcurfirst(db->fwmdb) && dpecode != DP_ENOITEM){
4856     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4857     db->fatal = TRUE;
4858     return FALSE;
4859   }
4860   return TRUE;
4861 }
4862 
4863 
4864 /* Get the next word of the word iterator of a database. */
est_db_word_iter_next(ESTDB * db)4865 char *est_db_word_iter_next(ESTDB *db){
4866   char *word;
4867   assert(db);
4868   if(!(word = vlcurkey(db->fwmdb, NULL))){
4869     if(dpecode == DP_ENOITEM){
4870       est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4871     } else {
4872       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4873       db->fatal = TRUE;
4874     }
4875     return NULL;
4876   }
4877   vlcurnext(db->fwmdb);
4878   return word;
4879 }
4880 
4881 
4882 /* Get the size of the record of a word. */
est_db_word_rec_size(ESTDB * db,const char * word)4883 int est_db_word_rec_size(ESTDB *db, const char *word){
4884   int num;
4885   assert(db && word);
4886   if(!cbmapget(db->idxcc, word, -1, &num)) num = 0;
4887   return est_idx_vsiz(db->idxdb, word, strlen(word)) + num;
4888 }
4889 
4890 
4891 /* Get the number of unique keywords in a database. */
est_db_keyword_num(ESTDB * db)4892 int est_db_keyword_num(ESTDB *db){
4893   int wnum;
4894   assert(db);
4895   wnum = vlrnum(db->xfmdb);
4896   return wnum > 0 ? wnum : 0;
4897 }
4898 
4899 
4900 /* Initialize the keyword iterator of a database. */
est_db_keyword_iter_init(ESTDB * db)4901 int est_db_keyword_iter_init(ESTDB *db){
4902   assert(db);
4903   if(!vlcurfirst(db->xfmdb) && dpecode != DP_ENOITEM){
4904     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4905     db->fatal = TRUE;
4906     return FALSE;
4907   }
4908   return TRUE;
4909 }
4910 
4911 
4912 /* Get the next keyword of the word iterator of a database. */
est_db_keyword_iter_next(ESTDB * db)4913 char *est_db_keyword_iter_next(ESTDB *db){
4914   char *word;
4915   assert(db);
4916   if(!(word = vlcurkey(db->xfmdb, NULL))){
4917     if(dpecode == DP_ENOITEM){
4918       est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4919     } else {
4920       est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4921       db->fatal = TRUE;
4922     }
4923     return NULL;
4924   }
4925   vlcurnext(db->xfmdb);
4926   return word;
4927 }
4928 
4929 
4930 /* Get the size of the record of a keyword. */
est_db_keyword_rec_size(ESTDB * db,const char * word)4931 int est_db_keyword_rec_size(ESTDB *db, const char *word){
4932   const char *kbuf;
4933   assert(db && word);
4934   return (kbuf = vlgetcache(db->xfmdb, word, -1, NULL)) != NULL ? atoi(kbuf) : 0;
4935 }
4936 
4937 
4938 /* Search documents corresponding a keyword for a database. */
est_db_keyword_search(ESTDB * db,const char * word,int * nump)4939 int *est_db_keyword_search(ESTDB *db, const char *word, int *nump){
4940   int i, *res, rnum;
4941   assert(db && word && nump);
4942   if(!(res = (int *)vlget(db->auxdb, word, -1, &rnum))){
4943     *nump = 0;
4944     CB_MALLOC(res, 1);
4945     return res;
4946   }
4947   rnum /= sizeof(int) * 2;
4948   for(i = 0; i < rnum; i++){
4949     res[i] = res[i*2];
4950   }
4951   *nump = rnum;
4952   return res;
4953 }
4954 
4955 
4956 /* Get the number of records in the cache memory of a database. */
est_db_cache_num(ESTDB * db)4957 int est_db_cache_num(ESTDB *db){
4958   assert(db);
4959   return cbmaprnum(db->idxcc);
4960 }
4961 
4962 
4963 /* Get the size of used cache region. */
est_db_used_cache_size(ESTDB * db)4964 int est_db_used_cache_size(ESTDB *db){
4965   assert(db);
4966   return (db->icsiz + (cbmaprnum(db->idxcc) + cbmaprnum(db->auxcc)) *
4967           (sizeof(CBMAPDATUM) + ESTWORDAVGLEN)) * ESTMEMIRATIO;
4968 }
4969 
4970 
4971 /* Set the special cache for narrowing and sorting with document attributes. */
est_db_set_special_cache(ESTDB * db,const char * name,int num)4972 void est_db_set_special_cache(ESTDB *db, const char *name, int num){
4973   assert(db && name && num >= 0);
4974   if(db->spacc){
4975     free(db->scname);
4976     cbmapclose(db->spacc);
4977   }
4978   db->spacc = cbmapopenex(num + 1);
4979   db->scmnum = num;
4980   db->scname = cbmemdup(name, -1);
4981 }
4982 
4983 
4984 /* Set the callback function for database events. */
est_db_set_informer(ESTDB * db,void (* func)(const char *,void *),void * opaque)4985 void est_db_set_informer(ESTDB *db, void (*func)(const char *, void *), void *opaque){
4986   assert(db && func);
4987   db->infocb = func;
4988   db->infoop = opaque;
4989   est_db_inform(db, "status");
4990 }
4991 
4992 
4993 /* Fill the cache for keys for TF-IDF. */
est_db_fill_key_cache(ESTDB * db)4994 void est_db_fill_key_cache(ESTDB *db){
4995   const char *kbuf;
4996   char *msg;
4997   int i, ksiz, vsiz;
4998   assert(db);
4999   vlcurfirst(db->fwmdb);
5000   for(i = 0; (kbuf = vlcurkeycache(db->fwmdb, &ksiz)) != NULL; i++){
5001     vsiz = est_idx_vsiz(db->idxdb, kbuf, ksiz);
5002     cbmapput(db->keycc, kbuf, ksiz, (char *)&vsiz, sizeof(int), TRUE);
5003     vlcurnext(db->fwmdb);
5004     if(i % ESTCCCBFREQ == 0){
5005       msg = cbsprintf("filling the key cache for TF-IDF (%d)", i + 1);
5006       est_db_inform(db, msg);
5007       free(msg);
5008     }
5009   }
5010   db->kcmnum = -1;
5011 }
5012 
5013 
5014 /* Set the database of document frequency. */
est_db_set_dfdb(ESTDB * db,DEPOT * dfdb)5015 void est_db_set_dfdb(ESTDB *db, DEPOT *dfdb){
5016   assert(db);
5017   db->dfdb = dfdb;
5018 }
5019 
5020 
5021 /* Clear the result cache. */
est_db_refresh_rescc(ESTDB * db)5022 void est_db_refresh_rescc(ESTDB *db){
5023   ESTSCORE sc;
5024   const char *word;
5025   int size;
5026   assert(db);
5027   sc.id = -1;
5028   sc.score = 0;
5029   sc.value = NULL;
5030   cbmapiterinit(db->rescc);
5031   while((word = cbmapiternext(db->rescc, &size)) != NULL){
5032     cbmapput(db->rescc, word, size, (char *)&sc, sizeof(ESTSCORE), TRUE);
5033   }
5034 }
5035 
5036 
5037 /* Charge the result cache. */
est_db_charge_rescc(ESTDB * db,int max)5038 void est_db_charge_rescc(ESTDB *db, int max){
5039   CBLIST *words;
5040   const char *word, *vbuf;
5041   int i, num, size, vsiz;
5042   assert(db);
5043   if(max < 0) max = INT_MAX;
5044   CB_LISTOPEN(words);
5045   cbmapiterinit(db->rescc);
5046   while((word = cbmapiternext(db->rescc, &size)) != NULL){
5047     CB_MAPITERVAL(vbuf, word, vsiz);
5048     if(vsiz == sizeof(ESTSCORE) && ((ESTSCORE *)vbuf)->id == -1) CB_LISTPUSH(words, word, size);
5049   }
5050   num = CB_LISTNUM(words);
5051   for(i = 0; i < max && i < num; i++){
5052     word = cblistval(words, num - i - 1, &size);
5053     free(est_search_union(db, word, 1, NULL, &size, NULL, TRUE, -1, NULL));
5054   }
5055   CB_LISTCLOSE(words);
5056 }
5057 
5058 
5059 /* Get a list of words in the result cache. */
est_db_list_rescc(ESTDB * db)5060 CBLIST *est_db_list_rescc(ESTDB *db){
5061   CBLIST *words;
5062   const char *word;
5063   int size;
5064   assert(db);
5065   CB_LISTOPEN(words);
5066   cbmapiterinit(db->rescc);
5067   while((word = cbmapiternext(db->rescc, &size)) != NULL){
5068     cblistunshift(words, word, size);
5069   }
5070   return words;
5071 }
5072 
5073 
5074 /* Get the number of pseudo documents in a database. */
est_db_pseudo_doc_num(ESTDB * db)5075 int est_db_pseudo_doc_num(ESTDB *db){
5076   assert(db);
5077   return cblistnum(db->pdocs);
5078 }
5079 
5080 
5081 /* Get a list of expressions of attribute indexes of a database. */
est_db_attr_index_exprs(ESTDB * db)5082 CBLIST *est_db_attr_index_exprs(ESTDB *db){
5083   ESTATTRIDX *attridx;
5084   CBLIST *list;
5085   const char *kbuf;
5086   char *expr;
5087   assert(db);
5088   list = cblistopen();
5089   cbmapiterinit(db->aidxs);
5090   while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
5091     attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
5092     switch(attridx->type){
5093     case ESTIDXATTRSTR:
5094       expr = cbsprintf("%s=str", kbuf);
5095       break;
5096     case ESTIDXATTRNUM:
5097       expr = cbsprintf("%s=num", kbuf);
5098       break;
5099     default:
5100       expr = cbsprintf("%s=seq", kbuf);
5101       break;
5102     }
5103     CB_LISTPUSHBUF(list, expr, strlen(expr));
5104   }
5105   return list;
5106 }
5107 
5108 
5109 /* Interrupt long time processing. */
est_db_interrupt(ESTDB * db)5110 void est_db_interrupt(ESTDB *db){
5111   assert(db);
5112   db->intflag = TRUE;
5113 }
5114 
5115 
5116 /* Repair a broken database directory. */
est_db_repair(const char * name,int options,int * ecp)5117 int est_db_repair(const char *name, int options, int *ecp){
5118   ESTDB *db;
5119   DEPOT *depot, *metadb;
5120   CURIA *curia, *attrdb, *textdb, *kwddb;
5121   VILLA *villa, *listdb;
5122   CBLIST *list;
5123   CBMAP *aidxs, *attrs;
5124   ESTATTRIDX attridx, *attridxp;
5125   void *aidxdb;
5126   const char *elem, *abuf;
5127   char path[ESTPATHBUFSIZ], *kbuf, vbuf[ESTNUMBUFSIZ], *dec, *mbuf;
5128   int i, err, idmax, flags, zmode, dnum, dseq, ksiz, vsiz, type, id, msiz, esiz, asiz;
5129   assert(name && ecp);
5130   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
5131   if(est_inode(path) == -1){
5132     est_set_ecode(ecp, ESTEIO, __LINE__);
5133     return FALSE;
5134   }
5135   if(!(options & ESTRPSTRICT) && (depot= dpopen(path, DP_OWRITER, -1)) != NULL){
5136     dpclose(depot);
5137   } else {
5138     dprepair(path);
5139   }
5140   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME);
5141   if(est_inode(path) == -1){
5142     est_set_ecode(ecp, ESTEIO, __LINE__);
5143     return FALSE;
5144   }
5145   if((list = cbdirlist(path)) != NULL){
5146     for(i = 1; i < CB_LISTNUM(list); i++){
5147       elem = CB_LISTVAL(list, i);
5148       if(elem[0] < '0' || elem[0] > '9') continue;
5149       sprintf(path, "%s%c%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME, ESTPATHCHR, elem);
5150       if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5151         vlclose(villa);
5152       } else {
5153         vlrepair(path, VL_CMPLEX);
5154       }
5155     }
5156     CB_LISTCLOSE(list);
5157   }
5158   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTFWMDBNAME);
5159   if(est_inode(path) == -1){
5160     est_set_ecode(ecp, ESTEIO, __LINE__);
5161     return FALSE;
5162   }
5163   if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5164     vlclose(villa);
5165   } else {
5166     vlrepair(path, VL_CMPLEX);
5167   }
5168   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTAUXDBNAME);
5169   if(est_inode(path) == -1){
5170     est_set_ecode(ecp, ESTEIO, __LINE__);
5171     return FALSE;
5172   }
5173   if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5174     vlclose(villa);
5175   } else {
5176     vlrepair(path, VL_CMPLEX);
5177   }
5178   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTXFMDBNAME);
5179   if(est_inode(path) == -1){
5180     est_set_ecode(ecp, ESTEIO, __LINE__);
5181     return FALSE;
5182   }
5183   if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5184     vlclose(villa);
5185   } else {
5186     vlrepair(path, VL_CMPLEX);
5187   }
5188   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
5189   if(est_inode(path) == -1){
5190     est_set_ecode(ecp, ESTEIO, __LINE__);
5191     return FALSE;
5192   }
5193   if(!(options & ESTRPSTRICT) && (curia = cropen(path, CR_OWRITER, -1, -1)) != NULL){
5194     crclose(curia);
5195   } else {
5196     crrepair(path);
5197   }
5198   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
5199   if(est_inode(path) == -1){
5200     est_set_ecode(ecp, ESTEIO, __LINE__);
5201     return FALSE;
5202   }
5203   if(!(options & ESTRPSTRICT) && (curia = cropen(path, CR_OWRITER, -1, -1)) != NULL){
5204     crclose(curia);
5205   } else {
5206     crrepair(path);
5207   }
5208   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTKWDDBNAME);
5209   if(est_inode(path) == -1){
5210     est_set_ecode(ecp, ESTEIO, __LINE__);
5211     return FALSE;
5212   }
5213   if(!(options & ESTRPSTRICT) && (curia = cropen(path, CR_OWRITER, -1, -1)) != NULL){
5214     crclose(curia);
5215   } else {
5216     crrepair(path);
5217   }
5218   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
5219   if(est_inode(path) == -1){
5220     est_set_ecode(ecp, ESTEIO, __LINE__);
5221     return FALSE;
5222   }
5223   if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5224     vlclose(villa);
5225   } else {
5226     vlrepair(path, VL_CMPLEX);
5227   }
5228   if((list = cbdirlist(name)) != NULL){
5229     for(i = 0; i < CB_LISTNUM(list); i++){
5230       elem = CB_LISTVAL(list, i);
5231       if(cbstrfwmatch(elem, ESTAISEQPREF)){
5232         sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5233         if(!(options & ESTRPSTRICT) && (depot = dpopen(path, DP_OWRITER, -1)) != NULL){
5234           dpclose(depot);
5235         } else {
5236           dprepair(path);
5237         }
5238       } else if(cbstrfwmatch(elem, ESTAISTRPREF)){
5239         sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5240         if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5241           vlclose(villa);
5242         } else {
5243           vlrepair(path, VL_CMPLEX);
5244         }
5245       } else if(cbstrfwmatch(elem, ESTAINUMPREF)){
5246         sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5247         if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5248           vlclose(villa);
5249         } else {
5250           vlrepair(path, VL_CMPLEX);
5251         }
5252       }
5253     }
5254     CB_LISTCLOSE(list);
5255   }
5256   if((options & ESTRPSHODDY) && (db = est_db_open(name, ESTDBWRITER, ecp)) != NULL){
5257     if(!est_db_close(db, ecp)) return FALSE;
5258     return TRUE;
5259   }
5260   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
5261   metadb = dpopen(path, DP_OWRITER, -1);
5262   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
5263   attrdb = cropen(path, CR_OWRITER, -1, -1);
5264   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
5265   textdb = cropen(path, CR_OWRITER, -1, -1);
5266   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTKWDDBNAME);
5267   kwddb = cropen(path, CR_OWRITER, -1, -1);
5268   sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
5269   listdb = vlopen(path, VL_OWRITER, VL_CMPLEX);
5270   if(!attrdb || !textdb || !kwddb || !listdb){
5271     if(listdb) vlclose(listdb);
5272     if(kwddb) crclose(kwddb);
5273     if(textdb) crclose(textdb);
5274     if(attrdb) crclose(attrdb);
5275     if(metadb) dpclose(metadb);
5276     est_set_ecode(ecp, ESTEDB, __LINE__);
5277     return FALSE;
5278   }
5279   aidxs = cbmapopenex(ESTMINIBNUM);
5280   if((list = cbdirlist(name)) != NULL){
5281     for(i = 0; i < CB_LISTNUM(list); i++){
5282       elem = CB_LISTVAL(list, i);
5283       dec = NULL;
5284       type = -1;
5285       if(cbstrfwmatch(elem, ESTAISEQPREF)){
5286         dec = est_hex_decode(elem + strlen(ESTAISEQPREF));
5287         type = ESTIDXATTRSEQ;
5288       } else if(cbstrfwmatch(elem, ESTAISTRPREF)){
5289         dec = est_hex_decode(elem + strlen(ESTAISTRPREF));
5290         type = ESTIDXATTRSTR;
5291       } else if(cbstrfwmatch(elem, ESTAINUMPREF)){
5292         dec = est_hex_decode(elem + strlen(ESTAINUMPREF));
5293         type = ESTIDXATTRNUM;
5294       }
5295       if(dec){
5296         sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5297         switch(type){
5298         case ESTIDXATTRSTR:
5299           if((aidxdb = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5300             vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
5301             vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
5302             attridx.db = aidxdb;
5303             attridx.type = type;
5304             cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
5305           }
5306           break;
5307         case ESTIDXATTRNUM:
5308           if((aidxdb = vlopen(path, VL_OWRITER, est_aidx_numcmp)) != NULL){
5309             vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
5310             vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
5311             attridx.db = aidxdb;
5312             attridx.type = type;
5313             cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
5314           }
5315           break;
5316         default:
5317           if((aidxdb = dpopen(path, DP_OWRITER, crbnum(attrdb) / ESTAIBDIAM)) != NULL){
5318             dpsetfbpsiz(aidxdb, ESTAIDXDPFBP);
5319             attridx.db = aidxdb;
5320             attridx.type = type;
5321             cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
5322           }
5323           break;
5324         }
5325         free(dec);
5326       }
5327     }
5328     CB_LISTCLOSE(list);
5329   }
5330   err = FALSE;
5331   idmax = 0;
5332   if((vsiz = dpgetwb(metadb, ESTKEYDNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
5333     vbuf[vsiz] = '\0';
5334     idmax = atoi(vbuf);
5335   }
5336   flags = dpgetflags(metadb);
5337   zmode = 0;
5338   if(flags & ESTDFZLIB){
5339     zmode = ESTDFZLIB;
5340   } else if(flags & ESTDFLZO){
5341     zmode = ESTDFLZO;
5342   } else if(flags & ESTDFBZIP){
5343     zmode = ESTDFBZIP;
5344   }
5345   dnum = 0;
5346   dseq = 0;
5347   CB_LISTOPEN(list);
5348   if(!criterinit(attrdb)) err = TRUE;
5349   while((kbuf = criternext(attrdb, &ksiz)) != NULL){
5350     if(ksiz == sizeof(int) && (id = *(int *)kbuf) > 0 && id <= idmax &&
5351        crvsiz(attrdb, kbuf, ksiz) > 0 && crvsiz(textdb, kbuf, ksiz) > 0){
5352       dnum++;
5353       if(dseq < id) dseq = id;
5354       if(options & ESTRPSTRICT){
5355         if((mbuf = est_crget(attrdb, zmode, id, &msiz)) != NULL){
5356           attrs = cbmapload(mbuf, msiz);
5357           if((elem = cbmapget(attrs, ESTDATTRURI, -1, &esiz)) != NULL){
5358             vsiz = sprintf(vbuf, "%d", id);
5359             vlput(listdb, elem, esiz, vbuf, vsiz, VL_DKEEP);
5360           }
5361           if(cbmaprnum(aidxs) > 0){
5362             cbmapiterinit(aidxs);
5363             while((abuf = cbmapiternext(aidxs, &asiz)) != NULL){
5364               if(!(elem = cbmapget(attrs, abuf, asiz, &esiz))) continue;
5365               attridxp = (ESTATTRIDX *)cbmapiterval(abuf, NULL);
5366               switch(attridxp->type){
5367               case ESTIDXATTRSTR:
5368               case ESTIDXATTRNUM:
5369                 est_aidx_attr_put(attridxp->db, id, elem, esiz);
5370                 break;
5371               default:
5372                 est_aidx_seq_put(attridxp->db, id, elem, esiz);
5373                 break;
5374               }
5375             }
5376           }
5377           cbmapclose(attrs);
5378           free(mbuf);
5379         }
5380       }
5381     } else {
5382       CB_LISTPUSH(list, kbuf, ksiz);
5383     }
5384     free(kbuf);
5385   }
5386   if(dpecode != DP_ENOITEM) err = TRUE;
5387   for(i = 0; i < CB_LISTNUM(list); i++){
5388     elem = CB_LISTVAL2(list, i, esiz);
5389     crout(attrdb, elem, esiz);
5390     crout(textdb, elem, esiz);
5391     crout(kwddb, elem, esiz);
5392   }
5393   CB_LISTCLOSE(list);
5394   sprintf(vbuf, "%d", dseq);
5395   if(!dpput(metadb, ESTKEYDSEQ, -1, vbuf, -1, DP_DOVER)) err = TRUE;
5396   sprintf(vbuf, "%d", dnum);
5397   if(!dpput(metadb, ESTKEYDNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
5398   cbmapiterinit(aidxs);
5399   while((elem = cbmapiternext(aidxs, NULL)) != NULL){
5400     attridxp = (ESTATTRIDX *)cbmapiterval(elem, NULL);
5401     switch(attridxp->type){
5402     case ESTIDXATTRSTR:
5403     case ESTIDXATTRNUM:
5404       if(!vlclose(attridxp->db)) err = TRUE;
5405       break;
5406     default:
5407       if(!dpclose(attridxp->db)) err = TRUE;
5408       break;
5409     }
5410   }
5411   cbmapclose(aidxs);
5412   if(!vlclose(listdb)) err = TRUE;
5413   if(!crclose(kwddb)) err = TRUE;
5414   if(!crclose(textdb)) err = TRUE;
5415   if(!crclose(attrdb)) err = TRUE;
5416   if(!dpclose(metadb)) err = TRUE;
5417   if(err){
5418     est_set_ecode(ecp, ESTEDB, __LINE__);
5419     return FALSE;
5420   }
5421   return err ? FALSE : TRUE;
5422 }
5423 
5424 
5425 /* Extract words for snippet from hints of search. */
est_hints_to_words(CBMAP * hints)5426 CBLIST *est_hints_to_words(CBMAP *hints){
5427   CBLIST *words;
5428   const char *kbuf;
5429   int ksiz;
5430   assert(hints);
5431   CB_LISTOPEN(words);
5432   cbmapiterinit(hints);
5433   while((kbuf = cbmapiternext(hints, &ksiz)) != NULL){
5434     if(ksiz < 1 || atoi(cbmapget(hints, kbuf, ksiz, NULL)) < 0) continue;
5435     CB_LISTPUSH(words, kbuf, ksiz);
5436   }
5437   return words;
5438 }
5439 
5440 
5441 /* Add a record into a result map for logical operation. */
est_resmap_add(CBMAP * map,const char * key,int score,int method)5442 void est_resmap_add(CBMAP *map, const char *key, int score, int method){
5443   int elem[2], *ep, size;
5444   assert(map && key);
5445   size = strlen(key);
5446   if((ep = (int *)cbmapget(map, key, size, NULL)) != NULL){
5447     elem[0] = ep[0] + 1;
5448     switch(method){
5449     case ESTRMLOSUM:
5450       elem[1] = ep[1] + score;
5451       break;
5452     case ESTRMLOMAX:
5453       elem[1] = score > ep[1] ? score : ep[1];
5454       break;
5455     case ESTRMLOMIN:
5456       elem[1] = score < ep[1] ? score : ep[1];
5457       break;
5458     case ESTRMLOAVG:
5459       elem[1] = (ep[1] * (ep[0] - 1) + score) / ep[0];
5460       break;
5461     default:
5462       elem[1] = score;
5463       break;
5464     }
5465   } else {
5466     elem[0] = 1;
5467     elem[1] = score;
5468   }
5469   cbmapput(map, key, size, (char *)&elem, sizeof(int) * 2, TRUE);
5470 }
5471 
5472 
5473 
5474 /* Compare two result elements by score.
5475    `ap' specifies the pointer to one element.
5476    `bp' specifies the pointer to the other element.
5477    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_resmapelem_compare(const void * ap,const void * bp)5478 static int est_resmapelem_compare(const void *ap, const void *bp){
5479   assert(ap && bp);
5480   return ((ESTRESMAPELEM *)bp)->score - ((ESTRESMAPELEM *)ap)->score;
5481 }
5482 
5483 
5484 /* Dump a result list of a result map for logical operation. */
est_resmap_dump(CBMAP * map,int min,int * nump)5485 ESTRESMAPELEM *est_resmap_dump(CBMAP *map, int min, int *nump){
5486   ESTRESMAPELEM *elems;
5487   const char *key, *vbuf;
5488   int num, vsiz;
5489   assert(map && min >= 0 && nump);
5490   CB_MALLOC(elems, cbmaprnum(map) * sizeof(ESTRESMAPELEM) + 1);
5491   num = 0;
5492   cbmapiterinit(map);
5493   while((key = cbmapiternext(map, NULL)) != NULL){
5494     CB_MAPITERVAL(vbuf, key, vsiz);
5495     if(((int *)vbuf)[0] < min) continue;
5496     elems[num].key = key;
5497     elems[num].score = ((int *)vbuf)[1];
5498     num++;
5499   }
5500   qsort(elems, num, sizeof(ESTRESMAPELEM), est_resmapelem_compare);
5501   *nump = num;
5502   return elems;
5503 }
5504 
5505 
5506 /* Reset the environment of the process. */
est_proc_env_reset(void)5507 void est_proc_env_reset(void){
5508   char *value, *pbuf;
5509   cbstdiobin();
5510   putenv("LANG=C");
5511   putenv("LANGUAGE=C");
5512   putenv("LC_CTYPE=C");
5513   putenv("LC_COLLATE=C");
5514   putenv("LC_TIME=C");
5515   putenv("LC_NUMERIC=C");
5516   putenv("LC_MONETARY=C");
5517   putenv("LC_ALL=C");
5518   putenv("EST_VERSION=" _EST_VERSION);
5519   if((value = getenv("PATH")) != NULL){
5520     if(ESTPATHCHR == '\\'){
5521       pbuf = cbsprintf("PATH=%s;C:\\hyperestraier;D:\\hyperestraier;E:\\hyperestraier", value);
5522     } else {
5523       pbuf = cbsprintf("PATH=%s:/bin:/sbin:/usr/bin:/usr/sbin:"
5524                        "/usr/local/bin:/usr/local/sbin", value);
5525     }
5526     putenv(pbuf);
5527     cbglobalgc(pbuf, free);
5528   }
5529 }
5530 
5531 
5532 /* Make a directory. */
est_mkdir(const char * path)5533 int est_mkdir(const char *path){
5534 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
5535   assert(path);
5536   return mkdir(path) == 0 ? TRUE : FALSE;
5537 #else
5538   assert(path);
5539   return mkdir(path, ESTDIRMODE) == 0 ? TRUE : FALSE;
5540 #endif
5541 }
5542 
5543 
5544 /* Remove a directory and its contents recursively. */
est_rmdir_rec(const char * path)5545 int est_rmdir_rec(const char *path){
5546   CBLIST *files;
5547   const char *file;
5548   char pbuf[ESTPATHBUFSIZ];
5549   int i;
5550   assert(path);
5551   if((files = cbdirlist(path)) != NULL){
5552     for(i = 0; i < CB_LISTNUM(files); i++){
5553       file = CB_LISTVAL(files, i);
5554       if(!strcmp(file, ESTCDIRSTR) || !strcmp(file, ESTPDIRSTR)) continue;
5555       sprintf(pbuf, "%s%c%s", path, ESTPATHCHR, file);
5556       if(unlink(pbuf) == -1) est_rmdir_rec(pbuf);
5557     }
5558     CB_LISTCLOSE(files);
5559   }
5560   return rmdir(path) == 0;
5561 }
5562 
5563 
5564 /* Get the canonicalized absolute pathname of a file. */
est_realpath(const char * path)5565 char *est_realpath(const char *path){
5566 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
5567   char pbuf[ESTPATHBUFSIZ*2], *p;
5568   assert(path);
5569   if(GetFullPathName(path, ESTPATHBUFSIZ, pbuf, &p) == 0){
5570     if((((path[0] >= 'A' && path[0] <= 'Z') || (path[0] >= 'a' && path[0] <= 'z')) &&
5571         path[1] == ':' && path[2] == ESTPATHCHR) || path[0] == ESTPATHCHR ||
5572        GetFullPathName(ESTCDIRSTR, ESTPATHBUFSIZ, pbuf, &p) == 0){
5573       sprintf(pbuf, "%s", path);
5574     } else {
5575       sprintf(pbuf + strlen(pbuf), "%c%s", ESTPATHCHR, path);
5576     }
5577   }
5578   return cbmemdup(pbuf, -1);
5579 #else
5580   char pbuf[ESTPATHBUFSIZ*2];
5581   assert(path);
5582   if(!realpath(path, pbuf)){
5583     if(path[0] == ESTPATHCHR || !realpath(ESTCDIRSTR, pbuf)){
5584       sprintf(pbuf, "%s", path);
5585     } else {
5586       sprintf(pbuf + strlen(pbuf), "%c%s", ESTPATHCHR, path);
5587     }
5588   }
5589   return cbmemdup(pbuf, -1);
5590 #endif
5591 }
5592 
5593 
/* Get the inode number of a file.
   `path' specifies the path of a file.
   The return value is a non-negative number identifying the file, or -1 if the file cannot
   be examined by `stat'.  On Windows builds the number is a 15-bit hash of the full path
   name instead of a real inode number; elsewhere it is `st_ino' masked with INT_MAX. */
int est_inode(const char *path){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  char pbuf[ESTPATHBUFSIZ*2], *p;
  int inode;
  struct stat sbuf;
  assert(path);
  if(stat(path, &sbuf) == -1) return -1;
  if(GetFullPathName(path, ESTPATHBUFSIZ*2, pbuf, &p) == 0) return -1;
  inode = 11003;
  p = pbuf;
  while(*p != '\0'){
    inode = inode * 31 + *(unsigned char *)p;
    p++;
  }
  return (inode * 911) & 0x7FFF;
#else
  struct stat st;
  assert(path);
  return stat(path, &st) == -1 ? -1 : (int)(st.st_ino & INT_MAX);
#endif
}
5617 
5618 
/* Change modification time of a file.
   `path' specifies the path of a file.
   `mtime' specifies the time to be set as both the access time and the modification time.
   If it is negative, `utime' is called with `NULL', which means the current time.
   The return value is true if success, else it is false. */
int est_utime(const char *path, time_t mtime){
  struct utimbuf tb;
  const struct utimbuf *tbp;
  assert(path);
  if(mtime < 0){
    tbp = NULL;
  } else {
    tb.actime = mtime;
    tb.modtime = mtime;
    tbp = &tb;
  }
  return utime(path, tbp) == 0;
}
5628 
5629 
5630 
/* Get the time of day in milliseconds.
   The return value is the time of day in milliseconds.  On Windows builds it is composed
   from the local time given by `GetLocalTime' and `mktime'; elsewhere it is taken from
   `gettimeofday', and 0.0 is returned if that call fails. */
double est_gettimeofday(void){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  SYSTEMTIME st;
  struct tm ts;
  GetLocalTime(&st);
  memset(&ts, 0, sizeof(ts));
  ts.tm_year = st.wYear - 1900;
  ts.tm_mon = st.wMonth - 1;
  ts.tm_mday = st.wDay;
  ts.tm_hour = st.wHour;
  ts.tm_min = st.wMinute;
  ts.tm_sec = st.wSecond;
  return (double)mktime(&ts) * 1000 + (double)st.wMilliseconds;
#else
  struct timeval tv;
  /* the time zone argument is obsolete and its use is unspecified, so `NULL' is passed */
  if(gettimeofday(&tv, NULL) == -1) return 0.0;
  return (double)tv.tv_sec * 1000 + (double)tv.tv_usec / 1000;
#endif
}
5652 
5653 
/* Suspend execution for microsecond intervals.
   `usec' specifies the interval of the suspension in microseconds.
   On Windows builds the interval is truncated to milliseconds because `Sleep' is used. */
void est_usleep(unsigned long usec){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  unsigned long msec = usec / 1000;
  Sleep(msec);
#else
  usleep(usec);
#endif
}
5662 
5663 
5664 /* Set a signal handler. */
est_signal(int signum,void (* sighandler)(int))5665 void est_signal(int signum, void (*sighandler)(int)){
5666 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
5667   static int first = TRUE;
5668   int i;
5669   assert(signum >= 0 && sighandler);
5670   if(first){
5671     for(i = 1; i < ESTSIGNUM; i++){
5672       est_signal_handlers[i] = NULL;
5673     }
5674     SetConsoleCtrlHandler((PHANDLER_ROUTINE)est_signal_dispatch, TRUE);
5675     first = FALSE;
5676   }
5677   if(signum >= ESTSIGNUM) return;
5678   if(sighandler == SIG_IGN){
5679     signal(signum, SIG_IGN);
5680   } else if(sighandler == SIG_DFL){
5681     signal(signum, SIG_DFL);
5682   } else {
5683     signal(signum, (void (*)(int))est_signal_dispatch);
5684     est_signal_handlers[signum] = sighandler;
5685   }
5686 #else
5687   static int first = TRUE;
5688   struct sigaction act;
5689   int i;
5690   assert(signum >= 0 && sighandler);
5691   if(first){
5692     for(i = 1; i < ESTSIGNUM; i++){
5693       est_signal_handlers[i] = NULL;
5694     }
5695     first = FALSE;
5696   }
5697   if(signum >= ESTSIGNUM) return;
5698   memset(&act, 0, sizeof(act));
5699   if(sighandler == SIG_IGN){
5700     act.sa_handler = SIG_IGN;
5701   } else if(sighandler == SIG_DFL){
5702     act.sa_handler = SIG_DFL;
5703   } else {
5704     act.sa_handler = (void (*)(int))est_signal_dispatch;
5705     est_signal_handlers[signum] = sighandler;
5706   }
5707   sigemptyset(&act.sa_mask);
5708   act.sa_flags = 0;
5709   sigaction(signum, &act, NULL);
5710 #endif
5711 }
5712 
5713 
/* Send a signal to a process.
   `pid' specifies the PID of a target process.
   `sig' specifies a signal code.
   The return value is true if success, else it is false.  On Windows builds nothing is sent
   and the return value is always false. */
int est_kill(int pid, int sig){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  return FALSE;
#else
  return kill(pid, sig) != 0 ? 0 : 1;
#endif
}
5722 
5723 
/* Get the load ratio of the physical memory.
   The return value is the ratio given by `GlobalMemoryStatus' (`dwMemoryLoad' divided by
   100) on Windows and Cygwin builds.  On the other platforms it is always 0.0. */
double est_memory_usage(void){
  double ratio;
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_) || defined(_SYS_CYGWIN_)
  MEMORYSTATUS ms;
  ms.dwLength = sizeof(MEMORYSTATUS);
  GlobalMemoryStatus(&ms);
  ratio = ms.dwMemoryLoad / 100.0;
#else
  ratio = 0.0;
#endif
  return ratio;
}
5735 
5736 
/* Get the media type of an extension.
   `ext' specifies the extension of a file path, including the leading dot.
   The return value is the media type of the extension, or "application/octet-stream" if the
   extension is not in the table.  The comparison ignores case.
   The table is a sequence of (extension, media type) pairs, so only every second element is
   a key and the lookup steps by two.  When an extension is listed twice (".xhtml", ".xht"),
   the earlier pair wins. */
const char *est_ext_type(const char *ext){
  static const char *list[] = {
    ".txt", "text/plain", ".txt.en", "text/plain",
    ".txt.ja", "text/plain", ".asc", "text/plain",
    ".in", "text/plain", ".c", "text/plain",
    ".h", "text/plain", ".cc", "text/plain",
    ".java", "text/plain", ".sh", "text/plain",
    ".pl", "text/plain", ".py", "text/plain",
    ".rb", "text/plain", ".idl", "text/plain",
    ".csv", "text/plain", ".log", "text/plain",
    ".conf", "text/plain", ".rc", "text/plain",
    ".ini", "text/plain", ".html", "text/html",
    ".htm", "text/html", ".xhtml", "text/html",
    ".xht", "text/html", ".css", "text/css",
    ".js", "text/javascript", ".tsv", "text/tab-separated-values",
    ".eml", "message/rfc822", ".mime", "message/rfc822",
    ".mht", "message/rfc822", ".mhtml", "message/rfc822",
    ".sgml", "application/sgml", ".sgm", "application/sgml",
    ".xml", "application/xml", ".xsl", "application/xml",
    ".xslt", "application/xslt+xml", ".xhtml", "application/xhtml+xml",
    ".xht", "application/xhtml+xml", ".rdf", "application/rdf+xml",
    ".rss", "application/rss+xml", ".dtd", "application/xml-dtd",
    ".rtf", "application/rtf", ".pdf", "application/pdf",
    ".ps", "application/postscript", ".eps", "application/postscript",
    ".doc", "application/msword", ".xls", "application/vnd.ms-excel",
    ".ppt", "application/vnd.ms-powerpoint", ".xdw", "application/vnd.fujixerox.docuworks",
    ".swf", "application/x-shockwave-flash", ".zip", "application/zip",
    ".tar", "application/x-tar", ".gz", "application/x-gzip",
    ".bz2", "application/octet-stream", ".z", "application/octet-stream",
    ".lha", "application/octet-stream", ".lzh", "application/octet-stream",
    ".cab", "application/octet-stream", ".rar", "application/octet-stream",
    ".sit", "application/octet-stream", ".bin", "application/octet-stream",
    ".o", "application/octet-stream", ".so", "application/octet-stream",
    ".exe", "application/octet-stream", ".dll", "application/octet-stream",
    ".class", "application/octet-stream", ".png", "image/png",
    ".gif", "image/gif", ".jpg", "image/jpeg",
    ".jpeg", "image/jpeg", ".tif", "image/tiff",
    ".tiff", "image/tiff", ".bmp", "image/bmp",
    ".au", "audio/basic", ".snd", "audio/basic",
    ".mid", "audio/midi", ".midi", "audio/midi",
    ".mp2", "audio/mpeg", ".mp3", "audio/mpeg",
    ".wav", "audio/x-wav", ".mpg", "video/mpeg",
    ".mpeg", "video/mpeg", ".qt", "video/quicktime",
    ".mov", "video/quicktime", ".avi", "video/x-msvideo",
    NULL
  };
  int i;
  assert(ext);
  /* keys are at even positions; the terminating `NULL' is at an even position as well */
  for(i = 0; list[i]; i += 2){
    if(!cbstricmp(ext, list[i])) return list[i+1];
  }
  return "application/octet-stream";
}
5791 
5792 
5793 /* Set a seed vector from a map object. */
est_vector_set_seed(CBMAP * svmap,int * svec,int vnum)5794 void est_vector_set_seed(CBMAP *svmap, int *svec, int vnum){
5795   const char *kbuf;
5796   int nnum, ksiz;
5797   assert(svmap && svec && vnum > 0);
5798   cbmapiterinit(svmap);
5799   nnum = 0;
5800   while(nnum < vnum){
5801     if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
5802       if(ksiz < 1) continue;
5803       svec[nnum++] = atoi(cbmapiterval(kbuf, NULL));
5804     } else {
5805       svec[nnum++] = 0;
5806     }
5807   }
5808 }
5809 
5810 
5811 /* Set a target vector from a map object. */
est_vector_set_target(CBMAP * svmap,CBMAP * tvmap,int * tvec,int vnum)5812 void est_vector_set_target(CBMAP *svmap, CBMAP *tvmap, int *tvec, int vnum){
5813   const char *kbuf, *vbuf;
5814   int i, ksiz;
5815   assert(svmap && tvmap && tvec && vnum > 0);
5816   cbmapiterinit(svmap);
5817   for(i = 0; i < vnum; i++){
5818     if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
5819       vbuf = cbmapget(tvmap, kbuf, ksiz, NULL);
5820       tvec[i] = vbuf ? atoi(vbuf) : 0;
5821     } else {
5822       tvec[i] = 0;
5823     }
5824   }
5825 }
5826 
5827 
5828 /* Get the cosine of the angle of two vectors. */
est_vector_cosine(const int * avec,const int * bvec,int vnum)5829 double est_vector_cosine(const int *avec, const int *bvec, int vnum){
5830   int i;
5831   double iprod, aabs, babs;
5832   assert(avec && bvec && vnum >= 0);
5833   iprod = 0.0;
5834   for(i = 0; i < vnum; i++){
5835     iprod += (double)avec[i] * (double)bvec[i];
5836   }
5837   aabs = 0.0;
5838   for(i = 0; i < vnum; i++){
5839     aabs += (double)avec[i] * (double)avec[i];
5840   }
5841   aabs = sqrt(aabs);
5842   babs = 0.0;
5843   for(i = 0; i < vnum; i++){
5844     babs += (double)bvec[i] * (double)bvec[i];
5845   }
5846   babs = sqrt(babs);
5847   if(iprod <= 0.0 || aabs < 1.0 || babs < 1.0) return 0.0;
5848   return iprod / (aabs * babs);
5849 }
5850 
5851 
5852 
5853 /*************************************************************************************************
5854  * private objects
5855  *************************************************************************************************/
5856 
5857 
5858 /* Set the error code.
5859    `ecp' specifies the pointer to a variable to be assigned.
5860    `value' specifies the error code to be assgined.
5861    `line' specifies the number of the line where the error happened. */
est_set_ecode(int * ecp,int value,int line)5862 static void est_set_ecode(int *ecp, int value, int line){
5863   char buf[ESTPATHBUFSIZ];
5864   assert(ecp && line > 0);
5865   *ecp = value;
5866   if(dpdbgfd >= 0){
5867     fflush(stdout);
5868     fflush(stderr);
5869     sprintf(buf, "* est_set_ecode: %d: [%d] %s\n", line, value, est_err_msg(value));
5870     write(dpdbgfd, buf, strlen(buf));
5871   }
5872 }
5873 
5874 
/* Encode a string into hexadecimal.
   `str' specifies a string.
   The return value is the result hexadecimal string: two upper case digits for each byte
   of the input.  The caller should free the return value. */
static char *est_hex_encode(const char *str){
  static const char digits[] = "0123456789ABCDEF";
  const unsigned char *rp;
  char *res, *wp;
  assert(str);
  CB_MALLOC(res, strlen(str) * 2 + 1);
  wp = res;
  for(rp = (const unsigned char *)str; *rp != '\0'; rp++){
    *(wp++) = digits[*rp >> 4];
    *(wp++) = digits[*rp & 0x0F];
  }
  *wp = '\0';
  return res;
}
5890 
5891 
/* Decode a hexadecimal string into original one.
   `str' specifies a hexadecimal string.  It is expected to be what `est_hex_encode' makes:
   an even number of digits among '0' to '9' and 'A' to 'F'.  Other input is not rejected but
   gives meaningless bytes.
   The return value is the original string.  The caller should free the return value. */
static char *est_hex_decode(const char *str){
  char *res, *wp;
  int pos, len, hi, lo;
  assert(str);
  len = strlen(str);
  CB_MALLOC(res, len + 1);
  wp = res;
  pos = 0;
  while(pos < len){
    /* with an odd length, the last `lo' is read from the terminating null character */
    hi = str[pos];
    lo = str[pos+1];
    hi = hi >= 'A' ? hi - 'A' + 10 : hi - '0';
    lo = lo >= 'A' ? lo - 'A' + 10 : lo - '0';
    *(wp++) = hi * 16 + lo;
    pos += 2;
  }
  *wp = '\0';
  return res;
}
5909 
5910 
5911 /* Count the number of missing characters when converting.
5912    `ptr' specifies the pointer to a region.
5913    `size' specifies the size of the region.
5914    `icode' specifies the name of encoding of the input string.
5915    `ocode' specifies the name of encoding of the output string.
5916    The return value is the number of missing characters. */
est_enc_miss(const char * ptr,int size,const char * icode,const char * ocode)5917 static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode){
5918   iconv_t ic;
5919   char obuf[ESTICCHECKSIZ], *wp, *rp;
5920   size_t isiz, osiz;
5921   int miss;
5922   assert(ptr && size >= 0 && icode && ocode);
5923   isiz = size;
5924   if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return ESTICMISSMAX;
5925   miss = 0;
5926   rp = (char *)ptr;
5927   while(isiz > 0){
5928     osiz = ESTICCHECKSIZ;
5929     wp = obuf;
5930     if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == -1){
5931       if(errno == EILSEQ || errno == EINVAL){
5932         rp++;
5933         isiz--;
5934         miss++;
5935         if(miss >= ESTICMISSMAX) break;
5936       } else {
5937         break;
5938       }
5939     }
5940   }
5941   if(iconv_close(ic) == -1) return ESTICMISSMAX;
5942   return miss;
5943 }
5944 
5945 
/* Normalize a text.
   `utext' specifies a text whose encoding is UTF-16BE.  It is rewritten in place.
   `size' specifies the size of the text.  A trailing odd byte is dropped.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   Control characters and various spaces become U+0020, several dashes and quotation marks
   become their ASCII counterparts, selected fullwidth ASCII variants become ASCII, and
   halfwidth katakana (with a following voiced or semi-voiced sound mark merged) become
   characters of the U+30xx page. */
static void est_normalize_text(unsigned char *utext, int size, int *sp){
  /* low bytes of the U+30xx characters substituted for U+FF61 to U+FF9D, in code order */
  static const unsigned char kanatbl[] = {
    0x02, 0x0c, 0x0d, 0x01, 0xfb, 0xf2,    /* U+FF61 - U+FF66: marks and wo */
    0xa1, 0xa3, 0xa5, 0xa7, 0xa9,          /* U+FF67 - U+FF6B: small a-o */
    0xe3, 0xe5, 0xe7,                      /* U+FF6C - U+FF6E: small ya-yo */
    0xc3, 0xfc,                            /* U+FF6F - U+FF70: small tu, prolonged mark */
    0xa2, 0xa4, 0xa6, 0xa8, 0xaa,          /* U+FF71 - U+FF75: a-o */
    0xab, 0xad, 0xaf, 0xb1, 0xb3,          /* U+FF76 - U+FF7A: ka-ko */
    0xb5, 0xb7, 0xb9, 0xbb, 0xbd,          /* U+FF7B - U+FF7F: sa-so */
    0xbf, 0xc1, 0xc4, 0xc6, 0xc8,          /* U+FF80 - U+FF84: ta-to */
    0xca, 0xcb, 0xcc, 0xcd, 0xce,          /* U+FF85 - U+FF89: na-no */
    0xcf, 0xd2, 0xd5, 0xd8, 0xdb,          /* U+FF8A - U+FF8E: ha-ho */
    0xde, 0xdf, 0xe0, 0xe1, 0xe2,          /* U+FF8F - U+FF93: ma-mo */
    0xe4, 0xe6, 0xe8,                      /* U+FF94 - U+FF96: ya-yo */
    0xe9, 0xea, 0xeb, 0xec, 0xed,          /* U+FF97 - U+FF9B: ra-ro */
    0xef, 0xf3                             /* U+FF9C - U+FF9D: wa, n */
  };
  int ri, wi, hi, lo, fold, mark, kana;
  assert(utext && size >= 0 && sp);
  wi = 0;
  /* the writing index never passes the reading index, so the conversion is done in place */
  for(ri = 0; ri < size - 1; ri += 2){
    hi = utext[ri];
    lo = utext[ri+1];
    switch(hi){
    case 0x00:
      /* control characters and no-break space */
      if(lo <= 0x08 || (lo >= 0x0e && lo <= 0x1f) || lo == 0xa0) lo = 0x20;
      break;
    case 0x20:
      switch(lo){
      case 0x02: case 0x03: case 0x09:
        /* en space, em space, thin space */
        hi = 0x00;
        lo = 0x20;
        break;
      case 0x10: case 0x15:
        /* hyphen, horizontal line */
        hi = 0x00;
        lo = 0x2d;
        break;
      case 0x19:
        /* apostrophe */
        hi = 0x00;
        lo = 0x27;
        break;
      case 0x33:
        /* double quotes */
        hi = 0x00;
        lo = 0x22;
        break;
      default:
        break;
      }
      break;
    case 0x22:
      /* minus sign */
      if(lo == 0x12){
        hi = 0x00;
        lo = 0x2d;
      }
      break;
    case 0x30:
      /* fullwidth space */
      if(lo == 0x00){
        hi = 0x00;
        lo = 0x20;
      }
      break;
    case 0xff:
      switch(lo){
      case 0x01: case 0x03: case 0x04: case 0x05: case 0x06:
      case 0x0a: case 0x0b: case 0x0c: case 0x0e: case 0x0f:
      case 0x1a: case 0x1b: case 0x1d: case 0x1f: case 0x20:
      case 0x3c: case 0x3e: case 0x3f: case 0x5c:
        /* the fullwidth symbols which are converted */
        fold = 1;
        break;
      default:
        /* fullwidth numbers, alphabets and small alphabets */
        fold = (lo >= 0x10 && lo <= 0x19) || (lo >= 0x21 && lo <= 0x3a) ||
          (lo >= 0x41 && lo <= 0x5a);
        break;
      }
      if(fold){
        /* a fullwidth variant is 0x20 below its ASCII counterpart */
        hi = 0x00;
        lo += 0x20;
      } else if(lo >= 0x61 && lo <= 0x9d){
        /* halfwidth katakana: peek at the next character for a sound mark */
        mark = 0x00;
        if(ri + 2 < size - 1 && utext[ri+2] == 0xff) mark = utext[ri+3];
        kana = kanatbl[lo-0x61];
        if(mark == 0x9e && lo == 0x73){
          /* u with the voiced mark */
          kana = 0xf4;
          ri += 2;
        } else if(mark == 0x9e && ((lo >= 0x76 && lo <= 0x84) || (lo >= 0x8a && lo <= 0x8e))){
          /* ka-to and ha-ho with the voiced mark */
          kana += 1;
          ri += 2;
        } else if(mark == 0x9f && lo >= 0x8a && lo <= 0x8e){
          /* ha-ho with the semi-voiced mark */
          kana += 2;
          ri += 2;
        }
        hi = 0x30;
        lo = kana;
      }
      break;
    default:
      break;
    }
    utext[wi++] = hi;
    utext[wi++] = lo;
  }
  *sp = wi;
}
6236 
6237 
/* Canonicalize a text for search keys.
   `utext' specifies a text whose encoding is UTF-16BE.  It is rewritten in place and its size
   does not change.
   `size' specifies the size of the text.  If it is odd, the last byte is left untouched.
   `funcspc' specifies whether to allow functional space characters.
   Upper case characters of ASCII, Latin-1, Latin Extended-A, Greek and Cyrillic are converted
   into lower case, accented latin characters are replaced by their base letters, and
   characters of U+FFF0 or later become spaces. */
static void est_canonicalize_text(unsigned char *utext, int size, int funcspc){
  /* ranges of low bytes in Latin Extended-A (after case folding) and their base letters */
  static const struct {
    unsigned char first;
    unsigned char last;
    char base;
  } exatbl[] = {
    { 0x00, 0x05, 'a' }, { 0x06, 0x0d, 'c' }, { 0x0e, 0x11, 'd' }, { 0x12, 0x1b, 'e' },
    { 0x1c, 0x23, 'g' }, { 0x24, 0x27, 'h' }, { 0x28, 0x31, 'i' }, { 0x34, 0x35, 'j' },
    { 0x36, 0x38, 'k' }, { 0x39, 0x42, 'l' }, { 0x43, 0x4b, 'n' }, { 0x4c, 0x51, 'o' },
    { 0x54, 0x59, 'r' }, { 0x5a, 0x61, 's' }, { 0x62, 0x67, 't' }, { 0x68, 0x73, 'u' },
    { 0x74, 0x75, 'w' }, { 0x76, 0x78, 'y' }, { 0x79, 0x7e, 'z' }, { 0x7f, 0x7f, 's' }
  };
  int i, j, hi, lo;
  /* stop before a dangling last byte so that `utext[i+1]' always stays inside the region */
  for(i = 0; i < size - 1; i += 2){
    hi = utext[i];
    lo = utext[i+1];
    switch(hi){
    case 0x00:
      if(lo < ' '){
        /* functional spaces */
        if(!funcspc) lo = ' ';
      } else if(lo >= 'A' && lo <= 'Z'){
        /* ascii */
        lo += 'a' - 'A';
      } else if(lo >= 0xc0){
        /* latin-1 supplement: fold the case first, then strip the accent */
        if((lo >= 0xc0 && lo <= 0xd6) || (lo >= 0xd8 && lo <= 0xde)) lo += 0x20;
        if(lo >= 0xe0 && lo <= 0xe5){
          lo = 'a';
        } else if(lo == 0xe7){
          lo = 'c';
        } else if(lo >= 0xe8 && lo <= 0xeb){
          lo = 'e';
        } else if(lo >= 0xec && lo <= 0xef){
          lo = 'i';
        } else if(lo == 0xf1){
          lo = 'n';
        } else if((lo >= 0xf2 && lo <= 0xf6) || lo == 0xf8){
          lo = 'o';
        } else if(lo >= 0xf9 && lo <= 0xfc){
          lo = 'u';
        } else if(lo == 0xfd || lo == 0xff){
          lo = 'y';
        }
      }
      break;
    case 0x01:
      /* latin extended-a: the upper case letter sits on the even or the odd code,
         depending on the range */
      if(((lo <= 0x36 || (lo >= 0x4a && lo <= 0x76)) && lo % 2 == 0) ||
         (((lo >= 0x39 && lo <= 0x47) || (lo >= 0x79 && lo <= 0x7d)) && lo % 2 == 1)) lo += 0x1;
      for(j = 0; j < (int)(sizeof(exatbl) / sizeof(exatbl[0])); j++){
        if(lo >= exatbl[j].first && lo <= exatbl[j].last){
          hi = 0x00;
          lo = exatbl[j].base;
          break;
        }
      }
      break;
    case 0x03:
      /* greek */
      if(lo >= 0x91 && lo <= 0xa9) lo += 0x20;
      break;
    case 0x04:
      /* cyrillic */
      if(lo >= 0x10 && lo <= 0x2f){
        lo += 0x20;
      } else if(lo <= 0x0f){
        lo += 0x50;
      }
      break;
    case 0xff:
      /* special */
      if(lo >= 0xf0){
        hi = 0x00;
        lo = ' ';
      }
      break;
    default:
      break;
    }
    utext[i] = hi;
    utext[i+1] = lo;
  }
}
6358 
6359 
6360 /* Categorize a character.
6361    `c' specifies the UCS number of a character.
6362    The return value is the category of the character. */
est_char_category(int c)6363 static int est_char_category(int c){
6364   /* ascii space */
6365   if(c <= 0x0020) return ESTSPACECHR;
6366   /* ascii alnum */
6367   if((c >= 0x0030 && c <= 0x0039) || (c >= 0x0041 && c <= 0x005a) ||
6368      (c >= 0x0061 && c <= 0x007a)) return ESTWESTALPH;
6369   /* latin */
6370   if((c >= 0x00c0 && c <= 0x00ff && c != 0x00d7 && c != 0x00f7) || (c >= 0x0100 && c <= 0x017f))
6371     return ESTWESTALPH;
6372   /* arabic and syrian */
6373   if(c >= 0x0600 && c <= 0x08ff) return ESTEASTALPH;
6374   /* south and south east asia */
6375   if((c >= 0x0900 && c <= 0x109f) || (c >= 0x1700 && c <= 0x1cff)) return ESTEASTALPH;
6376   /* cjk and surrogates */
6377   if((c >= 0x1100 && c <= 0x11ff) || (c >= 0x2e80 && c <= 0xdfff) ||
6378      (c >= 0xf900 && c <= 0xfaff) || (c >= 0xff00 && c <= 0xffef)) return ESTEASTALPH;
6379   /* asian presentation forms */
6380   if((c >= 0xfb50 && c <= 0xfdff) || (c >= 0xfe30 && c <= 0xfe4f) ||
6381      (c >= 0xfe70 && c <= 0xfeff)) return ESTEASTALPH;
6382   /* others */
6383   return ESTDELIMCHR;
6384 }
6385 
6386 
6387 /* Categorize a character for perfect N-gram analyzer.
6388    `c' specifies the UCS number of a character.
6389    The return value is the category of the character. */
est_char_category_perfng(int c)6390 static int est_char_category_perfng(int c){
6391   if(c <= 0x0020) return ESTSPACECHR;
6392   return ESTEASTALPH;
6393 }
6394 
6395 
6396 /* Categorize a character for character category analyzer.
6397    `c' specifies the UCS number of a character.
6398    The return value is the category of the character. */
est_char_category_chrcat(int c)6399 static int est_char_category_chrcat(int c){
6400   /* ascii space */
6401   if(c <= 0x0020) return ESTSPACECHR;
6402   /* ascii alnum */
6403   if((c >= 0x0030 && c <= 0x0039) || (c >= 0x0041 && c <= 0x005a) ||
6404      (c >= 0x0061 && c <= 0x007a)) return ESTWESTALPH;
6405   /* latin */
6406   if((c >= 0x00c0 && c <= 0x00ff && c != 0x00d7 && c != 0x00f7) || (c >= 0x0100 && c <= 0x017f))
6407     return ESTWESTALPH;
6408   /* arabic and syrian */
6409   if(c >= 0x0600 && c <= 0x08ff) return ESTEASTALPH;
6410   /* south and south east asia */
6411   if((c >= 0x0900 && c <= 0x109f) || (c >= 0x1700 && c <= 0x1cff)) return ESTEASTALPH;
6412   /* hiragana */
6413   if(c >= 0x3040 && c <= 0x309f) return ESTHIRAGANA;
6414   /* katakana */
6415   if(c >= 0x30a0 && c <= 0x30ff) return ESTKATAKANA;
6416   /* hangul */
6417   if((c >= 0x1100 && c <= 0x11ff) || (c >= 0x3130 && c <= 0x318f) ||
6418      (c >= 0xac00 && c <= 0xd7af)) return ESTHANGUL;
6419   /* kanji */
6420   if(c >= 0x4e00 && c <= 0x9faf) return ESTKANJI;
6421   /* other cjk and surrogates */
6422   if((c >= 0x2e80 && c <= 0xdfff) || (c >= 0xf900 && c <= 0xfaff) ||
6423      (c >= 0xff00 && c <= 0xffef)) return ESTEASTALPH;
6424   /* asian presentation forms */
6425   if((c >= 0xfb50 && c <= 0xfdff) || (c >= 0xfe30 && c <= 0xfe4f) ||
6426      (c >= 0xfe70 && c <= 0xfeff)) return ESTEASTALPH;
6427   /* others */
6428   return ESTDELIMCHR;
6429 }
6430 
6431 
6432 /* Make a snippet of an arbitrary string.
6433    `word' specifies a list object of words to be highlight.
6434    `wwidth' specifies whole width of the result.
6435    `hwidth' specifies width of strings picked up from the beginning of the text.
6436    `awidth' specifies width of strings picked up around each highlighted word.
6437    The return value is a snippet string of the string. */
est_make_snippet(const char * str,int len,const CBLIST * words,int wwidth,int hwidth,int awidth)6438 static char *est_make_snippet(const char *str, int len, const CBLIST *words,
6439                               int wwidth, int hwidth, int awidth){
6440   CBDATUM *res;
6441   CBMAP *counts;
6442   CBLIST *rwords;
6443   const char *word, *cval;
6444   const unsigned char *rword;
6445   unsigned char *rtext, *ctext;
6446   int i, j, k, bi, size, wsiz, rwsiz, mywidth, awsiz, csiz;
6447   assert(str && len >= 0 && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
6448   CB_DATUMOPEN(res);
6449   CB_LISTOPEN(rwords);
6450   for(i = 0; i < CB_LISTNUM(words); i++){
6451     word = CB_LISTVAL2(words, i, wsiz);
6452     if(wsiz < 1 || !strcmp(word, ESTOPUVSET)) continue;
6453     rtext = (unsigned char *)est_uconv_in(word, wsiz, &size);
6454     est_canonicalize_text(rtext, size, TRUE);
6455     CB_LISTPUSHBUF(rwords, (char *)rtext, size);
6456   }
6457   rtext = (unsigned char *)est_uconv_in(str, len, &size);
6458   ctext = (unsigned char *)cbmemdup((char *)rtext, size);
6459   est_canonicalize_text(ctext, size, FALSE);
6460   mywidth = hwidth;
6461   if(CB_LISTNUM(rwords) < 1) mywidth *= 3;
6462   if(mywidth > wwidth) mywidth = wwidth;
6463   for(i = 0; i < size && mywidth > 0; i += 2){
6464     mywidth -= est_char_category(rtext[i] * 0x100 + rtext[i+1]) == ESTEASTALPH ? 2 : 1;
6465   }
6466   awsiz = size - i;
6467   if(awsiz > ESTWORDMAXLEN) awsiz = ESTWORDMAXLEN;
6468   est_snippet_add_text(rtext, ctext, i, awsiz, res, rwords);
6469   wwidth -= hwidth;
6470   bi = i + 2;
6471   CB_DATUMCAT(res, "\n", 1);
6472   if(awidth > 0){
6473     counts = cbmapopenex(ESTMINIBNUM);
6474     for(i = bi; i < size && wwidth >= 0; i += 2){
6475       for(j = 0; j < CB_LISTNUM(rwords); j++){
6476         rword = (unsigned char *)CB_LISTVAL2(rwords, j, rwsiz);
6477         if(est_str_fwmatch_wide(ctext + i, size - i, rword, rwsiz) > 0 &&
6478            (!(cval = cbmapget(counts, (char *)rword, rwsiz, &csiz)) ||
6479             csiz < (wwidth > awidth * 1.2 ? 2 : 1))){
6480           cbmapputcat(counts, (char *)rword, rwsiz, "*", 1);
6481           if(cbmaprnum(counts) >= CB_LISTNUM(rwords)){
6482             cbmapclose(counts);
6483             counts = cbmapopenex(ESTMINIBNUM);
6484           }
6485           mywidth = awidth / 2 + 1;
6486           for(k = i - 2; k >= bi && mywidth >= 0; k -= 2){
6487             mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
6488           }
6489           bi = k;
6490           mywidth = awidth / 2 + 1;
6491           for(k = i + rwsiz + 2; k < size && mywidth >= 0; k += 2){
6492             mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
6493           }
6494           if(k > size) k = size;
6495           est_snippet_add_text(rtext + bi, ctext + bi, k - bi, 0, res, rwords);
6496           wwidth -= awidth + rwsiz / 2;
6497           bi = k + 2;
6498           i = bi - 2;
6499           CB_DATUMCAT(res, "\n", 1);
6500           break;
6501         }
6502       }
6503     }
6504     cbmapclose(counts);
6505   }
6506   free(ctext);
6507   free(rtext);
6508   CB_LISTCLOSE(rwords);
6509   return cbdatumtomalloc(res, NULL);
6510 }
6511 
6512 
6513 /* Check whether a string is compsed of CJK characters only.
6514    `str' specifies a string of UTF-8.
6515    The return value is whether the string is compsed of CJK characters only. */
est_check_cjk_only(const char * str)6516 static int est_check_cjk_only(const char *str){
6517   const unsigned char *rp;
6518   int size;
6519   rp = (unsigned char *)str;
6520   size = strlen(str);
6521   while(rp < (unsigned char *)str + size){
6522     if(*rp < 0x7f){
6523       return FALSE;
6524     } else if(*rp < 0xdf){
6525       return FALSE;
6526     } else if(*rp < 0xf0){
6527       if(rp >= (unsigned char *)str + size - 2) break;
6528       rp += 3;
6529     } else if(*rp < 0xf8){
6530       if(rp >= (unsigned char *)str + size - 3) break;
6531       rp += 4;
6532     } else if(*rp < 0xfb){
6533       if(rp >= (unsigned char *)str + size - 4) break;
6534       rp += 5;
6535     } else if(*rp < 0xfd){
6536       if(rp >= (unsigned char *)str + size - 5) break;
6537       rp += 6;
6538     } else {
6539       break;
6540     }
6541   }
6542   return TRUE;
6543 }
6544 
6545 
6546 /* Convert a simplified phrase into complete form.
6547    `sphrase' specifies a simplified phrase.
6548    The return value is the complete form of the phrase. */
est_phrase_from_simple(const char * sphrase)6549 static char *est_phrase_from_simple(const char *sphrase){
6550   CBDATUM *datum;
6551   const char *oper, *rp, *pv;
6552   unsigned char *utext;
6553   char *rtext;
6554   int size, quote, lw;
6555   assert(sphrase);
6556   CB_DATUMOPEN(datum);
6557   utext = (unsigned char *)est_uconv_in(sphrase, strlen(sphrase), &size);
6558   est_normalize_text(utext, size, &size);
6559   est_canonicalize_text(utext, size, FALSE);
6560   rtext = est_uconv_out((char *)utext, size, NULL);
6561   cbstrsqzspc(rtext);
6562   quote = FALSE;
6563   oper = NULL;
6564   lw = FALSE;
6565   for(rp = rtext; *rp != '\0'; rp++){
6566     if(*rp == '"'){
6567       if(oper){
6568         CB_DATUMCAT(datum, oper, strlen(oper));
6569         oper = NULL;
6570       }
6571       quote = !quote;
6572       continue;
6573     }
6574     if(quote){
6575       CB_DATUMCAT(datum, rp, 1);
6576       continue;
6577     }
6578     switch(*rp){
6579     case ' ':
6580       if(!oper) oper = " AND ";
6581       lw = FALSE;
6582       break;
6583     case '&':
6584       oper = " AND ";
6585       lw = FALSE;
6586       break;
6587     case '|':
6588       oper = " OR ";
6589       lw = FALSE;
6590       break;
6591     case '!':
6592       oper = " ANDNOT ";
6593       lw = FALSE;
6594       break;
6595     default:
6596       if(oper){
6597         CB_DATUMCAT(datum, oper, strlen(oper));
6598         oper = NULL;
6599       }
6600       if(!lw){
6601         pv = rp;
6602         while(*pv != '\0' && *pv != ' '){
6603           pv++;
6604         }
6605         if(pv > rp + 1 && pv[-1] == '*'){
6606           if(rp[0] == '*'){
6607             CB_DATUMCAT(datum, ESTOPWCRX " ",  strlen(ESTOPWCRX) + 1);
6608           } else {
6609             CB_DATUMCAT(datum, ESTOPWCBW " ",  strlen(ESTOPWCBW) + 1);
6610           }
6611         } else if(pv > rp + 1 && rp[0] == '*'){
6612           if(pv[-1] == '*'){
6613             CB_DATUMCAT(datum, ESTOPWCRX " ",  strlen(ESTOPWCRX) + 1);
6614           } else {
6615             CB_DATUMCAT(datum, ESTOPWCEW " ",  strlen(ESTOPWCEW) + 1);
6616           }
6617         }
6618       }
6619       if(*rp != '*' || (lw && rp[1] != '\0' && rp[1] != ' ')) CB_DATUMCAT(datum, rp, 1);
6620       lw = TRUE;
6621     }
6622   }
6623   free(rtext);
6624   free(utext);
6625   return cbdatumtomalloc(datum, NULL);
6626 }
6627 
6628 
6629 /* Convert a rough phrase into complete form.
6630    `rphrase' specifies a simplified phrase.
6631    The return value is the complete form of the phrase. */
est_phrase_from_rough(const char * rphrase)6632 static char *est_phrase_from_rough(const char *rphrase){
6633   CBDATUM *datum;
6634   const char *oper, *rp;
6635   unsigned char *utext;
6636   char *rtext;
6637   int size, quote, lw;
6638   assert(rphrase);
6639   CB_DATUMOPEN(datum);
6640   utext = (unsigned char *)est_uconv_in(rphrase, strlen(rphrase), &size);
6641   est_normalize_text(utext, size, &size);
6642   est_canonicalize_text(utext, size, FALSE);
6643   rtext = est_uconv_out((char *)utext, size, NULL);
6644   cbstrsqzspc(rtext);
6645   quote = FALSE;
6646   oper = NULL;
6647   lw = FALSE;
6648   for(rp = rtext; *rp != '\0'; rp++){
6649     if(*rp == '"'){
6650       if(oper){
6651         CB_DATUMCAT(datum, oper, strlen(oper));
6652         oper = NULL;
6653       }
6654       quote = !quote;
6655       continue;
6656     }
6657     if(quote){
6658       CB_DATUMCAT(datum, rp, 1);
6659       continue;
6660     }
6661     switch(*rp){
6662     case ' ':
6663       if(!oper) oper = " AND ";
6664       lw = FALSE;
6665       break;
6666     case '&':
6667       oper = " AND ";
6668       lw = FALSE;
6669       break;
6670     case '|':
6671       oper = " OR ";
6672       lw = FALSE;
6673       break;
6674     case '-':
6675       if(lw){
6676         CB_DATUMCAT(datum, rp, 1);
6677       } else {
6678         oper = " ANDNOT ";
6679       }
6680       break;
6681     default:
6682       if(oper){
6683         CB_DATUMCAT(datum, oper, strlen(oper));
6684         oper = NULL;
6685       }
6686       CB_DATUMCAT(datum, rp, 1);
6687       lw = TRUE;
6688     }
6689   }
6690   free(rtext);
6691   free(utext);
6692   return cbdatumtomalloc(datum, NULL);
6693 }
6694 
6695 
6696 /* Convert a union phrase into complete form.
6697    `uphrase' specifies a simplified phrase.
6698    The return value is the complete form of the phrase. */
est_phrase_from_union(const char * uphrase)6699 static char *est_phrase_from_union(const char *uphrase){
6700   CBDATUM *datum;
6701   CBLIST *terms;
6702   const char *term;
6703   unsigned char *utext;
6704   char *rtext;
6705   int i, size;
6706   assert(uphrase);
6707   CB_DATUMOPEN(datum);
6708   utext = (unsigned char *)est_uconv_in(uphrase, strlen(uphrase), &size);
6709   est_normalize_text(utext, size, &size);
6710   est_canonicalize_text(utext, size, FALSE);
6711   rtext = est_uconv_out((char *)utext, size, NULL);
6712   cbstrsqzspc(rtext);
6713   terms = cbsplit(rtext, -1, " ");
6714   for(i = 0; i < CB_LISTNUM(terms); i++){
6715     term = CB_LISTVAL2(terms, i, size);
6716     if(size < 1) continue;
6717     if(CB_DATUMSIZE(datum) > 0) CB_DATUMCAT(datum, " OR ", 4);
6718     CB_DATUMCAT(datum, term, size);
6719   }
6720   CB_LISTCLOSE(terms);
6721   free(rtext);
6722   free(utext);
6723   return cbdatumtomalloc(datum, NULL);
6724 }
6725 
6726 
6727 /* Convert a intersection phrase into complete form.
6728    `iphrase' specifies a simplified phrase.
6729    The return value is the complete form of the phrase. */
est_phrase_from_isect(const char * iphrase)6730 static char *est_phrase_from_isect(const char *iphrase){
6731   CBDATUM *datum;
6732   CBLIST *terms;
6733   const char *term;
6734   unsigned char *utext;
6735   char *rtext;
6736   int i, size;
6737   assert(iphrase);
6738   CB_DATUMOPEN(datum);
6739   utext = (unsigned char *)est_uconv_in(iphrase, strlen(iphrase), &size);
6740   est_normalize_text(utext, size, &size);
6741   est_canonicalize_text(utext, size, FALSE);
6742   rtext = est_uconv_out((char *)utext, size, NULL);
6743   cbstrsqzspc(rtext);
6744   terms = cbsplit(rtext, -1, " ");
6745   for(i = 0; i < CB_LISTNUM(terms); i++){
6746     term = CB_LISTVAL2(terms, i, size);
6747     if(size < 1) continue;
6748     if(CB_DATUMSIZE(datum) > 0) CB_DATUMCAT(datum, " AND ", 5);
6749     CB_DATUMCAT(datum, term, size);
6750   }
6751   CB_LISTCLOSE(terms);
6752   free(rtext);
6753   free(utext);
6754   return cbdatumtomalloc(datum, NULL);
6755 }
6756 
6757 
6758 /* Add a string to a snippet.
6759    `rtext' specifies a raw text.
6760    `ctext' specifies a canonicalized text.
6761    `size' specifies the size of the raw text and the canonicalized text.
6762    `awsiz' specifies the size of allowance for matching words.
6763    `res' specifies a datum object for the result.
6764    `rwords' specifies a list object of raw words. */
est_snippet_add_text(const unsigned char * rtext,const unsigned char * ctext,int size,int awsiz,CBDATUM * res,const CBLIST * rwords)6765 static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
6766                                  int size, int awsiz, CBDATUM *res, const CBLIST *rwords){
6767   const unsigned char *rword;
6768   char *orig;
6769   int i, j, bi, rwsiz, step, osiz;
6770   bi = 0;
6771   for(i = 0; i < size; i += 2){
6772     for(j = 0; j < CB_LISTNUM(rwords); j++){
6773       rword = (unsigned char *)CB_LISTVAL2(rwords, j, rwsiz);
6774       if((step = est_str_fwmatch_wide(ctext + i, size + awsiz - i, rword, rwsiz)) > 0){
6775         if(i - bi > 0){
6776           orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
6777           CB_DATUMCAT(res, orig, osiz);
6778           CB_DATUMCAT(res, "\n", 1);
6779           free(orig);
6780         }
6781         orig = est_uconv_out((char *)rtext + i, step, &osiz);
6782         CB_DATUMCAT(res, orig, osiz);
6783         free(orig);
6784         CB_DATUMCAT(res, "\t", 1);
6785         orig = est_uconv_out((char *)rword, rwsiz, &osiz);
6786         CB_DATUMCAT(res, orig, osiz);
6787         free(orig);
6788         CB_DATUMCAT(res, "\n", 1);
6789         bi = i + step;
6790         i = bi - 2;
6791         break;
6792       }
6793     }
6794   }
6795   if(i - bi > 0){
6796     orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
6797     CB_DATUMCAT(res, orig, osiz);
6798     CB_DATUMCAT(res, "\n", 1);
6799     free(orig);
6800   }
6801 }
6802 
6803 
/* Check whether a string begins with a key.
   `str' specifies a target string whose encoding is UTF-16BE.
   `size' specifies the size of the target string.
   `key' specifies a key string whose encoding is UTF-16BE.
   `ksiz' specifies the size of the key string.
   The return value is the size in bytes of the corresponding region of the target string, or 0
   if the target string does not begin with the key.  Characters whose code is 0x20 or less are
   skipped in both strings, except that a target string beginning with such a character never
   matches. */
static int est_str_fwmatch_wide(const unsigned char *str, int size,
                                const unsigned char *key, int ksiz){
  int spos, kpos;
  assert(str && size >= 0 && key && ksiz >= 0);
  if(size < 2 || ksiz < 2) return 0;
  if(str[0] == 0x0 && str[1] <= 0x20) return 0;
  spos = 0;
  for(kpos = 0; kpos < ksiz; ){
    if(spos >= size) return 0;
    if(str[spos] == 0x0 && str[spos+1] <= 0x20){
      /* skip a space character of the target */
      spos += 2;
    } else if(key[kpos] == 0x0 && key[kpos+1] <= 0x20){
      /* skip a space character of the key */
      kpos += 2;
    } else if(str[spos] == key[kpos] && str[spos+1] == key[kpos+1]){
      spos += 2;
      kpos += 2;
    } else {
      return 0;
    }
  }
  return spos;
}
6835 
6836 
/* Find the first occurrence of a substring ignoring space characters.
   `haystack' specifies a target string.
   `needle' specifies a substring.
   The return value is the pointer to the first occurrence, or `NULL' if the substring does not
   occur.  If the substring has no character except for space characters, the target string
   itself is returned. */
static char *est_strstr_sparse(const char *haystack, const char *needle){
  const char *cand, *hcur, *ncur;
  assert(haystack && needle);
  /* skip the leading space characters of the substring */
  while(*needle > '\0' && *needle <= ' '){
    needle++;
  }
  if(*needle == '\0') return (char *)haystack;
  /* try every position where the first character of the substring occurs */
  for(cand = strchr(haystack, *needle); cand != NULL; cand = strchr(cand + 1, *needle)){
    hcur = cand;
    ncur = needle;
    for(;;){
      while(*hcur > '\0' && *hcur <= ' '){
        hcur++;
      }
      while(*ncur > '\0' && *ncur <= ' '){
        ncur++;
      }
      if(*ncur == '\0') return (char *)cand;
      if(*hcur == '\0' || *hcur != *ncur) break;
      hcur++;
      ncur++;
    }
  }
  return NULL;
}
6867 
6868 
6869 /* Get the last ID number in an index record.
6870    `vbuf' specifies the pointer to the value of a record.
6871    `vsiz' specifies the size of the value.
6872    `smode' specifies a mode of score type.
6873    The return value is the last ID number in a record. */
est_idx_rec_last_id(const char * vbuf,int vsiz,int smode)6874 static int est_idx_rec_last_id(const char *vbuf, int vsiz, int smode){
6875   const char *rp, *ep, *sp;
6876   int cid, vnum, vstep;
6877   assert(vbuf && vsiz >= 0);
6878   cid = 0;
6879   rp = vbuf;
6880   ep = vbuf + vsiz;
6881   while(rp < ep){
6882     EST_READ_VNUMBUF(rp, vnum, vstep);
6883     cid += vnum + 1;
6884     rp += vstep;
6885     sp = rp;
6886     switch(smode){
6887     case ESTDFSCVOID:
6888       break;
6889     default:
6890       rp++;
6891       break;
6892     case ESTDFSCINT:
6893     case ESTDFSCASIS:
6894       rp += sizeof(int);
6895       break;
6896     }
6897     while(*rp != 0x00){
6898       rp += 2;
6899     }
6900     rp++;
6901   }
6902   return cid;
6903 }
6904 
6905 
6906 /* Encode a raw index record into a gap form.
6907    `datum' specifies a datum to store the result.
6908    `vbuf' specifies the pointer to the value of a raw index record.
6909    `vsiz' specifies the size of the value of the record.
6910    `lid' specifies the last ID number in the existing record.
6911    `smode' specifies a mode of score type. */
est_encode_idx_rec(CBDATUM * datum,const char * vbuf,int vsiz,int lid,int smode)6912 static void est_encode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int lid, int smode){
6913   const char *rp, *ep, *sp;
6914   char nbuf[ESTNUMBUFSIZ];
6915   int cid, vstep;
6916   assert(datum && vbuf && vsiz >= 0);
6917   rp = vbuf;
6918   ep = vbuf + vsiz;
6919   while(rp < ep){
6920     EST_READ_VNUMBUF(rp, cid, vstep);
6921     rp += vstep;
6922     sp = rp;
6923     switch(smode){
6924     case ESTDFSCVOID:
6925       break;
6926     default:
6927       rp++;
6928       break;
6929     case ESTDFSCINT:
6930     case ESTDFSCASIS:
6931       rp += sizeof(int);
6932       break;
6933     }
6934     while(*rp != 0x00){
6935       rp += 2;
6936     }
6937     rp++;
6938     EST_SET_VNUMBUF(vstep, nbuf, cid - lid - 1);
6939     CB_DATUMCAT(datum, nbuf, vstep);
6940     CB_DATUMCAT(datum, sp, rp - sp);
6941     lid = cid;
6942   }
6943 }
6944 
6945 
6946 /* Decode a gap index record into a raw form.
6947    `datum' specifies a datum to store the result.
6948    `vbuf' specifies the pointer to the value of a gap index record.
6949    `vsiz' specifies the size of the value of the record.
6950    `smode' specifies a mode of score type. */
est_decode_idx_rec(CBDATUM * datum,const char * vbuf,int vsiz,int smode)6951 static void est_decode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int smode){
6952   const char *rp, *ep, *sp;
6953   char nbuf[ESTNUMBUFSIZ];
6954   int cid, vnum, vstep;
6955   assert(datum && vbuf && vsiz >= 0);
6956   rp = vbuf;
6957   ep = vbuf + vsiz;
6958   cid = 0;
6959   while(rp < ep){
6960     EST_READ_VNUMBUF(rp, vnum, vstep);
6961     cid += vnum + 1;
6962     rp += vstep;
6963     sp = rp;
6964     switch(smode){
6965     case ESTDFSCVOID:
6966       break;
6967     default:
6968       rp++;
6969       break;
6970     case ESTDFSCINT:
6971     case ESTDFSCASIS:
6972       rp += sizeof(int);
6973       break;
6974     }
6975     while(*rp != 0x00){
6976       rp += 2;
6977     }
6978     rp++;
6979     EST_SET_VNUMBUF(vstep, nbuf, cid);
6980     CB_DATUMCAT(datum, nbuf, vstep);
6981     CB_DATUMCAT(datum, sp, rp - sp);
6982   }
6983 }
6984 
6985 
6986 /* Open the inverted index.
6987    `name' specifies the name of a directory.
6988    `omode' specifies an open mode of Villa.
6989    `dnum' specifies the number of database files.
6990    The return value is a database object of the database. */
est_idx_open(const char * name,int omode,int dnum)6991 static ESTIDX *est_idx_open(const char *name, int omode, int dnum){
6992   ESTIDX *idx;
6993   CBLIST *files;
6994   const char *file;
6995   char path[ESTPATHBUFSIZ];
6996   int i, crdnum;
6997   assert(name && dnum > 0);
6998   if(dnum > ESTIDXDMAX) dnum = ESTIDXDMAX;
6999   CB_MALLOC(idx, sizeof(ESTIDX));
7000   if((omode & VL_OCREAT) && !est_mkdir(name) && errno != EEXIST) return NULL;
7001   if((omode & VL_OTRUNC) && (files = cbdirlist(name)) != NULL){
7002     for(i = 0; i < CB_LISTNUM(files); i++){
7003       file = CB_LISTVAL(files, i);
7004       if(!strcmp(file, ESTCDIRSTR) || !strcmp(file, ESTPDIRSTR)) continue;
7005       sprintf(path, "%s%c%s", name, ESTPATHCHR, file);
7006       if(unlink(path) == -1) est_rmdir_rec(path);
7007     }
7008     CB_LISTCLOSE(files);
7009   }
7010   for(i = 0; i < dnum; i++){
7011     sprintf(path, "%s%c%04d", name, ESTPATHCHR, i + 1);
7012     crdnum = vlcrdnum;
7013     vlcrdnum = ESTVLCRDNUM;
7014     if(!(idx->dbs[i] = vlopen(path, omode, VL_CMPLEX))){
7015       while(--i >= 0){
7016         vlclose(idx->dbs[i]);
7017       }
7018       vlcrdnum = crdnum;
7019       return NULL;
7020     }
7021     vlcrdnum = crdnum;
7022   }
7023   idx->name = cbmemdup(name, -1);
7024   idx->omode = omode;
7025   idx->dnum = dnum;
7026   idx->cdb = idx->dbs[dnum-1];
7027   return idx;
7028 }
7029 
7030 
7031 /* Close the inverted index.
7032    `idx' specifies an object of the inverted index.
7033    The return value is true if success, else it is false. */
est_idx_close(ESTIDX * idx)7034 static int est_idx_close(ESTIDX *idx){
7035   int i, err;
7036   assert(idx);
7037   err = FALSE;
7038   for(i = 0; i < idx->dnum; i++){
7039     if(!vlclose(idx->dbs[i])) err = TRUE;
7040   }
7041   free(idx->name);
7042   free(idx);
7043   return err ? FALSE : TRUE;
7044 }
7045 
7046 
7047 /* Set the tuning parameters of the inverted index.
7048    `idx' specifies an object of the inverted index.
7049    `lrecmax' specifies the max number of records in a leaf node of B+ tree.
7050    `nidxmax' specifies the max number of indexes in a non-leaf node of B+ tree.
7051    `lcnum' specifies the max number of caching leaf nodes.
7052    `ncnum' specifies the max number of caching non-leaf nodes.
7053    `fbpsiz' specifies the size of the free block pool.
7054    Other parameters are same with `vlsettuning' of Villa. */
est_idx_set_tuning(ESTIDX * idx,int lrecmax,int nidxmax,int lcnum,int ncnum,int fbpsiz)7055 static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum,
7056                                int fbpsiz){
7057   int i;
7058   assert(idx);
7059   for(i = 0; i < idx->dnum; i++){
7060     vlsettuning(idx->dbs[i], lrecmax, nidxmax, lcnum, ncnum);
7061     if(fbpsiz > 0) vlsetfbpsiz(idx->dbs[i], fbpsiz);
7062   }
7063 }
7064 
7065 
7066 /* Increment the inverted index.
7067    `idx' specifies an object of the inverted index. */
est_idx_increment(ESTIDX * idx)7068 static void est_idx_increment(ESTIDX *idx){
7069   char path[ESTPATHBUFSIZ];
7070   int i, min, size, crdnum;
7071   assert(idx);
7072   min = INT_MAX;
7073   for(i = 0; i < idx->dnum; i++){
7074     size = vlfsiz(idx->cdb);
7075     if(size < min) min = size;
7076   }
7077   if(idx->dnum >= ESTIDXDMAX || (idx->dnum >= ESTIDXDSTD && min < ESTIDXDBMAX)){
7078     est_idx_set_current(idx);
7079     return;
7080   }
7081   sprintf(path, "%s%c%04d", idx->name, ESTPATHCHR, idx->dnum + 1);
7082   crdnum = vlcrdnum;
7083   vlcrdnum = ESTVLCRDNUM;
7084   if((idx->dbs[idx->dnum] = vlopen(path, idx->omode | VL_OCREAT | VL_OTRUNC, VL_CMPLEX)) != NULL){
7085     idx->cdb = idx->dbs[idx->dnum];
7086     idx->dnum++;
7087   }
7088   vlcrdnum = crdnum;
7089 }
7090 
7091 
7092 /* Get the number of files of the inverted index.
7093    The return the number of files of the inverted index. */
est_idx_dnum(ESTIDX * idx)7094 static int est_idx_dnum(ESTIDX *idx){
7095   assert(idx);
7096   return idx->dnum;
7097 }
7098 
7099 
7100 /* Add a record to the inverted index.
7101    `idx' specifies an object of the inverted index.
7102    `word' specifies a word.
7103    `vbuf' specifies the pointer to the value of a record.
7104    `vsiz' specifies the size of the value.
7105    `smode' specifies a mode of score type.
7106    The return value is true if success, else it is false. */
est_idx_add(ESTIDX * idx,const char * word,int wsiz,const char * vbuf,int vsiz,int smode)7107 static int est_idx_add(ESTIDX *idx, const char *word, int wsiz,
7108                        const char *vbuf, int vsiz, int smode){
7109   CBDATUM *datum;
7110   const char *obuf;
7111   int rv, lid, osiz;
7112   assert(idx && word && wsiz >= 0 && vbuf && vsiz >= 0);
7113   CB_DATUMOPEN(datum);
7114   lid = 0;
7115   if((obuf = vlgetcache(idx->cdb, word, wsiz, &osiz)) != NULL)
7116     lid = est_idx_rec_last_id(obuf, osiz, smode);
7117   est_encode_idx_rec(datum, vbuf, vsiz, lid, smode);
7118   rv = vlput(idx->cdb, word, wsiz, CB_DATUMPTR(datum), CB_DATUMSIZE(datum), VL_DCAT);
7119   CB_DATUMCLOSE(datum);
7120   return rv;
7121 }
7122 
7123 
7124 /* Store a record to a file of the inverted index.
7125    `idx' specifies an object of the inverted index.
7126    `inum' specifies the index of a file of the inverted index.
7127    `word' specifies a word.
7128    `vbuf' specifies the pointer to the value of a record.
7129    `vsiz' specifies the size of the value.
7130    The return value is true if success, else it is false. */
est_idx_put_one(ESTIDX * idx,int inum,const char * word,int wsiz,const char * vbuf,int vsiz)7131 static int est_idx_put_one(ESTIDX *idx, int inum, const char *word, int wsiz,
7132                            const char *vbuf, int vsiz){
7133   assert(idx && inum >= 0 && word && wsiz >= 0 && vbuf && vsiz >= 0);
7134   return vsiz > 0 ? vlput(idx->dbs[inum], word, wsiz, vbuf, vsiz, VL_DOVER) :
7135     (vlout(idx->dbs[inum], word, wsiz) || dpecode == DP_ENOITEM);
7136 }
7137 
7138 
7139 /* Remove a record from the inverted index.
7140    `idx' specifies an object of the inverted index.
7141    `word' specifies a word.
7142    `wsiz' specifies the size of the word.
7143    The return value is true if success, else it is false.  Even if no item correspongs, it is
7144    success. */
est_idx_out(ESTIDX * idx,const char * word,int wsiz)7145 static int est_idx_out(ESTIDX *idx, const char *word, int wsiz){
7146   int i, err;
7147   assert(idx && word && wsiz >= 0);
7148   err = FALSE;
7149   for(i = 0; i < idx->dnum; i++){
7150     if(!vlout(idx->dbs[i], word, wsiz) && dpecode != DP_ENOITEM) err = TRUE;
7151   }
7152   return err ? FALSE : TRUE;
7153 }
7154 
7155 
7156 /* Get a record from the inverted index.
7157    `idx' specifies an object of the inverted index.
7158    `word' specifies a word.
7159    `wsiz' specifies the size of the word.
7160    `sp' specifies the pointer to a variable to which the size of the region of the return value
7161    is assigned.
7162    `smode' specifies a mode of score type.
7163    The return value is the pointer to the region of the value of the corresponding record.
7164    if no item correspongs, empty region is returned. */
est_idx_scan(ESTIDX * idx,const char * word,int wsiz,int * sp,int smode)7165 static char *est_idx_scan(ESTIDX *idx, const char *word, int wsiz, int *sp, int smode){
7166   CBDATUM *datum;
7167   const char *vbuf;
7168   int i, vsiz;
7169   assert(idx && word && wsiz >= 0 && sp);
7170   CB_DATUMOPEN(datum);
7171   for(i = 0; i < idx->dnum; i++){
7172     if(!(vbuf = vlgetcache(idx->dbs[i], word, wsiz, &vsiz))) continue;
7173     est_decode_idx_rec(datum, vbuf, vsiz, smode);
7174   }
7175   return cbdatumtomalloc(datum, sp);
7176 }
7177 
7178 
7179 /* Get a record from a file of the inverted index.
7180    `idx' specifies an object of the inverted index.
7181    `inum' specifies the index of a file of the inverted index.
7182    `word' specifies a word.
7183    `wsiz' specifies the size of the word.
7184    `sp' specifies the pointer to a variable to which the size of the region of the return value
7185    is assigned.
7186    The return value is the pointer to the region of the value of the corresponding record.
7187    if no item correspongs, `NULL' is returned.  Because the region of the return value is
7188    volatile, it sould be copied immediately. */
est_idx_get_one(ESTIDX * idx,int inum,const char * word,int wsiz,int * sp)7189 static const char *est_idx_get_one(ESTIDX *idx, int inum, const char *word, int wsiz, int *sp){
7190   assert(idx && inum >= 0 && word && wsiz >= 0 && sp);
7191   return vlgetcache(idx->dbs[inum], word, wsiz, sp);
7192 }
7193 
7194 
7195 /* Get the size of the value of a record in the inverted index.
7196    `idx' specifies an object of the inverted index.
7197    `word' specifies a word.
7198    `wsiz' specifies the size of the word.
7199    The return value is the size of the value of the corresponding record.
7200    if no item correspongs, 0 is returned. */
est_idx_vsiz(ESTIDX * idx,const char * word,int wsiz)7201 static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz){
7202   int i, sum, vsiz;
7203   assert(idx && word && wsiz >= 0);
7204   sum = 0;
7205   for(i = 0; i < idx->dnum; i++){
7206     if((vsiz = vlvsiz(idx->dbs[i], word, wsiz)) < 1) continue;
7207     sum += vsiz;
7208   }
7209   return sum;
7210 }
7211 
7212 
7213 /* Get the number of division of the inverted index.
7214    `idx' specifies an object of the inverted index.
7215    The return value is the number of division of the inverted index. */
est_idx_num(ESTIDX * idx)7216 static int est_idx_num(ESTIDX *idx){
7217   assert(idx);
7218   return idx->dnum;
7219 }
7220 
7221 
7222 /* Get the size of the inverted index.
7223    `idx' specifies an object of the inverted index.
7224    The return value is the size of the inverted index. */
est_idx_size(ESTIDX * idx)7225 static double est_idx_size(ESTIDX *idx){
7226   int i;
7227   double size;
7228   assert(idx);
7229   size = 0;
7230   for(i = 0; i < idx->dnum; i++){
7231     size += vlfsiz(idx->dbs[i]);
7232   }
7233   return size;
7234 }
7235 
7236 
7237 /* Get the size of the current file of the inverted index.
7238    `idx' specifies an object of the inverted index.
7239    The return value is the size of the current file of the inverted index. */
est_idx_size_current(ESTIDX * idx)7240 static int est_idx_size_current(ESTIDX *idx){
7241   assert(idx);
7242   return vlfsiz(idx->cdb);
7243 }
7244 
7245 
7246 /* Synchronize updating contents of the inverted index on memory.
7247    `idx' specifies an object of the inverted index.
7248    The return value is true if success, else it is false. */
est_idx_memflush(ESTIDX * idx)7249 static int est_idx_memflush(ESTIDX *idx){
7250   int i;
7251   assert(idx);
7252   for(i = 0; i < idx->dnum; i++){
7253     if(!vlmemflush(idx->dbs[i])) return FALSE;
7254   }
7255   return TRUE;
7256 }
7257 
7258 
7259 /* Syncronize the inverted index.
7260    `idx' specifies an object of the inverted index.
7261    The return value is true if success, else it is false. */
est_idx_sync(ESTIDX * idx)7262 static int est_idx_sync(ESTIDX *idx){
7263   int i;
7264   assert(idx);
7265   for(i = 0; i < idx->dnum; i++){
7266     if(!vlsync(idx->dbs[i])) return FALSE;
7267   }
7268   return TRUE;
7269 }
7270 
7271 
7272 /* Optimize the inverted index.
7273    `idx' specifies an object of the inverted index.
7274    The return value is true if success, else it is false. */
est_idx_optimize(ESTIDX * idx)7275 static int est_idx_optimize(ESTIDX *idx){
7276   int i;
7277   assert(idx);
7278   for(i = 0; i < idx->dnum; i++){
7279     if(!vloptimize(idx->dbs[i])) return FALSE;
7280   }
7281   return TRUE;
7282 }
7283 
7284 
7285 /* Set the current database to the smallest one in the inverted index.
7286    `idx' specifies an object of the inverted index. */
est_idx_set_current(ESTIDX * idx)7287 static void est_idx_set_current(ESTIDX *idx){
7288   int i, size, min;
7289   assert(idx);
7290   min = vlfsiz(idx->cdb);
7291   for(i = 0; i < idx->dnum; i++){
7292     if((size = vlfsiz(idx->dbs[i])) < min){
7293       idx->cdb = idx->dbs[i];
7294       min = size;
7295     }
7296   }
7297 }
7298 
7299 
7300 /* Store a record related to the ID number of a document.
7301    `curia' specifies a database object.
7302    `zmode' specifies a compression mode.
7303    `id' specifies the ID number of a document.
7304    `vbuf' specifies the pointer to the value of a record.
7305    `vsiz' specifies the size of the value.
7306    The return value is true if success, else it is false. */
est_crput(CURIA * curia,int zmode,int id,const char * vbuf,int vsiz,int dmode)7307 static int est_crput(CURIA *curia, int zmode, int id, const char *vbuf, int vsiz, int dmode){
7308   char *zbuf;
7309   int zsiz;
7310   assert(curia && id > 0 && vbuf && vsiz >= 0);
7311   switch(zmode){
7312   case ESTDFZLIB:
7313     if(!(zbuf = est_deflate(vbuf, vsiz, &zsiz, -1))){
7314       dpecode = ESTEMISC;
7315       return FALSE;
7316     }
7317     if(!crput(curia, (char *)&id, sizeof(int), zbuf, zsiz, dmode)){
7318       free(zbuf);
7319       return FALSE;
7320     }
7321     free(zbuf);
7322     break;
7323   case ESTDFLZO:
7324     if(!(zbuf = est_lzoencode(vbuf, vsiz, &zsiz))){
7325       dpecode = ESTEMISC;
7326       return FALSE;
7327     }
7328     if(!crput(curia, (char *)&id, sizeof(int), zbuf, zsiz, dmode)){
7329       free(zbuf);
7330       return FALSE;
7331     }
7332     free(zbuf);
7333     break;
7334   case ESTDFBZIP:
7335     if(!(zbuf = est_bzencode(vbuf, vsiz, &zsiz))){
7336       dpecode = ESTEMISC;
7337       return FALSE;
7338     }
7339     if(!crput(curia, (char *)&id, sizeof(int), zbuf, zsiz, dmode)){
7340       free(zbuf);
7341       return FALSE;
7342     }
7343     free(zbuf);
7344     break;
7345   default:
7346     if(!crput(curia, (char *)&id, sizeof(int), vbuf, vsiz, dmode)) return FALSE;
7347     break;
7348   }
7349   return TRUE;
7350 }
7351 
7352 
7353 /* Remove a record related to the ID number of a document.
7354    `curia' specifies a database object.
7355    `id' specifies the ID number of a document.
7356    The return value is true if success, else it is false. */
est_crout(CURIA * curia,int id)7357 static int est_crout(CURIA *curia, int id){
7358   assert(curia && id > 0);
7359   return crout(curia, (char *)&id, sizeof(int));
7360 }
7361 
7362 
7363 /* Get a record related to the ID number of a document.
7364    `curia' specifies a database object.
7365    `zmode' specifies a compression mode.
7366    `id' specifies the ID number of a document.
7367    `sp' specifies the pointer to a variable to which the size of the region of the return value
7368    is assigned.
7369    The return value is the pointer to the region of the value of the corresponding record. */
est_crget(CURIA * curia,int zmode,int id,int * sp)7370 static char *est_crget(CURIA *curia, int zmode, int id, int *sp){
7371   char *zbuf, *vbuf;
7372   int zsiz;
7373   assert(curia && id > 0 && sp);
7374   switch(zmode){
7375   case ESTDFZLIB:
7376     if(!(zbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, &zsiz))) return NULL;
7377     if(!(vbuf = est_inflate(zbuf, zsiz, sp, -1))){
7378       free(zbuf);
7379       return NULL;
7380     }
7381     free(zbuf);
7382     break;
7383   case ESTDFLZO:
7384     if(!(zbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, &zsiz))) return NULL;
7385     if(!(vbuf = est_lzodecode(zbuf, zsiz, sp))){
7386       free(zbuf);
7387       return NULL;
7388     }
7389     free(zbuf);
7390     break;
7391   case ESTDFBZIP:
7392     if(!(zbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, &zsiz))) return NULL;
7393     if(!(vbuf = est_bzdecode(zbuf, zsiz, sp))){
7394       free(zbuf);
7395       return NULL;
7396     }
7397     free(zbuf);
7398     break;
7399   default:
7400     if(!(vbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, sp))) return NULL;
7401     break;
7402   }
7403   return vbuf;
7404 }
7405 
7406 
7407 /* Add an attribute of a document to a sequencial attribute index.
7408    `db' specifies a handle of a sequencial attribute index.
7409    `id' specifies the ID number of a document.
7410    `vbuf' specifies the pointer to the attribute value.
7411    `vsiz' specifies the size of the attribute value.
7412    The return value is true if success, else it is false. */
est_aidx_seq_put(DEPOT * db,int id,const char * vbuf,int vsiz)7413 static int est_aidx_seq_put(DEPOT *db, int id, const char *vbuf, int vsiz){
7414   int err;
7415   assert(db && id >= 0 && vbuf && vsiz >= 0);
7416   err = FALSE;
7417   if(!dpput(db, (char *)&id, sizeof(int), vbuf, vsiz, DP_DKEEP)) err = TRUE;
7418   return err ? FALSE : TRUE;
7419 }
7420 
7421 
7422 /* Remove an attribute of a document from a sequencial attribute index.
7423    `db' specifies a handle of a sequencial attribute index.
7424    `id' specifies the ID number of a document.
7425    The return value is true if success, else it is false. */
est_aidx_seq_out(DEPOT * db,int id)7426 static int est_aidx_seq_out(DEPOT *db, int id){
7427   int err;
7428   assert(db && id >= 0);
7429   err = FALSE;
7430   if(!dpout(db, (char *)&id, sizeof(int))) err = TRUE;
7431   return err ? FALSE : TRUE;
7432 }
7433 
7434 
7435 /* Retrieve the value of an attribute of a document in a sequencial attribute index.
7436    `db' specifies a handle of a sequencial attribute index.
7437    `id' specifies the ID number of a document.
7438    The return value is the value of the attribute or `NULL' if no attribute. */
est_aidx_seq_get(DEPOT * db,int id,int * sp)7439 static char *est_aidx_seq_get(DEPOT *db, int id, int *sp){
7440   assert(db && id >= 0 && sp);
7441   return dpget(db, (char *)&id, sizeof(int), 0, -1, sp);
7442 }
7443 
7444 
7445 /* Narrow scores of search candidates with a sequencial attribute index.
7446    `db' specifies a handle of a sequencial attribute index.
7447    `pdocs' specifies a list of pseudo documents.
7448    `cop' specifies the pointer to the operator.
7449    `sign' specifies the sign of operation.
7450    `oval' specifies the operation value.
7451    `osiz' specifies the size of the operation value
7452    `sval' specifies the operation value of small cases.
7453    `ssiz' specifies the size of the operation value of small cases.
7454    `regex' specifies the regular expressions.
7455    `onum' specifies the numeric value.
7456    `scores' specifies an array of scores of search candidates.
7457    `snum' specifies the number of the array.
7458    `limit' specifies the limit number to check.
7459    `restp' specifies the pointer to a variable to which rest number to be checked is assigned.
7460    The return value is the new number of the array. */
est_aidx_seq_narrow(DEPOT * db,const CBLIST * pdocs,const char * cop,int sign,const char * oval,int osiz,const char * sval,int ssiz,const void * regex,int onum,ESTSCORE * scores,int snum,int limit,int * restp)7461 static int est_aidx_seq_narrow(DEPOT *db, const CBLIST *pdocs, const char *cop, int sign,
7462                                const char *oval, int osiz, const char *sval, int ssiz,
7463                                const void *regex, int onum, ESTSCORE *scores, int snum,
7464                                int limit, int *restp){
7465   char vbuf[ESTAIKBUFSIZ];
7466   int i, nnum, vsiz;
7467   assert(db && cop && oval && osiz >= 0 && scores && snum >= 0 && limit >= 0 && restp);
7468   nnum = 0;
7469   for(i = 0; i < snum; i++){
7470     if(nnum >= limit){
7471       *restp = snum - i;
7472       break;
7473     }
7474     if(scores[i].id >= ESTPDOCIDMIN){
7475       scores[nnum].id = scores[i].id;
7476       scores[nnum].score = scores[i].score;
7477       nnum++;
7478       continue;
7479     }
7480     if((vsiz = dpgetwb(db, (char *)&(scores[i].id), sizeof(int), 0, ESTAIKBUFSIZ - 1, vbuf)) < 0)
7481       continue;
7482     vbuf[vsiz] = '\0';
7483     if(est_match_attr(vbuf, vsiz, cop, sign, oval, osiz, sval, ssiz, regex, onum)){
7484       scores[nnum].id = scores[i].id;
7485       scores[nnum].score = scores[i].score;
7486       nnum++;
7487     }
7488   }
7489   return nnum;
7490 }
7491 
7492 
/* Compare two records in numeric order.
   `aptr' specifies the pointer to the region of one key.
   `asiz' specifies the size of the region of one key.
   `bptr' specifies the pointer to the region of the other key.
   `bsiz' specifies the size of the region of the other key.
   The return value is positive if the former is big, negative if the latter is big, 0 if both
   are equivalent.  Keys whose values by `cbstrmktime' are equal are ordered by `VL_CMPLEX'. */
static int est_aidx_numcmp(const char *aptr, int asiz, const char *bptr, int bsiz){
  time_t atime, btime;
  atime = cbstrmktime(aptr);
  btime = cbstrmktime(bptr);
  /* compare explicitly because the difference of `time_t' values may not fit in `int' */
  if(atime < btime) return -1;
  if(atime > btime) return 1;
  return VL_CMPLEX(aptr, asiz, bptr, bsiz);
}
7505 
7506 
7507 /* Add an attribute of a document to an attribute narrowing index.
7508    `db' specifies a handle of an attribute narrowing index.
7509    `id' specifies the ID number of a document.
7510    `vbuf' specifies the pointer to the attribute value.
7511    `vsiz' specifies the size of the attribute value.
7512    The return value is true if success, else it is false. */
est_aidx_attr_put(VILLA * db,int id,const char * vbuf,int vsiz)7513 static int est_aidx_attr_put(VILLA *db, int id, const char *vbuf, int vsiz){
7514   char *tbuf;
7515   int err, tsiz;
7516   assert(db && id >= 0 && vbuf && vsiz >= 0);
7517   err = FALSE;
7518   tsiz = vsiz + sizeof(int) + 1;
7519   CB_MALLOC(tbuf, tsiz);
7520   memcpy(tbuf, vbuf, vsiz + 1);
7521   memcpy(tbuf + vsiz + 1, &id, sizeof(int));
7522   if(!vlput(db, tbuf, tsiz, "", 0, VL_DKEEP)) err = TRUE;
7523   free(tbuf);
7524   return err ? FALSE : TRUE;
7525 }
7526 
7527 
7528 /* Remove an attribute of a document from an attribute narrowing index.
7529    `db' specifies a handle of an attribute narrowing index.
7530    `id' specifies the ID number of a document.
7531    `vbuf' specifies the pointer to the attribute value.
7532    `vsiz' specifies the size of the attribute value.
7533    The return value is true if success, else it is false. */
est_aidx_attr_out(VILLA * db,int id,const char * vbuf,int vsiz)7534 static int est_aidx_attr_out(VILLA *db, int id, const char *vbuf, int vsiz){
7535   char *tbuf;
7536   int err, tsiz;
7537   assert(db && id >= 0 && vbuf && vsiz >= 0);
7538   err = FALSE;
7539   tsiz = vsiz + sizeof(int) + 1;
7540   CB_MALLOC(tbuf, tsiz);
7541   memcpy(tbuf, vbuf, vsiz + 1);
7542   memcpy(tbuf + vsiz + 1, &id, sizeof(int));
7543   if(!vlout(db, tbuf, tsiz)) err = TRUE;
7544   free(tbuf);
7545   return err ? FALSE : TRUE;
7546 }
7547 
7548 
7549 /* Narrow scores of search candidates with an attribute narrowing index.
7550    `db' specifies a handle of an attribute narrowing index.
7551    `pdocs' specifies a list of pseudo documents.
7552    `cop' specifies the pointer to the operator.
7553    `sign' specifies the sign of operation.
7554    `oval' specifies the operation value.
7555    `osiz' specifies the size of the operation value
7556    `sval' specifies the operation value of small cases.
7557    `ssiz' specifies the size of the operation value of small cases.
7558    `regex' specifies the regular expressions.
7559    `onum' specifies the numeric value.
7560    `scores' specifies an array of scores of search candidates.
7561    `snum' specifies the number of the array.
7562    The return value is the new number of the array. */
static int est_aidx_attr_narrow(VILLA *db, const CBLIST *pdocs, const char *cop, int sign,
                                const char *oval, int osiz, const char *sval, int ssiz,
                                const void *regex, int onum, ESTSCORE *scores, int snum){
  /* Overview: walk the narrowing index with its cursor, gather the document IDs of every key
     that satisfies the operator into `abuf', then keep only those elements of `scores' whose
     ID was gathered.  The surviving elements are packed at the head of `scores'. */
  CBDATUM *abuf;
  CBLIST *tokens;
  const char *kbuf;
  char numbuf[ESTNUMBUFSIZ], *tmp, *wp;
  int i, j, ksiz, len, esc, jmp, id, nnum, *ary, anum;
  time_t lower, upper;
  assert(db && pdocs && cop && oval && osiz >= 0 && scores && snum >= 0);
  /* `abuf' accumulates raw `int' IDs; the last sizeof(int) bytes of each key are copied as is */
  CB_DATUMOPEN(abuf);
  /* note that `cop' is compared with the operator constants by pointer identity */
  if(cop == ESTOPSTROREQ && sign && !sval){
    /* positive OR-equality without a small-case value: look up each token separately */
    tokens = cbsplit(oval, osiz, " ,");
    cblistsort(tokens);
    for(i = 0; i < CB_LISTNUM(tokens); i++){
      /* `oval' and `osiz' are reused as the current token and its size */
      oval = CB_LISTVAL2(tokens, i, osiz);
      if(osiz < 1) continue;
      vlcurjump(db, oval, osiz, VL_JFORWARD);
      /* collect every key whose leading string is exactly the token */
      while((kbuf = vlcurkeycache(db, &ksiz)) != NULL && !strcmp(kbuf, oval)){
        CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
        vlcurnext(db);
      }
    }
    CB_LISTCLOSE(tokens);
  } else if(cop == ESTOPNUMBT && sign && !sval){
    /* positive numeric range: the value is "lower upper" separated by a space or a tab */
    CB_MEMDUP(tmp, oval, osiz);
    if((wp = strchr(tmp, ' ')) != NULL || (wp = strchr(tmp, '\t')) != NULL){
      *(wp++) = '\0';
      while(*wp == ' ' || *wp == '\t'){
        wp++;
      }
      lower = cbstrmktime(tmp);
      upper = cbstrmktime(wp);
    } else {
      /* only one bound was given: it is the lower one and the upper bound is INT_MAX */
      lower = cbstrmktime(tmp);
      upper = INT_MAX;
    }
    /* jump to the decimal form of the lower bound and go forward up to the upper bound */
    len = sprintf(numbuf, "%.0f", (double)lower);
    vlcurjump(db, numbuf, len, VL_JFORWARD);
    while((kbuf = vlcurkeycache(db, &ksiz)) != NULL && cbstrmktime(kbuf) <= upper){
      CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
      vlcurnext(db);
    }
    free(tmp);
  } else if(!sign || sval){
    /* negated operator or an operator with a small-case value: every visited key is checked
       by `est_match_attr'.  `esc' is the first byte after which the scan can stop and `jmp'
       is the first byte after which the cursor can skip ahead; INT_MAX disables either. */
    esc = INT_MAX;
    jmp = INT_MAX;
    if(sign && (cop == ESTOPSTREQ || cop == ESTOPSTRBW) && osiz > 0){
      /* `sign' is true here, so `sval' is not `NULL' because of the branch condition */
      if(*sval > 0x0 && *sval < 0x7f){
        /* the first byte is ASCII: the scan is limited by that byte */
        numbuf[0] = *sval;
        numbuf[1] = '\0';
        esc = *(unsigned char *)sval;
        if(*sval >= 'a' && *sval <= 'z'){
          /* for a lower case letter, start at its upper case counterpart */
          numbuf[0] -= 'a' - 'A';
          jmp = *sval - 'a' + 'A';
        }
        vlcurjump(db, numbuf, 1, VL_JFORWARD);
      } else if(*(unsigned char *)sval >= 0xc0){
        /* the first byte is 0xc0 or more: start there without case folding */
        numbuf[0] = *sval;
        numbuf[1] = '\0';
        esc = *(unsigned char *)sval;
        vlcurjump(db, numbuf, 1, VL_JFORWARD);
      } else {
        vlcurfirst(db);
      }
    } else {
      vlcurfirst(db);
    }
    while((kbuf = vlcurkeycache(db, &ksiz)) != NULL){
      /* the size passed as the attribute value excludes the trailing ID and one more byte
         (presumably a separator; the key layout is not defined in this function) */
      if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
                        cop, sign, oval, osiz, sval, ssiz, regex, onum))
        CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
      /* `sval' is dereferenced only when `jmp' was set, which happens only if it is not `NULL' */
      if(*(unsigned char *)kbuf > jmp && *(unsigned char *)kbuf < *(unsigned char *)sval){
        /* passed the upper case range: skip to the keys beginning with the original byte */
        numbuf[0] = *sval;
        numbuf[1] = '\0';
        vlcurjump(db, numbuf, 1, VL_JFORWARD);
        jmp = INT_MAX;
      } else if(*(unsigned char *)kbuf > esc){
        /* passed the last first byte of interest: nothing more to check */
        break;
      } else {
        vlcurnext(db);
      }
    }
  } else {
    /* positive operator without a small-case value: place the cursor according to the
       operator, then walk in one direction */
    if(cop == ESTOPSTREQ || cop == ESTOPSTRBW ||
       cop == ESTOPNUMEQ || cop == ESTOPNUMGT || cop == ESTOPNUMGE){
      vlcurjump(db, oval, osiz, VL_JFORWARD);
      if(cop == ESTOPNUMGT){
        /* "greater than" is exclusive: step over keys not more than `onum' */
        while((kbuf = vlcurkeycache(db, NULL)) != NULL && cbstrmktime(kbuf) <= onum){
          vlcurnext(db);
        }
      }
    } else if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
      /* start from the number next to the value and walk backward */
      len = sprintf(numbuf, "%.0f", (double)cbstrmktime(oval) + 1);
      vlcurjump(db, numbuf, len, VL_JBACKWARD);
      if(cop == ESTOPNUMLT){
        /* "less than" is exclusive: step back over keys not less than `onum' */
        while((kbuf = vlcurkeycache(db, NULL)) != NULL && cbstrmktime(kbuf) >= onum){
          vlcurprev(db);
        }
      }
    } else {
      vlcurfirst(db);
    }
    while((kbuf = vlcurkeycache(db, &ksiz)) != NULL){
      if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
                        cop, TRUE, oval, osiz, sval, ssiz, regex, onum)){
        CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
      } else if(cop == ESTOPSTREQ || cop == ESTOPSTRBW || cop == ESTOPNUMEQ){
        /* for these operators the first mismatch ends the walk */
        break;
      }
      if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
        vlcurprev(db);
      } else {
        vlcurnext(db);
      }
    }
  }
  /* the ID of every pseudo document is added regardless of the index, so candidates with
     those IDs always survive the narrowing below */
  for(i = 0; i < CB_LISTNUM(pdocs); i++){
    id = ESTPDOCIDMIN + i;
    CB_DATUMCAT(abuf, &id, sizeof(int));
  }
  nnum = 0;
  ary = (int *)CB_DATUMPTR(abuf);
  anum = CB_DATUMSIZE(abuf) / sizeof(int);
  /* sort both sides by ID and intersect them in one merge pass; `scores' is left sorted
     by ID in ascending order */
  qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
  qsort(ary, anum, sizeof(int), est_int_compare);
  for(i = 0, j = 0; i < snum; i++){
    while(j < anum && ary[j] < scores[i].id){
      j++;
    }
    if(j < anum && scores[i].id == ary[j]){
      /* only `id' and `score' of a surviving element are moved to the packed position */
      scores[nnum].id = scores[i].id;
      scores[nnum].score = scores[i].score;
      nnum++;
    }
  }
  CB_DATUMCLOSE(abuf);
  return nnum;
}
7702 
7703 
/* Compare two integers.
   `ap' specifies the pointer to one element.
   `bp' specifies the pointer to the other element.
   The return value is negative if one is small, positive if one is big, 0 if both are equal.
   The result is computed by comparison, not by subtraction, so that operands far apart from
   each other (e.g. INT_MAX and -1) cannot cause a signed overflow. */
static int est_int_compare(const void *ap, const void *bp){
  int av, bv;
  assert(ap && bp);
  av = *(const int *)ap;
  bv = *(const int *)bp;
  if(av < bv) return -1;
  if(av > bv) return 1;
  return 0;
}
7712 
7713 
/* Compare elements of a record for effective compression.
   Each element is read as two bytes forming a big endian unsigned number.
   `ap' specifies the pointer to one element.
   `bp' specifies the pointer to the other element.
   The return value is negative if one is small, positive if one is big, 0 if both are equal. */
static int est_short_compare(const void *ap, const void *bp){
  const unsigned char *lhs, *rhs;
  int lnum, rnum;
  assert(ap && bp);
  lhs = (const unsigned char *)ap;
  rhs = (const unsigned char *)bp;
  lnum = (lhs[0] << 8) | lhs[1];
  rnum = (rhs[0] << 8) | rhs[1];
  /* both numbers are within 16 bits, so the difference cannot overflow */
  return lnum - rnum;
}
7723 
7724 
7725 /* Clean up the inode map.
7726    `arg' specifies a dummy argument. */
est_inodes_delete(void * arg)7727 static void est_inodes_delete(void *arg){
7728 #if defined(NDEBUG)
7729   ESTDB *db;
7730   const char *kbuf;
7731   int ecode;
7732   assert(arg);
7733   if(cbmaprnum(est_inodes) > 0){
7734     cbmapiterinit(est_inodes);
7735     while((kbuf = cbmapiternext(est_inodes, NULL)) != NULL){
7736       db = *(ESTDB **)cbmapiterval(kbuf, NULL);
7737       est_db_set_informer(db, est_inodes_delete_informer, NULL);
7738       est_db_close(db, &ecode);
7739     }
7740   }
7741   cbmapclose(est_inodes);
7742 #else
7743   ESTDB *db;
7744   const char *kbuf;
7745   int ecode;
7746   assert(arg);
7747   if(cbmaprnum(est_inodes) > 0){
7748     cbmapiterinit(est_inodes);
7749     while((kbuf = cbmapiternext(est_inodes, NULL)) != NULL){
7750       db = *(ESTDB **)cbmapiterval(kbuf, NULL);
7751       fprintf(stderr, "\nWARNING: %s is not closed.\n\n", cbmemdup(est_db_name(db), -1));
7752       est_db_set_informer(db, est_inodes_delete_informer, NULL);
7753       est_db_close(db, &ecode);
7754     }
7755   }
7756   cbmapclose(est_inodes);
7757 #endif
7758 }
7759 
7760 
/* Report a database event while cleaning up database handles.
   The message is printed to the standard error output unless `NDEBUG' is defined.
   `msg' specifies the message of each event.
   `opaque' is ignored. */
static void est_inodes_delete_informer(const char *msg, void *opaque){
  (void)opaque;
#ifndef NDEBUG
  fprintf(stderr, "estraier: %s\n", msg);
#else
  (void)msg;
#endif
}
7769 
7770 
7771 /* Write meta data to the database.
7772    `db' specifies a database object.
7773    The return value is true if success, else it is false. */
est_db_write_meta(ESTDB * db)7774 static int est_db_write_meta(ESTDB *db){
7775   char vbuf[ESTNUMBUFSIZ], *sbuf;
7776   int err, ssiz;
7777   assert(db);
7778   err = FALSE;
7779   sprintf(vbuf, "%d", est_idx_num(db->idxdb));
7780   if(!dpput(db->metadb, ESTKEYIDXNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
7781   sprintf(vbuf, "%d", db->dseq);
7782   if(!dpput(db->metadb, ESTKEYDSEQ, -1, vbuf, -1, DP_DOVER)) err = TRUE;
7783   sprintf(vbuf, "%d", db->dnum);
7784   if(!dpput(db->metadb, ESTKEYDNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
7785   if(db->metacc){
7786     sbuf = cbmapdump(db->metacc, &ssiz);
7787     if(!dpput(db->metadb, ESTKEYMETA, -1, sbuf, ssiz, DP_DOVER)) err = TRUE;
7788     free(sbuf);
7789   }
7790   if(err){
7791     est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
7792     db->fatal = TRUE;
7793   }
7794   return err ? FALSE : TRUE;
7795 }
7796 
7797 
7798 /* Call the callback function of a database.
7799    `db' specifies a database object.
7800    `info' specifies an extra message. */
est_db_inform(ESTDB * db,const char * info)7801 static void est_db_inform(ESTDB *db, const char *info){
7802   char *msg;
7803   assert(db && info);
7804   if(!db->infocb) return;
7805   msg = cbsprintf("%s: name=%s dnum=%d wnum=%d fsiz=%.0f crnum=%d csiz=%d dknum=%d",
7806                   info, db->name, db->dnum, vlrnum(db->fwmdb), (double)est_db_size(db),
7807                   cbmaprnum(db->idxcc) + cbmaprnum(db->auxcc), est_db_used_cache_size(db),
7808                   cbmaprnum(db->outcc));
7809   db->infocb(msg, db->infoop);
7810   free(msg);
7811 }
7812 
7813 
7814 /* Prepare cache for meta data.
7815    `db' specifies a database object. */
est_db_prepare_meta(ESTDB * db)7816 static void est_db_prepare_meta(ESTDB *db){
7817   char *sbuf;
7818   int ssiz;
7819   assert(db);
7820   if((sbuf = dpget(db->metadb, ESTKEYMETA, -1, 0, -1, &ssiz)) != NULL){
7821     db->metacc = cbmapload(sbuf, ssiz);
7822     free(sbuf);
7823   } else {
7824     db->metacc = cbmapopenex(ESTMINIBNUM);
7825   }
7826 }
7827 
7828 
7829 /* Score a document object matching the phrase of a search condition object definitely.
7830    `db' specifies a database object.
7831    `doc' specifies a document object.
7832    `cond' specifies a search condition object.
7833    `scp' specifies the pointer to a variable to which the score is assigned.
7834    The return value is true if the document matches the phrase of the condition object
7835    definitely, else it is false. */
est_db_score_doc(ESTDB * db,ESTDOC * doc,ESTCOND * cond,int * scp)7836 static int est_db_score_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond, int *scp){
7837   struct { char *word; int num; } wsets[ESTSCANWNUM], nsets[ESTSCANWNUM];
7838   CBLIST *terms, *words;
7839   const char *term, *text, *rp;
7840   unsigned char *rbuf;
7841   char *tmp;
7842   int i, j, k, sc, wsnum, nsnum, asiz, tsiz, add, rsiz, hit;
7843   double tune;
7844   assert(db && doc && cond && scp);
7845   *scp = 0;
7846   if(!cond->phrase || cbstrfwmatch(cond->phrase, ESTOPSIMILAR) ||
7847      cbstrfwmatch(cond->phrase, ESTOPID) || cbstrfwmatch(cond->phrase, ESTOPURI)) return FALSE;
7848   if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
7849   switch(cond->pmode){
7850   default:
7851     terms = est_phrase_terms(cond->phrase);
7852     break;
7853   case ESTPMSIMPLE:
7854     tmp = est_phrase_from_simple(cond->phrase);
7855     terms = est_phrase_terms(tmp);
7856     free(tmp);
7857     break;
7858   case ESTPMROUGH:
7859     tmp = est_phrase_from_rough(cond->phrase);
7860     terms = est_phrase_terms(tmp);
7861     free(tmp);
7862     break;
7863   case ESTPMUNION:
7864     tmp = est_phrase_from_union(cond->phrase);
7865     terms = est_phrase_terms(tmp);
7866     free(tmp);
7867     break;
7868   case ESTPMISECT:
7869     tmp = est_phrase_from_isect(cond->phrase);
7870     terms = est_phrase_terms(tmp);
7871     free(tmp);
7872     break;
7873   }
7874   wsnum = 0;
7875   nsnum = 0;
7876   add = TRUE;
7877   for(i = 0; i < CB_LISTNUM(terms); i++){
7878     term = CB_LISTVAL(terms, i);
7879     if(!strcmp(term, ESTOPISECT)){
7880       add = TRUE;
7881     } else if(!strcmp(term, ESTOPDIFF)){
7882       add = FALSE;
7883     } else if(strcmp(term, ESTOPUVSET)){
7884       if(term[0] == ' '){
7885         term++;
7886         if(term[0] == 'b'){
7887           term++;
7888         } else  if(term[0] == 'e'){
7889           term++;
7890         }
7891       }
7892       words = cbsplit(term, -1, "\t");
7893       if(add){
7894         while(wsnum < ESTSCANWNUM && CB_LISTNUM(words) > 0){
7895           wsets[wsnum].word = cblistshift(words, NULL);
7896           wsets[wsnum].num = i;
7897           wsnum++;
7898         }
7899       } else {
7900         while(nsnum < ESTSCANWNUM && CB_LISTNUM(words) > 0){
7901           nsets[nsnum].word = cblistshift(words, NULL);
7902           nsets[nsnum].num = i;
7903           nsnum++;
7904         }
7905       }
7906       CB_LISTCLOSE(words);
7907     }
7908   }
7909   asiz = 0;
7910   sc = 0;
7911   if((rp = cbmapget(doc->attrs, "\t", 1, NULL)) != NULL) sc = -1 - atoi(rp);
7912   for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
7913     if(i < 0){
7914       if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
7915       asiz += strlen(text);
7916     } else {
7917       text = CB_LISTVAL2(doc->dtexts, i, tsiz);
7918       asiz += tsiz;
7919     }
7920     rbuf = (unsigned char *)est_uconv_in(text, strlen(text), &rsiz);
7921     est_canonicalize_text(rbuf, rsiz, FALSE);
7922     tmp = est_uconv_out((char *)rbuf, rsiz, &rsiz);
7923     for(j = 0; j < wsnum; j++){
7924       if(!wsets[j].word) continue;
7925       if((rp = est_strstr_sparse(tmp, wsets[j].word)) != NULL){
7926         if(sc >= 0){
7927           do {
7928             sc += 16;
7929             rp += strlen(wsets[j].word);
7930           } while((rp = est_strstr_sparse(rp, wsets[j].word)) != NULL);
7931         }
7932         for(k = 0; k < wsnum; k++){
7933           if(!wsets[k].word) continue;
7934           if(wsets[k].num == wsets[j].num){
7935             free(wsets[k].word);
7936             wsets[k].word = NULL;
7937           }
7938         }
7939       }
7940     }
7941     for(j = 0; j < nsnum; j++){
7942       if(!nsets[j].word) continue;
7943       if((rp = est_strstr_sparse(tmp, nsets[j].word)) != NULL){
7944         for(k = 0; k < nsnum; k++){
7945           if(!nsets[k].word) continue;
7946           if(nsets[k].num == nsets[j].num){
7947             free(nsets[k].word);
7948             nsets[k].word = NULL;
7949           }
7950         }
7951       }
7952     }
7953     free(tmp);
7954     free(rbuf);
7955   }
7956   hit = TRUE;
7957   for(i = 0; i < wsnum; i++){
7958     if(!wsets[i].word) continue;
7959     free(wsets[i].word);
7960     hit = FALSE;
7961   }
7962   for(i = 0; i < nsnum; i++){
7963     if(!nsets[i].word){
7964       hit = FALSE;
7965       continue;
7966     }
7967     free(nsets[i].word);
7968   }
7969   CB_LISTCLOSE(terms);
7970   if(sc < 0) sc = -1 - sc;
7971   tune = sqrt(asiz / 8.0 + 128) / 16.0;
7972   switch(db->smode){
7973   case ESTDFSCVOID:
7974     sc = 0;
7975     break;
7976   default:
7977     sc /= tune;
7978     if(sc >= 0x80) sc += (0x80 - sc) * 0.75;
7979     if(sc >= 0xc0) sc += (0xc0 - sc) * 0.75;
7980     sc = sc < 0xff ? sc : 0xff;
7981     break;
7982   case ESTDFSCINT:
7983     sc /= tune;
7984     break;
7985   case ESTDFSCASIS:
7986     break;
7987   }
7988   *scp = sc;
7989   return hit;
7990 }
7991 
7992 
7993 /* Get the ID of a document specified by URI from pseudo indexes.
7994    `db' specifies a database object.
7995    `uri' specifies the URI of a registered document.
7996    The return value is the ID of the document.  On error, -1 is returned. */
est_pidx_uri_to_id(ESTDB * db,const char * uri)7997 static int est_pidx_uri_to_id(ESTDB *db, const char *uri){
7998   ESTDOC *doc;
7999   const char *vbuf;
8000   int i, vsiz;
8001   assert(db && uri);
8002   if(!db->puris){
8003     db->puris = cbmapopenex(CB_LISTNUM(db->pdocs) + 1);
8004     for(i = 0; i < CB_LISTNUM(db->pdocs); i++){
8005       if((doc = est_db_get_doc(db, ESTPDOCIDMIN + i, 0)) != NULL){
8006         if((vbuf = cbmapget(doc->attrs, ESTDATTRURI, -1, &vsiz)) != NULL)
8007           cbmapput(db->puris, vbuf, vsiz, (char *)&(doc->id), sizeof(int), FALSE);
8008         est_doc_delete(doc);
8009       }
8010     }
8011   }
8012   if((vbuf = cbmapget(db->puris, uri, -1, NULL)) != NULL) return *(int *)vbuf;
8013   return -1;
8014 }
8015 
8016 
8017 /* Create a list of terms for search.
8018    `phrase' specifies a search phrase.
8019    The return value is a list object of the terms of the phrase. */
est_phrase_terms(const char * phrase)8020 static CBLIST *est_phrase_terms(const char *phrase){
8021   CBLIST *terms, *elems;
8022   CBDATUM *datum;
8023   const char *elem;
8024   char *tbuf, *pbuf;
8025   int i, tsiz, psiz, lw;
8026   assert(phrase);
8027   CB_LISTOPEN(terms);
8028   tbuf = est_uconv_in(phrase, strlen(phrase), &tsiz);
8029   est_normalize_text((unsigned char *)tbuf, tsiz, &tsiz);
8030   pbuf = est_uconv_out(tbuf, tsiz, &psiz);
8031   elems = cbsplit(pbuf, psiz, "\a\b\t\n\v\f\r ");
8032   CB_DATUMOPEN(datum);
8033   lw = FALSE;
8034   for(i = 0; i < CB_LISTNUM(elems); i++){
8035     elem = CB_LISTVAL(elems, i);
8036     if(elem[0] == '\0') continue;
8037     if(!strcmp(elem, ESTOPUNION)){
8038       if(CB_DATUMSIZE(datum) < 1) continue;
8039       if(lw) CB_DATUMCAT(datum, "\t", 1);
8040       lw = FALSE;
8041     } else if(!strcmp(elem, ESTOPWCBW)){
8042       if(!lw) CB_DATUMCAT(datum, " b", 2);
8043     } else if(!strcmp(elem, ESTOPWCEW)){
8044       if(!lw) CB_DATUMCAT(datum, " e", 2);
8045     } else if(!strcmp(elem, ESTOPWCRX)){
8046       if(!lw) CB_DATUMCAT(datum, " r", 2);
8047     } else if(!strcmp(elem, ESTOPISECT) || !strcmp(elem, ESTOPDIFF)){
8048       if(CB_DATUMSIZE(datum) < 1) continue;
8049       CB_LISTPUSH(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
8050       CB_DATUMSETSIZE(datum, 0);
8051       CB_LISTPUSH(terms, elem, strlen(elem));
8052       lw = FALSE;
8053     } else {
8054       if(CB_DATUMSIZE(datum) > 0 && lw) CB_DATUMCAT(datum, " ", 1);
8055       CB_DATUMCAT(datum, elem, strlen(elem));
8056       lw = TRUE;
8057     }
8058   }
8059   if(CB_DATUMSIZE(datum) > 0) CB_LISTPUSH(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
8060   CB_DATUMCLOSE(datum);
8061   CB_LISTCLOSE(elems);
8062   free(pbuf);
8063   free(tbuf);
8064   for(i = 0; i < CB_LISTNUM(terms); i++){
8065     elem = CB_LISTVAL(terms, i);
8066     if(!strcmp(elem, ESTOPUVSET) || !strcmp(elem, ESTOPISECT) ||
8067        !strcmp(elem, ESTOPDIFF)) continue;
8068     tbuf = est_uconv_in(elem, strlen(elem), &tsiz);
8069     est_canonicalize_text((unsigned char *)tbuf, tsiz, TRUE);
8070     pbuf = est_uconv_out(tbuf, tsiz, &psiz);
8071     cblistover(terms, i, pbuf, -1);
8072     free(pbuf);
8073     free(tbuf);
8074   }
8075   for(i = CB_LISTNUM(terms) - 1; i >= 0; i--){
8076     elem = CB_LISTVAL(terms, i);
8077     if(strcmp(elem, ESTOPISECT) && strcmp(elem, ESTOPDIFF)) break;
8078     CB_LISTDROP(terms);
8079   }
8080   return terms;
8081 }
8082 
8083 
8084 /* Compare two scores by each ID for ascending order.
8085    `ap' specifies the pointer to one score.
8086    `bp' specifies the pointer to the other score.
8087    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_id_asc(const void * ap,const void * bp)8088 static int est_score_compare_by_id_asc(const void *ap, const void *bp){
8089   assert(ap && bp);
8090   return ((ESTSCORE *)ap)->id - ((ESTSCORE *)bp)->id;
8091 }
8092 
8093 
8094 /* Compare two scores by each ID for descending order.
8095    `ap' specifies the pointer to one score.
8096    `bp' specifies the pointer to the other score.
8097    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_id_desc(const void * ap,const void * bp)8098 static int est_score_compare_by_id_desc(const void *ap, const void *bp){
8099   assert(ap && bp);
8100   return ((ESTSCORE *)bp)->id - ((ESTSCORE *)ap)->id;
8101 }
8102 
8103 
8104 /* Compare two scores by each score point for ascending order.
8105    `ap' specifies the pointer to one score.
8106    `bp' specifies the pointer to the other score.
8107    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_score_asc(const void * ap,const void * bp)8108 static int est_score_compare_by_score_asc(const void *ap, const void *bp){
8109   assert(ap && bp);
8110   return ((ESTSCORE *)ap)->score - ((ESTSCORE *)bp)->score;
8111 }
8112 
8113 
8114 /* Compare two scores by each score point for descending order.
8115    `ap' specifies the pointer to one score.
8116    `bp' specifies the pointer to the other score.
8117    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_score_desc(const void * ap,const void * bp)8118 static int est_score_compare_by_score_desc(const void *ap, const void *bp){
8119   assert(ap && bp);
8120   return ((ESTSCORE *)bp)->score - ((ESTSCORE *)ap)->score;
8121 }
8122 
8123 
8124 /* Compare two scores by attributes of strings for ascending order.
8125    `ap' specifies the pointer to one score.
8126    `bp' specifies the pointer to the other score.
8127    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_str_asc(const void * ap,const void * bp)8128 static int est_score_compare_by_str_asc(const void *ap, const void *bp){
8129   assert(ap && bp);
8130   return strcmp(((ESTSCORE *)ap)->value, ((ESTSCORE *)bp)->value);
8131 }
8132 
8133 
8134 /* Compare two scores by attributes of strings for descending order.
8135    `ap' specifies the pointer to one score.
8136    `bp' specifies the pointer to the other score.
8137    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_str_desc(const void * ap,const void * bp)8138 static int est_score_compare_by_str_desc(const void *ap, const void *bp){
8139   assert(ap && bp);
8140   return strcmp(((ESTSCORE *)bp)->value, ((ESTSCORE *)ap)->value);
8141 }
8142 
8143 
8144 /* Compare two scores by attributes of numbers for ascending order.
8145    `ap' specifies the pointer to one score.
8146    `bp' specifies the pointer to the other score.
8147    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_num_asc(const void * ap,const void * bp)8148 static int est_score_compare_by_num_asc(const void *ap, const void *bp){
8149   assert(ap && bp);
8150   return (time_t)((ESTSCORE *)ap)->value - (time_t)((ESTSCORE *)bp)->value;
8151 }
8152 
8153 
8154 /* Compare two scores by attributes of numbers for descending order.
8155    `ap' specifies the pointer to one score.
8156    `bp' specifies the pointer to the other score.
8157    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_num_desc(const void * ap,const void * bp)8158 static int est_score_compare_by_num_desc(const void *ap, const void *bp){
8159   assert(ap && bp);
8160   return (time_t)((ESTSCORE *)bp)->value - (time_t)((ESTSCORE *)ap)->value;
8161 }
8162 
8163 
8164 /* Compare two meta scores by each ID for ascending order.
8165    `ap' specifies the pointer to one meta score
8166    `bp' specifies the pointer to the other meta score
8167    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_id_asc(const void * ap,const void * bp)8168 static int est_metascore_compare_by_id_asc(const void *ap, const void *bp){
8169   assert(ap && bp);
8170   return ((ESTMETASCORE *)ap)->id - ((ESTMETASCORE *)bp)->id;
8171 }
8172 
8173 
8174 /* Compare two meta scores by each ID for descending order.
8175    `ap' specifies the pointer to one meta score
8176    `bp' specifies the pointer to the other meta score
8177    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_id_desc(const void * ap,const void * bp)8178 static int est_metascore_compare_by_id_desc(const void *ap, const void *bp){
8179   assert(ap && bp);
8180   return ((ESTMETASCORE *)bp)->id - ((ESTMETASCORE *)ap)->id;
8181 }
8182 
8183 
8184 /* Compare two meta scores by each score point for ascending order.
8185    `ap' specifies the pointer to one meta score
8186    `bp' specifies the pointer to the other meta score
8187    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_score_asc(const void * ap,const void * bp)8188 static int est_metascore_compare_by_score_asc(const void *ap, const void *bp){
8189   assert(ap && bp);
8190   return ((ESTMETASCORE *)ap)->score - ((ESTMETASCORE *)bp)->score;
8191 }
8192 
8193 
8194 /* Compare two meta scores by each score point for descending order.
8195    `ap' specifies the pointer to one meta score
8196    `bp' specifies the pointer to the other meta score
8197    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_score_desc(const void * ap,const void * bp)8198 static int est_metascore_compare_by_score_desc(const void *ap, const void *bp){
8199   assert(ap && bp);
8200   return ((ESTMETASCORE *)bp)->score - ((ESTMETASCORE *)ap)->score;
8201 }
8202 
8203 
8204 /* Compare two meta scores by attributes of strings for ascending order.
8205    `ap' specifies the pointer to one meta score
8206    `bp' specifies the pointer to the other meta score
8207    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_str_asc(const void * ap,const void * bp)8208 static int est_metascore_compare_by_str_asc(const void *ap, const void *bp){
8209   assert(ap && bp);
8210   return strcmp(((ESTMETASCORE *)ap)->value, ((ESTMETASCORE *)bp)->value);
8211 }
8212 
8213 
8214 /* Compare two meta scores by attributes of strings for descending order.
8215    `ap' specifies the pointer to one meta score
8216    `bp' specifies the pointer to the other meta score
8217    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_str_desc(const void * ap,const void * bp)8218 static int est_metascore_compare_by_str_desc(const void *ap, const void *bp){
8219   assert(ap && bp);
8220   return strcmp(((ESTMETASCORE *)bp)->value, ((ESTMETASCORE *)ap)->value);
8221 }
8222 
8223 
8224 /* Compare two meta scores by attributes of numbers for ascending order.
8225    `ap' specifies the pointer to one meta score
8226    `bp' specifies the pointer to the other meta score
8227    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_num_asc(const void * ap,const void * bp)8228 static int est_metascore_compare_by_num_asc(const void *ap, const void *bp){
8229   assert(ap && bp);
8230   return (time_t)((ESTMETASCORE *)ap)->value - (time_t)((ESTMETASCORE *)bp)->value;
8231 }
8232 
8233 
8234 /* Compare two meta scores by attributes of numbers for descending order.
8235    `ap' specifies the pointer to one meta score
8236    `bp' specifies the pointer to the other meta score
8237    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_num_desc(const void * ap,const void * bp)8238 static int est_metascore_compare_by_num_desc(const void *ap, const void *bp){
8239   assert(ap && bp);
8240   return (time_t)((ESTMETASCORE *)bp)->value - (time_t)((ESTMETASCORE *)ap)->value;
8241 }
8242 
8243 
8244 /* Get the universal set of documents in a database.
8245    `db' specifies a database object.
8246    `nump' specifies the pointer to which the number of elements in the result is assigned.
8247    `hints' specifies a list object.  If it is `NULL', it is not used.
8248    `add' specifies whether the result to be treated in union or difference.
8249    The return value is an array of score structures of corresponding documents. */
est_search_uvset(ESTDB * db,int * nump,CBMAP * hints,int add)8250 static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add){
8251   ESTSCORE *scores;
8252   char *vbuf, numbuf[ESTNUMBUFSIZ];
8253   int snum, smax;
8254   assert(db && nump);
8255   smax = ESTALLOCUNIT;
8256   CB_MALLOC(scores, smax * sizeof(ESTSCORE));
8257   snum = 0;
8258   vlcurfirst(db->listdb);
8259   while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
8260     if(snum >= smax){
8261       smax *= 2;
8262       CB_REALLOC(scores, smax * sizeof(ESTSCORE));
8263     }
8264     scores[snum].id = atoi(vbuf);
8265     scores[snum].score = 0;
8266     scores[snum].value = NULL;
8267     snum++;
8268     free(vbuf);
8269     vlcurnext(db->listdb);
8270   }
8271   *nump = snum;
8272   if(hints){
8273     sprintf(numbuf, "%d", snum * (add ? 1 : -1));
8274     cbmapput(hints, ESTOPUVSET, -1, numbuf, -1, TRUE);
8275   }
8276   return scores;
8277 }
8278 
8279 
8280 /* Expand a word to words which begins with it.
8281    `db' specifies a database object.
8282    `word' specifies a word.
8283    `list' specifies a list object to contain the results. */
est_expand_word_bw(ESTDB * db,const char * word,CBLIST * list)8284 static void est_expand_word_bw(ESTDB *db, const char *word, CBLIST *list){
8285   const char *kbuf;
8286   int num, ksiz;
8287   assert(db && word && list);
8288   num = 0;
8289   vlcurjump(db->fwmdb, word, -1, VL_JFORWARD);
8290   while((kbuf = vlcurkeycache(db->fwmdb, &ksiz)) != NULL){
8291     if(!cbstrfwmatch(kbuf, word)) break;
8292     CB_LISTPUSH(list, kbuf, ksiz);
8293     if(++num >= db->wildmax) break;
8294     vlcurnext(db->fwmdb);
8295   }
8296 }
8297 
8298 
8299 /* Expand a word to words which ends with it.
8300    `db' specifies a database object.
8301    `word' specifies a word.
8302    `list' specifies a list object to contain the results. */
est_expand_word_ew(ESTDB * db,const char * word,CBLIST * list)8303 static void est_expand_word_ew(ESTDB *db, const char *word, CBLIST *list){
8304   const char *kbuf;
8305   int num, wsiz, ksiz;
8306   assert(db && word && list);
8307   num = 0;
8308   wsiz = strlen(word);
8309   vlcurfirst(db->fwmdb);
8310   while((kbuf = vlcurkeycache(db->fwmdb, &ksiz)) != NULL){
8311     if(ksiz >= wsiz && !memcmp(kbuf + ksiz - wsiz, word, wsiz)){
8312       CB_LISTPUSH(list, kbuf, ksiz);
8313       if(++num >= db->wildmax) break;
8314     }
8315     vlcurnext(db->fwmdb);
8316   }
8317 }
8318 
8319 
8320 /* Expand regular expressios to words which matches them.
8321    `db' specifies a database object.
8322    `word' specifies regular expressions.
8323    `list' specifies a list object to contain the results. */
est_expand_word_rx(ESTDB * db,const char * word,CBLIST * list)8324 static void est_expand_word_rx(ESTDB *db, const char *word, CBLIST *list){
8325   void *regex;
8326   const char *kbuf;
8327   int num, ksiz;
8328   assert(db && word && list);
8329   if(!(regex = est_regex_new(word))) return;
8330   num = 0;
8331   vlcurfirst(db->fwmdb);
8332   while((kbuf = vlcurkeycache(db->fwmdb, &ksiz)) != NULL){
8333     if(est_regex_match(regex, kbuf)){
8334       CB_LISTPUSH(list, kbuf, ksiz);
8335       if(++num >= db->wildmax) break;
8336     }
8337     vlcurnext(db->fwmdb);
8338   }
8339   est_regex_delete(regex);
8340 }
8341 
8342 
8343 /* Expand a keyword to keywords which begins with it.
8344    `db' specifies a database object.
8345    `word' specifies a word.
8346    `list' specifies a list object to contain the results. */
est_expand_keyword_bw(ESTDB * db,const char * word,CBLIST * list)8347 static void est_expand_keyword_bw(ESTDB *db, const char *word, CBLIST *list){
8348   const char *kbuf;
8349   int num, ksiz;
8350   assert(db && word && list);
8351   num = 0;
8352   vlcurjump(db->xfmdb, word, -1, VL_JFORWARD);
8353   while((kbuf = vlcurkeycache(db->xfmdb, &ksiz)) != NULL){
8354     if(!cbstrfwmatch(kbuf, word)) break;
8355     CB_LISTPUSH(list, kbuf, ksiz);
8356     if(++num >= db->wildmax) break;
8357     vlcurnext(db->xfmdb);
8358   }
8359 }
8360 
8361 
8362 /* Expand a keyword to keywords which ends with it.
8363    `db' specifies a database object.
8364    `word' specifies a word.
8365    `list' specifies a list object to contain the results. */
est_expand_keyword_ew(ESTDB * db,const char * word,CBLIST * list)8366 static void est_expand_keyword_ew(ESTDB *db, const char *word, CBLIST *list){
8367   const char *kbuf;
8368   int num, wsiz, ksiz;
8369   assert(db && word && list);
8370   num = 0;
8371   wsiz = strlen(word);
8372   vlcurfirst(db->xfmdb);
8373   while((kbuf = vlcurkeycache(db->xfmdb, &ksiz)) != NULL){
8374     if(ksiz >= wsiz && !memcmp(kbuf + ksiz - wsiz, word, wsiz)){
8375       CB_LISTPUSH(list, kbuf, ksiz);
8376       if(++num >= db->wildmax) break;
8377     }
8378     vlcurnext(db->xfmdb);
8379   }
8380 }
8381 
8382 
8383 /* Expand regular expressios to keywords which matches them.
8384    `db' specifies a database object.
8385    `word' specifies regular expressions.
8386    `list' specifies a list object to contain the results. */
est_expand_keyword_rx(ESTDB * db,const char * word,CBLIST * list)8387 static void est_expand_keyword_rx(ESTDB *db, const char *word, CBLIST *list){
8388   void *regex;
8389   const char *kbuf;
8390   int num, ksiz;
8391   assert(db && word && list);
8392   if(!(regex = est_regex_new(word))) return;
8393   num = 0;
8394   vlcurfirst(db->xfmdb);
8395   while((kbuf = vlcurkeycache(db->xfmdb, &ksiz)) != NULL){
8396     if(est_regex_match(regex, kbuf)){
8397       CB_LISTPUSH(list, kbuf, ksiz);
8398       if(++num >= db->wildmax) break;
8399     }
8400     vlcurnext(db->xfmdb);
8401   }
8402   est_regex_delete(regex);
8403 }
8404 
8405 
8406 /* Get a corresponding set of documents in a database.
8407    `db' specifies a database object.
8408    `term' specifies a union term.  Alternative words in it are separated by tabs.
8409    `gstep' specifies number of steps of N-gram.
8410    `xpn' specifies the pointer to a function for query expansion.  If it is `NULL', it is not
8411    used.
8412    `nump' specifies the pointer to which the number of elements in the result is assigned.
8413    `hints' specifies a map object where the hit count of each word is stored.  If it is `NULL',
        it is not used.
8414    `add' specifies whether the result to be treated in union or difference.  It only decides the
        sign of the counts stored into `hints'.
8415    `auxmin' specifies the minimum hits to adopt the auxiliary index.  If it is not more than 0,
8416    the auxiliary index is not used.
8417    `auxwords' specifies a map object where keywords used with the auxiliary index are stored.  If
8418    it is `NULL', it is not used.
8419    The return value is an array of score structures of corresponding documents. */
est_search_union(ESTDB * db,const char * term,int gstep,void (* xpn)(const char *,CBLIST *),int * nump,CBMAP * hints,int add,int auxmin,CBMAP * auxwords)8420 static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
8421                                   void (*xpn)(const char *, CBLIST *),
8422                                   int *nump, CBMAP *hints, int add, int auxmin, CBMAP *auxwords){
8423   const ESTSCORE *cscores;
8424   ESTSCORE *scores, *tscores, *nscores;
8425   CBMAP *umap;
8426   CBLIST *words, *grams, *tgrams;
8427   const char *ckey, *word, *gram, *rp, *fnext, *snext, *cbuf;
8428   char *vbuf, *wbuf, numbuf[ESTNUMBUFSIZ];
8429   int i, j, k, snum, smax, cksiz, single, tsmax, tsnum, nsnum, vsiz, gcnum;
8430   int gsiz, csiz, wgstep, nnum, west, wild, mfsiz, mssiz, mfhash, mshash, tfhash, tshash;
8431   int id, vstep, score, hit, hnum;
8432   double avg, sd, dif;
8433   assert(db && term && gstep > 0 && nump);
       /* `scores' accumulates the hits of all alternatives and grows by doubling */
8434   smax = ESTALLOCUNIT;
8435   CB_MALLOC(scores, smax * sizeof(ESTSCORE));
8436   snum = 0;
8437   words = cbsplit(term, -1, "\t");
       /* with an expansion function, every alternative is replaced by its expansions and the
          union of them is deduplicated through a map; alternatives which are empty or begin
          with a space are dropped here */
8438   if(xpn){
8439     umap = cbmapopenex(ESTMINIBNUM);
8440     for(i = 0; i < CB_LISTNUM(words); i++){
8441       word = CB_LISTVAL(words, i);
8442       if(word[0] == '\0' || word[0] == ' ') continue;
8443       CB_LISTOPEN(grams);
8444       xpn(word, grams);
8445       for(j = 0; j < CB_LISTNUM(grams); j++){
8446         word = CB_LISTVAL(grams, j);
8447         cbmapput(umap, word, -1, "", 0, FALSE);
8448       }
8449       CB_LISTCLOSE(grams);
8450     }
8451     CB_LISTCLOSE(words);
8452     words = cbmapkeys(umap);
8453     cbmapclose(umap);
8454   }
       /* each alternative is resolved by the result cache, by the auxiliary index, or by
          scanning the inverted index, in this order of preference */
8455   for(i = 0; i < CB_LISTNUM(words); i++){
8456     ckey = CB_LISTVAL2(words, i, cksiz);
8457     if(cksiz < 1) continue;
8458     word = ckey;
8459     wbuf = NULL;
8460     if((cscores = est_rescc_get(db, ckey, cksiz, &tsnum)) != NULL){
           /* cache hit: skip the leading space and the marker character after it, if any, so
              that the hint is recorded under the bare word */
8461       if(word[0] == ' '){
8462         word++;
8463         if(word[0] != '\0') word++;
8464       }
8465       if(hints){
8466         sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
8467         cbmapput(hints, word, -1, numbuf, -1, TRUE);
8468       }
8469       for(j = 0; j < tsnum; j++){
8470         if(snum >= smax){
8471           smax *= 2;
8472           CB_REALLOC(scores, smax * sizeof(ESTSCORE));
8473         }
8474         scores[snum].id = cscores[j].id;
8475         scores[snum].score = cscores[j].score;
8476         snum++;
8477       }
8478     } else if(!strchr(word + 1, ' ') && auxmin > 0 &&
8479               (tscores = est_search_keywords(db, word, auxmin, &tsnum)) != NULL){
           /* the auxiliary index supplied enough hits; its result is not put into the cache */
8480       if(word[0] == ' '){
8481         word++;
8482         if(word[0] != '\0') word++;
8483       }
8484       if(hints){
8485         sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
8486         cbmapput(hints, word, -1, numbuf, -1, TRUE);
8487       }
8488       if(auxwords) cbmapput(auxwords, word, -1, "", 0, FALSE);
8489       for(j = 0; j < tsnum; j++){
8490         if(snum >= smax){
8491           smax *= 2;
8492           CB_REALLOC(scores, smax * sizeof(ESTSCORE));
8493         }
8494         scores[snum].id = tscores[j].id;
8495         scores[snum].score = tscores[j].score;
8496         snum++;
8497       }
8498       free(tscores);
8499     } else {
           /* scan the inverted index; a leading space followed by `b', `e', or `r' marks a
              wild card of beginning match, ending match, or regular expressions */
8500       wild = '\0';
8501       if(word[0] == ' '){
8502         word++;
8503         if(word[0] == 'b'){
8504           wild = 'b';
8505           word++;
8506         } else  if(word[0] == 'e'){
8507           wild = 'e';
8508           word++;
8509         } else  if(word[0] == 'r'){
8510           wild = 'r';
8511           word++;
8512         }
8513       }
           /* wild cards are honored only if the first byte is not more than 0xdf and the
              analysis mode of the database is 0 */
8514       west = ((unsigned char *)word)[0] <= 0xdf;
8515       if(!west || db->amode) wild = '\0';
8516       single = FALSE;
8517       CB_LISTOPEN(grams);
8518       switch(wild){
8519       case 'b':
             /* reduce the list to its first element (the first token, or the word itself if
                there is no token), then append the indexed words beginning with it;
                NOTE(review): presumes CB_LISTDROP removes the last element -- confirm */
8520         est_break_text(word, grams, TRUE, FALSE);
8521         CB_LISTPUSH(grams, word, strlen(word));
8522         while(CB_LISTNUM(grams) > 1){
8523           CB_LISTDROP(grams);
8524         }
8525         wbuf = cbmemdup(CB_LISTVAL(grams, 0), -1);
8526         word = wbuf;
8527         est_expand_word_bw(db, word, grams);
8528         single = TRUE;
8529         break;
8530       case 'e':
             /* reduce the list to its last element (the last token, or the word itself if
                there is no token), then append the indexed words ending with it */
8531         est_break_text(word, grams, TRUE, FALSE);
8532         cblistunshift(grams, word, -1);
8533         while(CB_LISTNUM(grams) > 1){
8534           free(cblistshift(grams, NULL));
8535         }
8536         wbuf = cbmemdup(CB_LISTVAL(grams, 0), -1);
8537         word = wbuf;
8538         est_expand_word_ew(db, word, grams);
8539         single = TRUE;
8540         break;
8541       case 'r':
             /* the tokens are discarded and only the words matching the expressions are used */
8542         est_break_text(word, grams, TRUE, FALSE);
8543         while(CB_LISTNUM(grams) > 0){
8544           free(cblistshift(grams, NULL));
8545         }
8546         est_expand_word_rx(db, word, grams);
8547         single = TRUE;
8548         break;
8549       default:
             /* no wild card: tokenize by the analyzer of the database; if nothing comes out,
                fall back to the indexed words beginning with the word */
8550         switch(db->amode){
8551         case ESTDFPERFNG:
8552           est_break_text_perfng(word, grams, TRUE, FALSE);
8553           break;
8554         case ESTDFCHRCAT:
8555           est_break_text_chrcat(word, grams, TRUE);
8556           break;
8557         default:
8558           est_break_text(word, grams, TRUE, FALSE);
8559           break;
8560         }
8561         if(CB_LISTNUM(grams) < 1){
8562           est_expand_word_bw(db, word, grams);
8563           single = TRUE;
8564         }
8565         break;
8566       }
           /* `tscores' collects the hits of this alternative; `gcnum' counts the tokens
              actually looked up and `wgstep' is the stride over the token list */
8567       tsmax = ESTALLOCUNIT;
8568       CB_MALLOC(tscores, tsmax * sizeof(ESTSCORE));
8569       tsnum = 0;
8570       gcnum = 0;
8571       wgstep = !single && (CB_LISTNUM(grams) > 2 || gstep > 2) ? gstep : 1;
8572       if(west && gstep <= 2) wgstep = 1;
8573       for(j = 0; j < CB_LISTNUM(grams); j += wgstep){
8574         gcnum++;
8575         gram = CB_LISTVAL2(grams, j, gsiz);
             /* hashes of the next two tokens are matched against the hash pairs stored in
                each record; 0xff means that there is no following token to check */
8576         fnext = cblistval(grams, j + 1, &mfsiz);
8577         snext = cblistval(grams, j + 2, &mssiz);
8578         mfhash = fnext ? dpinnerhash(fnext, mfsiz) % ESTJHASHNUM + 1: 0xff;
8579         mshash = snext ? dpouterhash(snext, mssiz) % ESTJHASHNUM + 1: 0xff;
             /* the record of the index database is followed by the one held in `db->idxcc' */
8580         vbuf = est_idx_scan(db->idxdb, gram, gsiz, &vsiz, db->smode);
8581         if((cbuf = cbmapget(db->idxcc, gram, gsiz, &csiz)) != NULL){
8582           CB_REALLOC(vbuf, vsiz + csiz + 1);
8583           memcpy(vbuf + vsiz, cbuf, csiz);
8584           vsiz += csiz;
8585         }
             /* each entry is a variable length ID, a score whose width depends on the score
                mode, and pairs of hash bytes terminated by a 0x00 byte */
8586         rp = vbuf;
8587         while(rp < vbuf + vsiz){
8588           EST_READ_VNUMBUF(rp, id, vstep);
8589           rp += vstep;
8590           switch(db->smode){
8591           case ESTDFSCVOID:
8592             score = 0;
8593             break;
8594           default:
8595             score = *(unsigned char *)rp;
8596             rp++;
8597             break;
8598           case ESTDFSCINT:
8599           case ESTDFSCASIS:
8600             memcpy(&score, rp, sizeof(int));
8601             rp += sizeof(int);
8602             break;
8603           }
               /* with no following token the entry hits unconditionally; otherwise one of
                  the stored hash pairs has to agree with the expected hashes */
8604           hit = mfhash == 0xff && mshash == 0xff;
8605           while(rp < vbuf + vsiz && *(unsigned char *)rp != 0x00){
8606             tfhash = *(unsigned char *)rp;
8607             rp++;
8608             tshash = *(unsigned char *)rp;
8609             rp++;
8610             if((mfhash == 0xff || mfhash == tfhash) && (mshash == 0xff || mshash == tshash))
8611               hit = TRUE;
8612           }
               /* skip the terminator */
8613           rp++;
8614           if(hit || single){
8615             if(tsnum >= tsmax){
8616               tsmax *= 2;
8617               CB_REALLOC(tscores, tsmax * sizeof(ESTSCORE));
8618             }
8619             tscores[tsnum].id = id;
8620             switch(db->smode){
8621             case ESTDFSCVOID:
                   /* no score is stored, so the offset in the record is used instead */
8622               tscores[tsnum].score = rp - vbuf;
8623               break;
8624             default:
8625               tscores[tsnum].score = score * 100 + 10;
8626               break;
8627             case ESTDFSCASIS:
8628               tscores[tsnum].score = score;
8629               break;
8630             }
8631             tsnum++;
8632           }
8633         }
8634         free(vbuf);
8635       }
           /* a word of exactly one token in the analysis mode 0 is tokenized again with the
              last flag TRUE; if that yields two tokens, the second one is searched for by a
              recursive call and counted as one more token */
8636       if(CB_LISTNUM(grams) == 1 && !single && db->amode == 0 && *(unsigned char *)word < 0xe0){
8637         CB_LISTOPEN(tgrams);
8638         est_break_text(word, tgrams, TRUE, TRUE);
8639         if(CB_LISTNUM(tgrams) == 2){
8640           gram = CB_LISTVAL(tgrams, 1);
8641           nscores = est_search_union(db, gram, 1, NULL, &nsnum, NULL, TRUE, -1, NULL);
8642           for(j = 0; j < nsnum; j++){
8643             if(tsnum >= tsmax){
8644               tsmax *= 2;
8645               CB_REALLOC(tscores, tsmax * sizeof(ESTSCORE));
8646             }
8647             tscores[tsnum].id = nscores[j].id;
8648             tscores[tsnum].score = nscores[j].score;
8649             tsnum++;
8650           }
8651           free(nscores);
8652           gcnum++;
8653         }
8654         CB_LISTCLOSE(tgrams);
8655       }
           /* with plural tokens, sort by ID and keep the documents which hit for every token
              (any document if `single'); scores are averaged except in the as-is mode, where
              the first score of each ID is kept */
8656       if(gcnum > 1){
8657         qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8658         nnum = 0;
8659         for(j = 0; j < tsnum; j++){
8660           id = tscores[j].id;
8661           score = tscores[j].score;
8662           hnum = 1;
8663           if(db->smode == ESTDFSCASIS){
8664             for(k = j + 1; k < tsnum && tscores[k].id == id; k++){
8665               hnum++;
8666             }
8667             if(hnum >= gcnum || single){
8668               tscores[nnum].id = id;
8669               tscores[nnum].score = score;
8670               nnum++;
8671             }
8672           } else {
8673             for(k = j + 1; k < tsnum && tscores[k].id == id; k++){
8674               score += tscores[k].score;
8675               hnum++;
8676             }
8677             if(hnum >= gcnum || single){
8678               tscores[nnum].id = id;
8679               tscores[nnum].score = score / hnum;
8680               nnum++;
8681             }
8682           }
               /* continue after the run of the same ID */
8683           j = k - 1;
8684         }
8685         tsnum = nnum;
8686       }
8687       if(hints){
8688         sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
8689         cbmapput(hints, word, -1, numbuf, -1, TRUE);
8690       }
8691       CB_LISTCLOSE(grams);
8692       if(db->smode != ESTDFSCASIS && !strchr(word, ' ') && auxmin > 0)
8693         est_weight_keywords(db, word, tscores, tsnum);
8694       for(j = 0; j < tsnum; j++){
8695         if(snum >= smax){
8696           smax *= 2;
8697           CB_REALLOC(scores, smax * sizeof(ESTSCORE));
8698         }
8699         scores[snum].id = tscores[j].id;
8700         scores[snum].score = tscores[j].score;
8701         snum++;
8702       }
           /* the result is cached under the original key; `tscores' is released by the callee */
8703       est_rescc_put(db, ckey, cksiz, tscores, tsnum);
8704     }
8705     free(wbuf);
8706   }
8707   CB_LISTCLOSE(words);
       /* merge the alternatives: sort by ID and replace each run of the same ID by one element
          whose score is the average of the run */
8708   qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8709   nnum = 0;
8710   for(i = 0; i < snum; i++){
8711     id = scores[i].id;
8712     score = scores[i].score;
8713     hnum = 1;
8714     for(j = i + 1; j < snum && scores[j].id == id; j++){
8715       score += scores[j].score;
8716       hnum++;
8717     }
8718     scores[nnum].id = id;
8719     scores[nnum].score = score / hnum;
8720     scores[nnum].value = NULL;
8721     nnum++;
8722     i = j - 1;
8723   }
8724   *nump = nnum;
       /* except in the as-is mode, rescale the scores by their mean and standard deviation
          around the half of ESTSCOREUNIT; nearly uniform scores all become that half */
8725   if(db->smode != ESTDFSCASIS && nnum > 0){
8726     avg = 0.0;
8727     for(i = 0; i < nnum; i++){
8728       avg += scores[i].score;
8729     }
8730     avg /= nnum;
8731     sd = 0.0;
8732     for(i = 0; i < nnum; i++){
8733       dif = avg - scores[i].score;
8734       sd += dif * dif;
8735     }
8736     sd /= nnum;
8737     sd = sqrt(sd);
8738     if(sd < 0.1){
8739       for(i = 0; i < nnum; i++){
8740         scores[i].score = ESTSCOREUNIT / 2;
8741       }
8742     } else {
8743       for(i = 0; i < nnum; i++){
8744         scores[i].score = (int)(((scores[i].score - avg) * (ESTSCOREUNIT / 10.0) / sd) +
8745                                 ESTSCOREUNIT / 2.0);
8746       }
8747     }
8748   }
8749   return scores;
8750 }
8751 
8752 
8753 /* Get scores in the result cache.
8754    `db' specifies a database object.
8755    `word' specifies a search word.
8756    `size' specifies the size of the word.
8757    `nump' specifies the pointer to which the number of elements in the result is assigned.
8758    The return value is an array whose elements are ID numbers of corresponding documents. */
est_rescc_get(ESTDB * db,const char * word,int size,int * nump)8759 static const ESTSCORE *est_rescc_get(ESTDB *db, const char *word, int size, int *nump){
8760   const char *vbuf;
8761   int vsiz;
8762   assert(db && word && size >= 0 && nump);
8763   if(!(vbuf = cbmapget(db->rescc, word, size, &vsiz))) return NULL;
8764   if(vsiz == sizeof(ESTSCORE) && ((ESTSCORE *)vbuf)->id == -1) return NULL;
8765   cbmapmove(db->rescc, word, size, FALSE);
8766   *nump = vsiz / sizeof(ESTSCORE);
8767   return (ESTSCORE *)vbuf;
8768 }
8769 
8770 
8771 /* Add scores into the result cache.
8772    `db' specifies a database object.
8773    `word' specifies a search word.
8774    `size' specifies the size of the word.
8775    `scores' specifies an array of scores.  It is released in this function.
8776    `num' specifies the number of elements of the score array. */
est_rescc_put(ESTDB * db,const char * word,int size,ESTSCORE * scores,int num)8777 static void est_rescc_put(ESTDB *db, const char *word, int size, ESTSCORE *scores, int num){
8778   int i;
8779   assert(db && word && size >= 0 && scores && num >= 0);
8780   if(db->rcmnum < 1){
8781     free(scores);
8782     return;
8783   }
8784   cbmapput(db->rescc, word, size, (char *)scores, num * sizeof(ESTSCORE), TRUE);
8785   free(scores);
8786   if(cbmaprnum(db->rescc) > db->rcmnum){
8787     num = db->rcmnum * 0.1 + 1;
8788     cbmapiterinit(db->rescc);
8789     for(i = 0; i < num && (word = cbmapiternext(db->rescc, &size)) != NULL; i++){
8790       cbmapout(db->rescc, word, size);
8791     }
8792   }
8793 }
8794 
8795 
8796 /* Search the auxiliary index.
8797    `db' specifies a database object.
8798    `word' specifies a search word.
8799    `min' specifies the minimum hits to adopt the auxiliary index.
8800    `nump' specifies the pointer to which the number of elements in the result is assigned.
8801    The return value is an array of score structures of corresponding documents. */
est_search_keywords(ESTDB * db,const char * word,int min,int * nump)8802 static ESTSCORE *est_search_keywords(ESTDB *db, const char *word, int min, int *nump){
8803   ESTSCORE *scores;
8804   CBLIST *words;
8805   CBDATUM *rbuf;
8806   const int *res;
8807   int i, rnum, snum, wsiz, nnum, lid;
8808   assert(db && word && min >= 0 && nump);
8809   if(*word != ' ' && (res = (int *)vlgetcache(db->auxdb, word, -1, &rnum)) != NULL &&
8810      (rnum /= sizeof(int)) / 2 >= min){
8811     CB_MALLOC(scores, (rnum / 2) * sizeof(ESTSCORE) + 1);
8812     snum = 0;
8813     for(i = 0; i < rnum; i += 2){
8814       scores[snum].id = res[i];
8815       scores[snum].score = res[i+1];
8816       snum++;
8817     }
8818     *nump = snum;
8819     return scores;
8820   }
8821   CB_LISTOPEN(words);
8822   if(*word == ' '){
8823     word++;
8824     if(*word == 'b'){
8825       est_expand_keyword_bw(db, word + 1, words);
8826     } else if(*word == 'e'){
8827       est_expand_keyword_ew(db, word + 1, words);
8828     } else if(*word == 'r'){
8829       est_expand_keyword_rx(db, word + 1, words);
8830     }
8831   } else if(*(unsigned char *)word >= 0xe3){
8832     est_expand_keyword_bw(db, word, words);
8833   }
8834   CB_DATUMOPEN(rbuf);
8835   for(i = 0; i < CB_LISTNUM(words) &&
8836         CB_DATUMSIZE(rbuf) <= sizeof(int) * 2 * min * ESTAUXEXRAT; i++){
8837     word = CB_LISTVAL2(words, i, wsiz);
8838     if(!(res = (int *)vlgetcache(db->auxdb, word, wsiz, &rnum))) continue;
8839     CB_DATUMCAT(rbuf, (char *)res, rnum);
8840   }
8841   res = (int *)CB_DATUMPTR(rbuf);
8842   rnum = CB_DATUMSIZE(rbuf);
8843   if((rnum /= sizeof(int)) / 2 < min){
8844     CB_DATUMCLOSE(rbuf);
8845     CB_LISTCLOSE(words);
8846     return NULL;
8847   }
8848   CB_MALLOC(scores, (rnum / 2) * sizeof(ESTSCORE) + 1);
8849   snum = 0;
8850   for(i = 0; i < rnum; i += 2){
8851     scores[snum].id = res[i];
8852     scores[snum].score = res[i+1];
8853     snum++;
8854   }
8855   qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8856   nnum = 0;
8857   lid = -1;
8858   for(i = 0; i < snum; i++){
8859     if(nnum > 0 && scores[i].id == lid){
8860       scores[nnum-1].score += scores[i].score;
8861       continue;
8862     }
8863     scores[nnum].id = scores[i].id;
8864     scores[nnum].score = scores[i].score;
8865     nnum++;
8866     lid = scores[i].id;
8867   }
8868   CB_DATUMCLOSE(rbuf);
8869   CB_LISTCLOSE(words);
8870   *nump = nnum;
8871   return scores;
8872 }
8873 
8874 
8875 /* Weight scores with the auxiliary index.
8876    `db' specifies a database object.
8877    `word' specifies a search word.
8878    `scores' specifies an array of scores of search candidates.
8879    `snum' specifies the number of the array. */
est_weight_keywords(ESTDB * db,const char * word,ESTSCORE * scores,int snum)8880 static void est_weight_keywords(ESTDB *db, const char *word, ESTSCORE *scores, int snum){
8881   ESTSCORE *kscores;
8882   const int *res;
8883   int i, knum, nnum;
8884   double rank;
8885   if(!(res = (int *)vlgetcache(db->auxdb, word, -1, &knum)) || knum < 2) return;
8886   knum /= sizeof(int);
8887   CB_MALLOC(kscores, knum / 2 * sizeof(ESTSCORE));
8888   rank = knum / 2 + 1;
8889   nnum = 0;
8890   for(i = 0; i < knum; i += 2){
8891     kscores[nnum].id = res[i];
8892     kscores[nnum].score = (pow(rank, 0.7) / 8.0 + 1.0) * 10000.0;
8893     nnum++;
8894     rank -= 1.0;
8895   }
8896   knum = nnum;
8897   qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8898   qsort(kscores, knum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8899   nnum = 0;
8900   for(i = 0; i < snum; i++){
8901     while(nnum < knum && kscores[nnum].id < scores[i].id){
8902       nnum++;
8903     }
8904     if(nnum < knum && kscores[nnum].id == scores[i].id)
8905       scores[i].score *= kscores[nnum].score / 10000.0;
8906   }
8907   free(kscores);
8908 }
8909 
8910 
8911 /* Get scores correspinding a ranking search with an attribute narrowing index.
8912    `db' specifies a database object.
8913    `name' specifies the name of an attribute.
8914    `nump' specifies the pointer to which the number of elements in the result is assigned.
8915    The return value is an array of score structures of corresponding documents. */
est_search_rank(ESTDB * db,const char * name,int top,int * nump)8916 static ESTSCORE *est_search_rank(ESTDB *db, const char *name, int top, int *nump){
8917   ESTATTRIDX *attridx;
8918   ESTSCORE *scores;
8919   const char *kbuf;
8920   int snum, ksiz, id;
8921   assert(db && name && nump);
8922   if(top == 0 || !(attridx = (ESTATTRIDX *)cbmapget(db->aidxs, name, -1, NULL)) ||
8923      (attridx->type != ESTIDXATTRSTR && attridx->type != ESTIDXATTRNUM)){
8924     *nump = 0;
8925     return cbmalloc(1);
8926   }
8927   snum = abs(top);
8928   if(snum > db->dnum) snum = db->dnum;
8929   CB_MALLOC(scores, snum * sizeof(ESTSCORE) + 1);
8930   snum = 0;
8931   if(top > 0){
8932     vlcurfirst(attridx->db);
8933     while(snum < top && (kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
8934       if(ksiz < sizeof(int)){
8935         vlcurnext(attridx->db);
8936         continue;
8937       }
8938       memcpy(&id, kbuf + ksiz - sizeof(int), sizeof(int));
8939       if(id < 1){
8940         vlcurnext(attridx->db);
8941         continue;
8942       }
8943       scores[snum].id = id;
8944       scores[snum].score = 0;
8945       scores[snum].value = NULL;
8946       snum++;
8947       vlcurnext(attridx->db);
8948     }
8949   } else {
8950     top *= -1;
8951     vlcurlast(attridx->db);
8952     while(snum < top && (kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
8953       if(ksiz < sizeof(int)){
8954         vlcurprev(attridx->db);
8955         continue;
8956       }
8957       memcpy(&id, kbuf + ksiz - sizeof(int), sizeof(int));
8958       if(id < 1){
8959         vlcurprev(attridx->db);
8960         continue;
8961       }
8962       scores[snum].id = id;
8963       scores[snum].score = 0;
8964       scores[snum].value = NULL;
8965       snum++;
8966       vlcurprev(attridx->db);
8967     }
8968   }
8969   *nump = snum;
8970   return scores;
8971 }
8972 
8973 
8974 /* Get scores corresponding to an attribute expression with an attribute narrowing index.
8975    `db' specifies a database object.
8976    `expr' specifies an attribute search expression.  It is composed of the name of an attribute,
        an operator, and a value, which are separated by spaces.
8977    `nump' specifies the pointer to which the number of elements in the result is assigned.
8978    The return value is an array of score structures of corresponding documents or `NULL' if no
8979    index is available. */
est_search_aidx_attr(ESTDB * db,const char * expr,int * nump)8980 static ESTSCORE *est_search_aidx_attr(ESTDB *db, const char *expr, int *nump){
8981   ESTATTRIDX *attridx;
8982   ESTSCORE *scores;
8983   CBDATUM *abuf;
8984   CBLIST *tokens;
8985   void *regex;
8986   const char *cop, *pv, *kbuf, *tbuf;
8987   unsigned char *utmp;
8988   char *name, *oper, *val, *sval, *wp, numbuf[ESTNUMBUFSIZ];
8989   int i, nsiz, vsiz, ksiz, tsiz, sign, ic, ssiz, esc, jmp, len, *ary, anum;
8990   time_t num, lower, upper;
8991   assert(db && expr && nump);
       /* split the expression into the name, the operator, and the value; the value is the
          whole rest of the expression and may contain spaces */
8992   name = NULL;
8993   oper = NULL;
8994   val = NULL;
8995   nsiz = 0;
8996   vsiz = 0;
8997   while(*expr > 0 && *expr <= ' '){
8998     expr++;
8999   }
9000   if((pv = strchr(expr, ' ')) != NULL){
9001     nsiz = pv - expr;
9002     name = cbmemdup(expr, nsiz);
9003     expr = pv;
9004     while(*expr > 0 && *expr <= ' '){
9005       expr++;
9006     }
9007     if((pv = strchr(expr, ' ')) != NULL){
9008       oper = cbmemdup(expr, pv - expr);
9009       expr = pv;
9010       while(*expr > 0 && *expr <= ' '){
9011         expr++;
9012       }
9013       vsiz = strlen(expr);
9014       val = cbmemdup(expr, vsiz);
9015     } else {
9016       oper = cbmemdup(expr, -1);
9017     }
9018   } else {
9019     nsiz = strlen(expr);
9020     name = cbmemdup(expr, nsiz);
9021   }
9022   if(!oper){
9023     oper = cbmemdup("", 0);
9024   }
9025   if(!val){
9026     vsiz = 0;
9027     val = cbmemdup("", 0);
9028   }
       /* a leading `!' negates the condition; a following `I' or `i' requests case insensitive
          matching, which is turned off if est_check_cjk_only returns true for the value */
9029   cop = oper;
9030   if(*cop == '!'){
9031     sign = FALSE;
9032     cop++;
9033   } else {
9034     sign = TRUE;
9035   }
9036   if(*cop == 'I' || *cop == 'i'){
9037     ic = !est_check_cjk_only(val);
9038     cop++;
9039   } else {
9040     ic = FALSE;
9041   }
       /* replace `cop' by the matching operator constant itself so that it can be compared by
          pointer below; an unknown operator is treated as ESTOPSTRINC with an empty value.
          NOTE(review): the pointer comparisons presume that each ESTOP* macro always yields
          the same address -- confirm against the definitions of the macros */
9042   regex = NULL;
9043   if(!cbstricmp(cop, ESTOPSTREQ)){
9044     cop = ESTOPSTREQ;
9045   } else if(!cbstricmp(cop, ESTOPSTRNE)){
9046     cop = ESTOPSTRNE;
9047   } else if(!cbstricmp(cop, ESTOPSTRINC)){
9048     cop = ESTOPSTRINC;
9049   } else if(!cbstricmp(cop, ESTOPSTRBW)){
9050     cop = ESTOPSTRBW;
9051   } else if(!cbstricmp(cop, ESTOPSTREW)){
9052     cop = ESTOPSTREW;
9053   } else if(!cbstricmp(cop, ESTOPSTRAND)){
9054     cop = ESTOPSTRAND;
9055   } else if(!cbstricmp(cop, ESTOPSTROR)){
9056     cop = ESTOPSTROR;
9057   } else if(!cbstricmp(cop, ESTOPSTROREQ)){
9058     cop = ESTOPSTROREQ;
9059   } else if(!cbstricmp(cop, ESTOPSTRRX)){
9060     cop = ESTOPSTRRX;
9061     regex = est_regex_new(val);
9062   } else if(!cbstricmp(cop, ESTOPNUMEQ)){
9063     cop = ESTOPNUMEQ;
9064   } else if(!cbstricmp(cop, ESTOPNUMNE)){
9065     cop = ESTOPNUMNE;
9066   } else if(!cbstricmp(cop, ESTOPNUMGT)){
9067     cop = ESTOPNUMGT;
9068   } else if(!cbstricmp(cop, ESTOPNUMGE)){
9069     cop = ESTOPNUMGE;
9070   } else if(!cbstricmp(cop, ESTOPNUMLT)){
9071     cop = ESTOPNUMLT;
9072   } else if(!cbstricmp(cop, ESTOPNUMLE)){
9073     cop = ESTOPNUMLE;
9074   } else if(!cbstricmp(cop, ESTOPNUMBT)){
9075     cop = ESTOPNUMBT;
9076   } else {
9077     cop = ESTOPSTRINC;
9078     val[0] = '\0';
9079     vsiz = 0;
9080   }
9081   num = cbstrmktime(val);
       /* give up if the attribute has no index for strings or numbers, or if an index for
          numbers is asked a non-numeric operator */
9082   if(!(attridx = (ESTATTRIDX *)cbmapget(db->aidxs, name, nsiz, NULL)) ||
9083      (attridx->type != ESTIDXATTRSTR && attridx->type != ESTIDXATTRNUM) ||
9084      (attridx->type == ESTIDXATTRNUM &&
9085       cop != ESTOPNUMEQ && cop != ESTOPNUMNE && cop != ESTOPNUMGT && cop != ESTOPNUMGE &&
9086       cop != ESTOPNUMLT && cop != ESTOPNUMLE && cop != ESTOPNUMBT)){
9087     if(regex) est_regex_delete(regex);
9088     free(val);
9089     free(oper);
9090     free(name);
9091     return NULL;
9092   }
       /* `abuf' gathers the IDs of matching documents; the ID is taken from the last
          sizeof(int) bytes of each key of the index */
9093   CB_DATUMOPEN(abuf);
9094   if(!sign || ic){
         /* negated or case insensitive conditions are checked against the keys one by one */
9095     if(ic){
9096       utmp = (unsigned char *)est_uconv_in(val, vsiz, &tsiz);
9097       est_normalize_text(utmp, tsiz, &tsiz);
9098       est_canonicalize_text(utmp, tsiz, FALSE);
9099       sval = (char *)est_uconv_out((char *)utmp, tsiz, &ssiz);
9100       free(utmp);
9101     } else {
9102       sval = NULL;
9103       ssiz = 0;
9104     }
         /* for a positive equality or beginning match, the scan is narrowed by the first byte
            of the value: `esc' is the first byte beyond which the scan is abandoned and `jmp'
            is the first byte beyond which the cursor jumps to the lower case range */
9105     esc = INT_MAX;
9106     jmp = INT_MAX;
9107     if(sign && (cop == ESTOPSTREQ || cop == ESTOPSTRBW) && vsiz > 0){
9108       if(*sval > 0x0 && *sval < 0x7f){
9109         numbuf[0] = *sval;
9110         numbuf[1] = '\0';
9111         esc = *(unsigned char *)sval;
9112         if(*sval >= 'a' && *sval <= 'z'){
               /* start from the upper case counterpart of the first letter */
9113           numbuf[0] -= 'a' - 'A';
9114           jmp = *sval - 'a' + 'A';
9115         }
9116         vlcurjump(attridx->db, numbuf, 1, VL_JFORWARD);
9117       } else if(*(unsigned char *)sval >= 0xc0){
9118         numbuf[0] = *sval;
9119         numbuf[1] = '\0';
9120         esc = *(unsigned char *)sval;
9121         vlcurjump(attridx->db, numbuf, 1, VL_JFORWARD);
9122       } else {
9123         vlcurfirst(attridx->db);
9124       }
9125     } else {
9126       vlcurfirst(attridx->db);
9127     }
9128     while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
9129       if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
9130                         cop, sign, val, vsiz, sval, ssiz, regex, num))
9131         CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
           /* `sval' is dereferenced only while `jmp' is not INT_MAX, which implies that it
              was set */
9132       if(*(unsigned char *)kbuf > jmp && *(unsigned char *)kbuf < *(unsigned char *)sval){
9133         numbuf[0] = *sval;
9134         numbuf[1] = '\0';
9135         vlcurjump(attridx->db, numbuf, 1, VL_JFORWARD);
9136         jmp = INT_MAX;
9137       } else if(*(unsigned char *)kbuf > esc){
9138         break;
9139       } else {
9140         vlcurnext(attridx->db);
9141       }
9142     }
9143     if(sval) free(sval);
9144   } else if(cop == ESTOPSTROREQ){
         /* the value is split by spaces and commas; for each token in sorted order, the cursor
            jumps to it and the keys whose string is equal to the token are gathered */
9145     tokens = cbsplit(val, vsiz, " ,");
9146     cblistsort(tokens);
9147     for(i = 0; i < CB_LISTNUM(tokens); i++){
9148       tbuf = CB_LISTVAL2(tokens, i, tsiz);
9149       vlcurjump(attridx->db, tbuf, tsiz, VL_JFORWARD);
9150       while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL && !strcmp(kbuf, tbuf)){
9151         CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
9152         vlcurnext(attridx->db);
9153       }
9154     }
9155     CB_LISTCLOSE(tokens);
9156   } else if(cop == ESTOPNUMBT){
         /* the value holds the lower and the upper bounds separated by a space or a tab; the
            upper bound is INT_MAX if it is omitted.  The value is cut at the separator */
9157     if((wp = strchr(val, ' ')) != NULL || (wp = strchr(val, '\t')) != NULL){
9158       *(wp++) = '\0';
9159       while(*wp == ' ' || *wp == '\t'){
9160         wp++;
9161       }
9162       lower = cbstrmktime(val);
9163       upper = cbstrmktime(wp);
9164     } else {
9165       lower = cbstrmktime(val);
9166       upper = INT_MAX;
9167     }
9168     len = sprintf(numbuf, "%.0f", (double)lower);
9169     vlcurjump(attridx->db, numbuf, len, VL_JFORWARD);
9170     while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL && cbstrmktime(kbuf) <= upper){
9171       CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
9172       vlcurnext(attridx->db);
9173     }
9174   } else {
         /* the other positive conditions: put the cursor as near to the first candidate as the
            operator allows, then check the keys forward, or backward for "less than" */
9175     if(cop == ESTOPSTREQ || cop == ESTOPSTRBW ||
9176        cop == ESTOPNUMEQ || cop == ESTOPNUMGT || cop == ESTOPNUMGE){
9177       vlcurjump(attridx->db, val, vsiz, VL_JFORWARD);
9178       if(cop == ESTOPNUMGT){
             /* skip the keys which are not more than the value */
9179         while((kbuf = vlcurkeycache(attridx->db, NULL)) != NULL && cbstrmktime(kbuf) <= num){
9180           vlcurnext(attridx->db);
9181         }
9182       }
9183     } else if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
9184       len = sprintf(numbuf, "%.0f", (double)cbstrmktime(val) + 1);
9185       vlcurjump(attridx->db, numbuf, len, VL_JBACKWARD);
9186       if(cop == ESTOPNUMLT){
             /* skip the keys which are not less than the value */
9187         while((kbuf = vlcurkeycache(attridx->db, NULL)) != NULL && cbstrmktime(kbuf) >= num){
9188           vlcurprev(attridx->db);
9189         }
9190       }
9191     } else {
9192       vlcurfirst(attridx->db);
9193     }
9194     while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
9195       if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
9196                         cop, TRUE, val, vsiz, NULL, 0, regex, num)){
9197         CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
9198       } else if(cop == ESTOPSTREQ || cop == ESTOPSTRBW || cop == ESTOPNUMEQ){
             /* for these operators the scan ends at the first mismatch after the jump */
9199         break;
9200       }
9201       if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
9202         vlcurprev(attridx->db);
9203       } else {
9204         vlcurnext(attridx->db);
9205       }
9206     }
9207   }
       /* convert the gathered IDs into score structures whose score is 0 */
9208   ary = (int *)CB_DATUMPTR(abuf);
9209   anum = CB_DATUMSIZE(abuf) / sizeof(int);
9210   CB_MALLOC(scores, anum * sizeof(ESTSCORE) + 1);
9211   for(i = 0; i < anum; i++){
9212     scores[i].id = ary[i];
9213     scores[i].score = 0;
9214     scores[i].value = NULL;
9215   }
9216   *nump = anum;
9217   CB_DATUMCLOSE(abuf);
9218   if(regex) est_regex_delete(regex);
9219   free(val);
9220   free(oper);
9221   free(name);
9222   return scores;
9223 }
9224 
9225 
9226 /* Get a corresponding set of documents in pseudo indexes.
9227    `db' specifies a database object.
9228    `cond' specifies a search condition object.
9229    `scores' specifies an array of scores of search candidates.
9230    `nump' specifies the pointer to which the number of elements in the parameter and result is
9231    assigned.
9232    `ordattrs' specifies a map object into which ordering attributes are stored.
9233    The return value is an array of re-allocated score structures. */
est_search_pidxs(ESTDB * db,ESTCOND * cond,ESTSCORE * scores,int * nump,CBMAP * ordattrs)9234 static ESTSCORE *est_search_pidxs(ESTDB *db, ESTCOND *cond, ESTSCORE *scores, int *nump,
9235                                   CBMAP *ordattrs){
9236   ESTCATTR *list;
9237   ESTDOC *doc;
9238   const char *otype, *lbuf, *vbuf;
9239   char *oname, *wp;
9240   int i, j, k, snum, anum, id, hit, sc, miss, lsiz, vsiz;
9241   double avg, sd, dif, tune;
9242   assert(db && cond && scores && nump && ordattrs);
9243   snum = *nump;
9244   CB_REALLOC(scores, (snum + CB_LISTNUM(db->pdocs)) * sizeof(ESTSCORE) + 1);
9245   if(cond->phrase){
9246     if(cbstrfwmatch(cond->phrase, ESTOPID)){
9247       return scores;
9248     } else if(cbstrfwmatch(cond->phrase, ESTOPURI)){
9249       return scores;
9250     } else if(cbstrfwmatch(cond->phrase, ESTOPSIMILAR)){
9251       return scores;
9252     }
9253   }
9254   oname = NULL;
9255   otype = NULL;
9256   if(cond->order){
9257     oname = cbmemdup(cond->order, -1);
9258     cbstrtrim(oname);
9259     otype = ESTORDSTRA;
9260     if((wp = strchr(oname, ' ')) != NULL){
9261       *(wp++) = '\0';
9262       while(*wp == ' '){
9263         wp++;
9264       }
9265       otype = wp;
9266     }
9267   }
9268   list = NULL;
9269   anum = -1;
9270   if(cond->attrs) list = est_make_cattr_list(cond->attrs, &anum);
9271   for(i = 0; i < CB_LISTNUM(db->pdocs); i++){
9272     id = ESTPDOCIDMIN + i;
9273     hit = FALSE;
9274     sc = 0;
9275     doc = NULL;
9276     if(!cond->phrase || cond->phrase[0] == '\0'){
9277       hit = cond->attrs ? TRUE : FALSE;
9278     } else if(cbstrfwmatch(cond->phrase, ESTOPUVSET)){
9279       hit = TRUE;
9280     } else {
9281       if((doc = est_db_get_doc(db, id, 0)) != NULL){
9282         hit = est_db_score_doc(db, doc, cond, &sc);
9283       } else {
9284         hit = FALSE;
9285       }
9286     }
9287     if(hit && list){
9288       if(!doc && !(doc = est_db_get_doc(db, id, 0))){
9289         hit = FALSE;
9290       } else {
9291         miss = FALSE;
9292         for(j = 0; !miss && j < anum; j++){
9293           if(list[j].nsiz < 1) continue;
9294           if(list[j].nlist){
9295             hit = FALSE;
9296             for(k = 0; k < CB_LISTNUM(list[j].nlist); k++){
9297               lbuf = CB_LISTVAL2(list[j].nlist, k, lsiz);
9298               if(lsiz < 1) continue;
9299               if(!(vbuf = cbmapget(doc->attrs, lbuf, lsiz, &vsiz))) continue;
9300               if(est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
9301                                 list[j].sval, list[j].ssiz, list[j].regex, list[j].num)){
9302                 hit = TRUE;
9303                 break;
9304               }
9305             }
9306             if(!hit) miss = TRUE;
9307           } else if(!(vbuf = cbmapget(doc->attrs, list[j].name, list[j].nsiz, &vsiz))){
9308             miss = TRUE;
9309           } else if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign,
9310                                     list[j].val, list[j].vsiz, list[j].sval, list[j].ssiz,
9311                                     list[j].regex, list[j].num)){
9312             miss = TRUE;
9313           }
9314         }
9315         hit = !miss;
9316       }
9317     }
9318     if(hit){
9319       scores[snum].id = id;
9320       scores[snum].score = sc;
9321       scores[snum].value = NULL;
9322       snum++;
9323       if(oname && (doc || (doc = est_db_get_doc(db, id, 0)) != NULL)){
9324         if(!(vbuf = cbmapget(doc->attrs, oname, -1, &vsiz))){
9325           vbuf = "";
9326           vsiz = 0;
9327         }
9328         cbmapput(ordattrs, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
9329       }
9330     }
9331     if(doc) est_doc_delete(doc);
9332   }
9333   if(list) est_free_cattr_list(list, anum);
9334   if(oname) free(oname);
9335   if(db->smode != ESTDFSCASIS && snum > *nump){
9336     avg = 0.0;
9337     for(i = *nump; i < snum; i++){
9338       avg += scores[i].score;
9339     }
9340     avg /= snum - *nump;
9341     sd = 0.0;
9342     for(i = *nump; i < snum; i++){
9343       dif = avg - scores[i].score;
9344       sd += dif * dif;
9345     }
9346     sd /= snum - *nump;
9347     sd = sqrt(sd);
9348     if(sd < 0.1){
9349       for(i = *nump; i < snum; i++){
9350         scores[i].score = ESTSCOREUNIT / 2;
9351       }
9352     } else {
9353       for(i = *nump; i < snum; i++){
9354         scores[i].score = (int)(((scores[i].score - avg) * (ESTSCOREUNIT / 10.0) / sd) +
9355                                 ESTSCOREUNIT / 2.0);
9356       }
9357     }
9358     if(cond->tfidf){
9359       tune = pow(snum - *nump + 64, 0.4);
9360       for(i = *nump; i < snum; i++){
9361         scores[i].score *= 100.0 / tune;
9362       }
9363     } else {
9364       for(i = *nump; i < snum; i++){
9365         scores[i].score *= 10;
9366       }
9367     }
9368   }
9369   *nump = snum;
9370   return scores;
9371 }
9372 
9373 
/* Narrow and sort scores of search candidates.
   `db' specifies a database object.
   `attrs' specifies a list object of narrowing attributes.  If it is `NULL', no narrowing by
   attributes is performed.
   `ign' specifies the offset of an attribute to be ignored.  It is skipped only by the pass
   using attribute indexes; the pass checking each document still evaluates it.
   `order' specifies an expression for sorting.  It is an attribute name or an ordering
   keyword, optionally followed by a space and an ordering type.  If it is `NULL', no sorting
   by the expression is performed.
   `distinct' specifies the name of the distinct attribute.  If it is `NULL', no elimination
   of duplicated values is performed.  A leading `~' is not part of the name.
   `scores' specifies an array of scores of search candidates.  It is modified in place.
   `snum' specifies the number of the array.
   `limit' specifies the limit number to check.
   `restp' specifies the pointer to a variable to which rest number to be checked is assigned.
   `ordattrs' specifies a map object of cached ordering attributes.
   The return value is the new number of the array. */
static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, int ign,
                             const char *order, const char *distinct, ESTSCORE *scores, int snum,
                             int limit, int *restp, CBMAP *ordattrs){
  ESTCATTR *list;
  ESTATTRIDX *attridx;
  CBMAP *umap;
  const char *otype, *cbuf, *ibuf, *lbuf;
  char *oname, *wp, *mbuf, *vbuf;
  int i, j, k, ci, oi, anum, done, mixed, nnum, csiz, msiz;
  int miss, vsiz, num, isiz, lsiz, hit, onlen, dnlen;
  time_t tval;
  assert(db && scores && snum >= 0 && limit >= 0 && restp && ordattrs);
  *restp = 0;
  /* `ci' is the offset of the condition on the specially cached attribute, `oi' is the offset
     of the condition on the ordering attribute; -1 means there is no such condition */
  ci = -1;
  oi = -1;
  oname = NULL;
  otype = NULL;
  if(order){
    /* split the expression into the name and the ordering type, which defaults to
       `ESTORDSTRA' when only a name is given */
    oname = cbmemdup(order, -1);
    cbstrtrim(oname);
    otype = ESTORDSTRA;
    if((wp = strchr(oname, ' ')) != NULL){
      *(wp++) = '\0';
      while(*wp == ' '){
        wp++;
      }
      otype = wp;
    }
  }
  if(attrs){
    list = est_make_cattr_list(attrs, &anum);
    if(cbmaprnum(db->aidxs) > 0){
      /* first pass: apply every condition that an attribute index can serve.  `done' stays
         true only if no condition except the ignored one had to be left for the second pass */
      done = TRUE;
      mixed = FALSE;
      for(i = 0; i < anum; i++){
        if(i == ign) continue;
        /* leave the condition to the second pass if the attribute has no index, if a numeric
           index is asked a non-numeric operator, or if a non-sequential index would be used
           for fewer than `ESTAISNUMMIN' candidates */
        if(!(attridx = (ESTATTRIDX *)cbmapget(db->aidxs, list[i].name, list[i].nsiz, NULL)) ||
           (attridx->type == ESTIDXATTRNUM &&
            list[i].cop != ESTOPNUMEQ && list[i].cop != ESTOPNUMNE &&
            list[i].cop != ESTOPNUMGT && list[i].cop != ESTOPNUMGE &&
            list[i].cop != ESTOPNUMLT && list[i].cop != ESTOPNUMLE &&
            list[i].cop != ESTOPNUMBT) ||
           (attridx->type != ESTIDXATTRSEQ && snum < ESTAISNUMMIN)){
          done = FALSE;
          continue;
        }
        switch(attridx->type){
        case ESTIDXATTRSTR:
        case ESTIDXATTRNUM:
          snum = est_aidx_attr_narrow(attridx->db, db->pdocs, list[i].cop, list[i].sign,
                                      list[i].val, list[i].vsiz, list[i].sval, list[i].ssiz,
                                      list[i].regex, list[i].num, scores, snum);
          /* NOTE(review): presumably this narrowing does not keep the score order, which is
             why `mixed' triggers a re-sort by score below -- confirm against the helper */
          mixed = TRUE;
          break;
        default:
          /* the limit is handed over only when this is the last condition and every earlier
             one was served by an index; the candidates are put back in score order first */
          if(done && i == anum - 1 && !order && mixed){
            qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
            mixed = FALSE;
          }
          snum = est_aidx_seq_narrow(attridx->db, db->pdocs, list[i].cop, list[i].sign,
                                     list[i].val, list[i].vsiz, list[i].sval, list[i].ssiz,
                                     list[i].regex, list[i].num, scores, snum,
                                     done && i == anum - 1 ? limit : INT_MAX, restp);
          break;
        }
        /* the condition has been applied; the dummy operator makes later checks accept it */
        list[i].cop = ESTOPDUMMY;
      }
      if(mixed && !order) qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
    } else {
      done = FALSE;
    }
    if(db->spacc){
      /* find the condition on the attribute held in the special cache */
      for(i = 0; i < anum; i++){
        if(!strcmp(list[i].name, db->scname)){
          ci = i;
          break;
        }
      }
    }
    if(oname){
      /* find the condition on the ordering attribute, so that its value can be kept */
      for(i = 0; i < anum; i++){
        if(!strcmp(list[i].name, oname)){
          oi = i;
          break;
        }
      }
    }
    if(!done){
      /* second pass: check each candidate against the conditions and compact the survivors
         to the front of the array */
      nnum = 0;
      for(i = 0; i < snum; i++){
        if(nnum >= limit){
          /* enough survivors; report how many candidates were left unchecked */
          *restp = snum - i;
          break;
        }
        scores[i].value = NULL;
        if(ci >= 0){
          if((cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL)
            cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
        } else {
          cbuf = NULL;
          csiz = 0;
        }
        mbuf = NULL;
        if(scores[i].id >= ESTPDOCIDMIN){
          /* IDs in the pseudo document range are kept without checking here */
          scores[nnum++] = scores[i];
        } else if((cbuf && anum == 1) ||
                  (mbuf = est_crget(db->attrdb, db->zmode, scores[i].id, &msiz)) != NULL){
          /* the cached value alone suffices only when it belongs to the sole condition;
             otherwise the attribute record of the document is loaded into `mbuf' */
          miss = FALSE;
          for(j = 0; !miss && j < anum; j++){
            if(list[j].nsiz < 1) continue;
            if(list[j].nlist){
              /* several attribute names: one matching attribute is enough */
              hit = FALSE;
              for(k = 0; k < CB_LISTNUM(list[j].nlist); k++){
                lbuf = CB_LISTVAL2(list[j].nlist, k, lsiz);
                if(lsiz < 1) continue;
                if(!(vbuf = cbmaploadone(mbuf, msiz, lbuf, lsiz, &vsiz))) continue;
                if(est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
                                  list[j].sval, list[j].ssiz, list[j].regex, list[j].num)){
                  hit = TRUE;
                  free(vbuf);
                  break;
                }
                free(vbuf);
              }
              if(!hit) miss = TRUE;
              vbuf = NULL;
            } else {
              /* take the value from the record if it was loaded, else from the cache, where a
                 single null byte stands for an attribute that does not exist */
              if(mbuf){
                vbuf = cbmaploadone(mbuf, msiz, list[j].name, list[j].nsiz, &vsiz);
              } else if(csiz != 1 || cbuf[0] != '\0'){
                vbuf = cbmemdup(cbuf, csiz);
                vsiz = csiz;
              } else {
                vbuf = NULL;
              }
              if(list[j].oper[0] == '\0'){
                /* no operator: the condition only requires the attribute to exist */
                if(!vbuf) miss = TRUE;
              } else {
                /* a missing attribute is compared as an empty string */
                if(!vbuf){
                  vbuf = cbmemdup("", 0);
                  vsiz = 0;
                }
                if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign,
                                   list[j].val, list[j].vsiz, list[j].sval, list[j].ssiz,
                                   list[j].regex, list[j].num)) miss = TRUE;
              }
            }
            if(j == ci && !cbuf){
              /* store the value in the special cache; a missing value is stored as a single
                 null byte */
              if(vbuf){
                cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
              } else {
                cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
              }
              if(cbmaprnum(db->spacc) > db->scmnum){
                /* the cache is over its maximum: remove records from the head of the
                   iterator, one more than a tenth of the maximum */
                num = db->scmnum * 0.1 + 1;
                cbmapiterinit(db->spacc);
                for(k = 0; k < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; k++){
                  cbmapout(db->spacc, ibuf, isiz);
                }
              }
            }
            /* the value of the ordering attribute is handed over to the score element for
               the sort below; any other value is released here */
            if(j == oi){
              scores[i].value = vbuf;
            } else {
              free(vbuf);
            }
          }
          if(miss){
            free(scores[i].value);
          } else {
            scores[nnum++] = scores[i];
          }
        }
        free(mbuf);
      }
      snum = nnum;
    } else {
      for(i = 0; i < snum; i++){
        scores[i].value = NULL;
      }
    }
    est_free_cattr_list(list, anum);
  } else {
    for(i = 0; i < snum; i++){
      scores[i].value = NULL;
    }
  }
  if(oname){
    /* the ordering keywords for ID and score need no attribute value */
    if(!cbstricmp(oname, ESTORDIDA)){
      qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
    } else if(!cbstricmp(oname, ESTORDIDD)){
      qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_desc);
    } else if(!cbstricmp(oname, ESTORDSCA)){
      qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_asc);
    } else if(!cbstricmp(oname, ESTORDSCD)){
      qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
    } else {
      /* ordering by an attribute: from here `ci' is a flag telling whether the ordering
         attribute is the one held in the special cache */
      ci = db->spacc && !strcmp(oname, db->scname);
      onlen = strlen(oname);
      /* only a sequential attribute index is used to fetch values */
      attridx = (ESTATTRIDX *)cbmapget(db->aidxs, oname, onlen, NULL);
      if(attridx && attridx->type != ESTIDXATTRSEQ) attridx = NULL;
      /* give every element a value, trying in turn the value kept while narrowing, the
         special cache, `ordattrs', the sequential index and the attribute record */
      for(i = 0; i < snum; i++){
        if(scores[i].value) continue;
        if(ci &&
           (cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
          cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
          /* a single null byte in the cache stands for a missing attribute */
          if(csiz == 1 && cbuf[0] == '\0'){
            scores[i].value = cbmemdup("", 0);
          } else {
            scores[i].value = cbmemdup(cbuf, csiz);
          }
          continue;
        }
        if((cbuf = cbmapget(ordattrs, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
          scores[i].value = cbmemdup(cbuf, csiz);
          continue;
        }
        if(attridx){
          if(!(vbuf = est_aidx_seq_get(attridx->db, scores[i].id, &vsiz))) vbuf = cbmemdup("", 0);
          scores[i].value = vbuf;
          continue;
        }
        if((mbuf = est_crget(db->attrdb, db->zmode, scores[i].id, &msiz)) != NULL){
          if((vbuf = cbmaploadone(mbuf, msiz, oname, onlen, &vsiz)) != NULL){
            if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
            scores[i].value = vbuf;
          } else {
            if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
            scores[i].value = cbmemdup("", 0);
          }
          if(ci && cbmaprnum(db->spacc) > db->scmnum){
            /* same cache trimming as in the narrowing pass */
            num = db->scmnum * 0.1 + 1;
            cbmapiterinit(db->spacc);
            for(j = 0; j < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; j++){
              cbmapout(db->spacc, ibuf, isiz);
            }
          }
          free(mbuf);
        } else {
          scores[i].value = cbmemdup("", 0);
        }
      }
      if(!cbstricmp(otype, ESTORDSTRA)){
        qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_asc);
      } else if(!cbstricmp(otype, ESTORDSTRD)){
        qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_desc);
      } else if(!cbstricmp(otype, ESTORDNUMA)){
        /* numeric ordering: the string is released and the number itself is carried in the
           pointer field during the sort, then cleared so that nothing is freed below */
        for(i = 0; i < snum; i++){
          tval = cbstrmktime(scores[i].value);
          free(scores[i].value);
          scores[i].value = (void *)tval;
        }
        qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_asc);
        for(i = 0; i < snum; i++){
          scores[i].value = NULL;
        }
      } else if(!cbstricmp(otype, ESTORDNUMD)){
        /* same as above, for the descending order */
        for(i = 0; i < snum; i++){
          tval = cbstrmktime(scores[i].value);
          free(scores[i].value);
          scores[i].value = (void *)tval;
        }
        qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_desc);
        for(i = 0; i < snum; i++){
          scores[i].value = NULL;
        }
      }
      /* release the values; the pointers left in the elements are not cleared */
      for(i = 0; i < snum; i++){
        free(scores[i].value);
      }
    }
    free(oname);
  }
  if(distinct){
    /* unless an order is given or the name is prefixed with `~', the candidates are sorted
       by score first */
    if(!order && *distinct != '~')
      qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
    if(*distinct == '~') distinct++;
    dnlen = strlen(distinct);
    /* `umap' collects the attribute values seen so far */
    umap = cbmapopenex(snum + 1);
    attridx = (ESTATTRIDX *)cbmapget(db->aidxs, distinct, dnlen, NULL);
    if(attridx && attridx->type != ESTIDXATTRSEQ) attridx = NULL;
    nnum = 0;
    for(i = 0; i < snum; i++){
      /* fetch the value of the distinct attribute; a missing one counts as an empty string */
      if(scores[i].id >= ESTPDOCIDMIN){
        if(!(vbuf = est_db_get_doc_attr(db, scores[i].id, distinct))) vbuf = cbmemdup("", 0);
        vsiz = strlen(vbuf);
      } else if(attridx){
        if(!(vbuf = est_aidx_seq_get(attridx->db, scores[i].id, &vsiz))){
          vbuf = cbmemdup("", 0);
          vsiz = 0;
        }
      } else {
        if((mbuf = est_crget(db->attrdb, db->zmode, scores[i].id, &msiz)) != NULL){
          if(!(vbuf = cbmaploadone(mbuf, msiz, distinct, dnlen, &vsiz))){
            vbuf = cbmemdup("", 0);
            vsiz = 0;
          }
          free(mbuf);
        } else {
          vbuf = cbmemdup("", 0);
          vsiz = 0;
        }
      }
      /* NOTE(review): presumably `cbmapput' without overwriting fails for a value already
         stored, so only the first element having each value survives -- confirm */
      if(cbmapput(umap, vbuf, vsiz, "", 0, FALSE)) scores[nnum++] = scores[i];
      free(vbuf);
    }
    snum = nnum;
    cbmapclose(umap);
  }
  return snum;
}
9697 
9698 
9699 /* Make a list of condition attributes.
9700    `attrs' specifies a list object of attribute expressions.
9701    `nump' specifies the pointer to which the number of elements in the result is assigned.
9702    The return value is a list of condition attributes. */
est_make_cattr_list(const CBLIST * attrs,int * nump)9703 static ESTCATTR *est_make_cattr_list(const CBLIST *attrs, int *nump){
9704   ESTCATTR *list;
9705   const char *rp, *pv;
9706   unsigned char *utmp;
9707   int i, anum, tsiz;
9708   assert(attrs && nump);
9709   anum = CB_LISTNUM(attrs);
9710   CB_MALLOC(list, sizeof(ESTCATTR) * anum + 1);
9711   for(i = 0; i < anum; i++){
9712     list[i].name = NULL;
9713     list[i].oper = NULL;
9714     list[i].val = NULL;
9715     rp = CB_LISTVAL(attrs, i);
9716     while(*rp > 0 && *rp <= ' '){
9717       rp++;
9718     }
9719     if((pv = strchr(rp, ' ')) != NULL){
9720       list[i].nsiz = pv - rp;
9721       list[i].name = cbmemdup(rp, list[i].nsiz);
9722       rp = pv;
9723       while(*rp > 0 && *rp <= ' '){
9724         rp++;
9725       }
9726       if((pv = strchr(rp, ' ')) != NULL){
9727         list[i].oper = cbmemdup(rp, pv - rp);
9728         rp = pv;
9729         while(*rp > 0 && *rp <= ' '){
9730           rp++;
9731         }
9732         list[i].vsiz = strlen(rp);
9733         list[i].val = cbmemdup(rp, list[i].vsiz);
9734       } else {
9735         list[i].oper = cbmemdup(rp, -1);
9736       }
9737     } else {
9738       list[i].nsiz = strlen(rp);
9739       list[i].name = cbmemdup(rp, list[i].nsiz);
9740     }
9741     if(strchr(list[i].name, ',')){
9742       list[i].nlist = cbsplit(list[i].name, list[i].nsiz, ",");
9743     } else {
9744       list[i].nlist = NULL;
9745     }
9746     if(!list[i].oper){
9747       list[i].oper = cbmemdup("", 0);
9748     }
9749     if(!list[i].val){
9750       list[i].vsiz = 0;
9751       list[i].val = cbmemdup("", 0);
9752     }
9753   }
9754   for(i = 0; i < anum; i++){
9755     rp = list[i].oper;
9756     if(*rp == '!'){
9757       list[i].sign = FALSE;
9758       rp++;
9759     } else {
9760       list[i].sign = TRUE;
9761     }
9762     if(*rp == 'I' || *rp == 'i'){
9763       if(est_check_cjk_only(list[i].val)){
9764         list[i].sval = NULL;
9765         list[i].ssiz = 0;
9766       } else {
9767         utmp = (unsigned char *)est_uconv_in(list[i].val, list[i].vsiz, &tsiz);
9768         est_normalize_text(utmp, tsiz, &tsiz);
9769         est_canonicalize_text(utmp, tsiz, FALSE);
9770         list[i].sval = (char *)est_uconv_out((char *)utmp, tsiz, &(list[i].ssiz));
9771         free(utmp);
9772       }
9773       rp++;
9774     } else {
9775       list[i].sval = NULL;
9776       list[i].ssiz = 0;
9777     }
9778     list[i].regex = NULL;
9779     list[i].num = cbstrmktime(list[i].val);
9780     if(!cbstricmp(rp, ESTOPSTREQ)){
9781       list[i].cop = ESTOPSTREQ;
9782     } else if(!cbstricmp(rp, ESTOPSTRNE)){
9783       list[i].cop = ESTOPSTRNE;
9784     } else if(!cbstricmp(rp, ESTOPSTRINC)){
9785       list[i].cop = ESTOPSTRINC;
9786     } else if(!cbstricmp(rp, ESTOPSTRBW)){
9787       list[i].cop = ESTOPSTRBW;
9788     } else if(!cbstricmp(rp, ESTOPSTREW)){
9789       list[i].cop = ESTOPSTREW;
9790     } else if(!cbstricmp(rp, ESTOPSTRAND)){
9791       list[i].cop = ESTOPSTRAND;
9792     } else if(!cbstricmp(rp, ESTOPSTROR)){
9793       list[i].cop = ESTOPSTROR;
9794     } else if(!cbstricmp(rp, ESTOPSTROREQ)){
9795       list[i].cop = ESTOPSTROREQ;
9796     } else if(!cbstricmp(rp, ESTOPSTRRX)){
9797       list[i].cop = ESTOPSTRRX;
9798       list[i].regex = list[i].sval ? est_regex_new(list[i].sval) : est_regex_new(list[i].val);
9799     } else if(!cbstricmp(rp, ESTOPNUMEQ)){
9800       list[i].cop = ESTOPNUMEQ;
9801     } else if(!cbstricmp(rp, ESTOPNUMNE)){
9802       list[i].cop = ESTOPNUMNE;
9803     } else if(!cbstricmp(rp, ESTOPNUMGT)){
9804       list[i].cop = ESTOPNUMGT;
9805     } else if(!cbstricmp(rp, ESTOPNUMGE)){
9806       list[i].cop = ESTOPNUMGE;
9807     } else if(!cbstricmp(rp, ESTOPNUMLT)){
9808       list[i].cop = ESTOPNUMLT;
9809     } else if(!cbstricmp(rp, ESTOPNUMLE)){
9810       list[i].cop = ESTOPNUMLE;
9811     } else if(!cbstricmp(rp, ESTOPNUMBT)){
9812       list[i].cop = ESTOPNUMBT;
9813     } else {
9814       list[i].cop = ESTOPSTRINC;
9815       list[i].val[0] = '\0';
9816       list[i].vsiz = 0;
9817       if(list[i].sval){
9818         list[i].sval[0] = '\0';
9819         list[i].ssiz = 0;
9820       }
9821     }
9822   }
9823   *nump = anum;
9824   return list;
9825 }
9826 
9827 
9828 /* Release resources of a list of condition attributes.
9829    `list' specifies a list of condition attributes.
9830    `anum' specifies the number of elements of the list. */
est_free_cattr_list(ESTCATTR * list,int anum)9831 static void est_free_cattr_list(ESTCATTR *list, int anum){
9832   int i;
9833   assert(list && anum >= 0);
9834   for(i = 0; i < anum; i++){
9835     if(list[i].regex) est_regex_delete(list[i].regex);
9836     free(list[i].sval);
9837     free(list[i].val);
9838     free(list[i].oper);
9839     if(list[i].nlist) CB_LISTCLOSE(list[i].nlist);
9840     free(list[i].name);
9841   }
9842   free(list);
9843 }
9844 
9845 
/* Eclipse scores of search candidates, hiding documents similar to a higher one.
   `db' specifies a database object.
   `scores' specifies an array of scores of search candidates.  It is modified in place and
   the `value' member of its elements is used as a work area.
   `snum' specifies the number of the array.
   `num' specifies the number of documents to be shown.
   `vnum' specifies the number of dimensions of the vector.
   `tfidf' specifies whether to perform TF-IDF tuning.
   `limit' specifies the upper limit of similarity for documents to survive.  If it is
   `ESTECLSERV', `ESTECLDIR' or `ESTECLFILE', documents are compared by URL only.  If it is not
   less than `ESTECLSIMURL', that amount is subtracted and URLs are considered in addition to
   the vectors.
   `shadows' specifies a map object to store shadow document information.  For each surviving
   document ID, pairs of the ID of a hidden document and its similarity are concatenated.
   The return value is the new number of the array. */
static int est_eclipse_scores(ESTDB *db, ESTSCORE *scores, int snum, int num,
                              int vnum, int tfidf, double limit, CBMAP *shadows){
  CBMAP *svmap, *tvmap;
  const char *suri, *turi;
  char *tmp;
  int i, j, ubase, simurl, max, *svec, *tvec, pair[2], nnum;
  double dval;
  assert(db && scores && snum >= 0 && num >= 0 && vnum > 0 && limit > 0.0 && shadows);
  /* decide the mode from the limit: `ubase' compares URLs only, `simurl' mixes the URL
     sameness into the vector similarity */
  ubase = FALSE;
  simurl = FALSE;
  if(limit == ESTECLSERV || limit == ESTECLDIR || limit == ESTECLFILE){
    ubase = TRUE;
  } else if(limit >= ESTECLSIMURL){
    simurl = TRUE;
    limit -= ESTECLSIMURL;
    if(limit < 0.01) limit = 0.01;
    if(limit > 1.0) limit = 1.0;
  }
  nnum = 0;
  if(ubase){
    /* only the leading `max' candidates are examined; the rest are passed through */
    if(limit == ESTECLSERV){
      max = num * 14.8 + 8;
    } else if(limit == ESTECLDIR){
      max = num * 6.8 + 8;
    } else {
      max = num * 4.8 + 8;
    }
    if(max > snum) max = snum;
    /* the work area holds the URI of each examined candidate */
    for(i = 0; i < max; i++){
      scores[i].value = est_db_get_doc_attr(db, scores[i].id, ESTDATTRURI);
    }
    for(i = 0; i < max; i++){
      /* a candidate without a URI, or one already hidden, is not used as a base */
      if(!scores[i].value) continue;
      for(j = i + 1; j < max; j++){
        dval = 0.0;
        if(scores[j].value){
          switch(est_url_sameness(scores[i].value, scores[j].value)){
          case 1:
            dval = ESTECLSERV;
            break;
          case 2:
            dval = ESTECLDIR;
            break;
          case 3:
            dval = ESTECLFILE;
            break;
          }
        }
        if(dval >= limit){
          /* hide the lower candidate and record it as a shadow of the upper one; the
             similarity of a URL based shadow is recorded as zero */
          free(scores[j].value);
          scores[j].value = NULL;
          pair[0] = scores[j].id;
          pair[1] = 0;
          cbmapputcat(shadows, (char *)&(scores[i].id), sizeof(int),
                      (char *)pair, sizeof(int) * 2);
        }
      }
    }
    /* compact the survivors; examined candidates whose work area is `NULL', either hidden or
       lacking a URI, are dropped.  The copied `value' members are left pointing to released
       regions */
    for(i = 0; i < max; i++){
      if(scores[i].value){
        free(scores[i].value);
        scores[nnum++] = scores[i];
      }
    }
    for(i = max; i < snum; i++){
      scores[nnum++] = scores[i];
    }
  } else {
    /* a limit below 0.1 examines every candidate; otherwise the lower the limit is, the more
       candidates are examined */
    max = limit < 0.1 ? snum : num * ((2.4 / (limit - 0.05)) + 0.8) + 8;
    if(simurl) max *= 1.4;
    if(max > snum) max = snum;
    CB_MALLOC(svec, vnum * sizeof(int));
    CB_MALLOC(tvec, vnum * sizeof(int));
    /* the work area holds the keyword vector map of each examined candidate; in the `simurl'
       mode the URI is stored in the map under the empty key */
    for(i = 0; i < max; i++){
      if((svmap = est_get_tvmap(db, scores[i].id, vnum, tfidf)) != NULL){
        scores[i].value = (char *)svmap;
        if(simurl && (tmp = est_db_get_doc_attr(db, scores[i].id, ESTDATTRURI)) != NULL){
          cbmapput(svmap, "", 0, tmp, -1, TRUE);
          free(tmp);
        }
      } else {
        scores[i].value = NULL;
      }
    }
    for(i = 0; i < max; i++){
      svmap = (CBMAP *)(scores[i].value);
      if(!svmap || cbmaprnum(svmap) < 1) continue;
      suri = cbmapget((CBMAP *)scores[i].value, "", -1, NULL);
      /* only the first `num' usable survivors act as a base for comparison */
      if(num-- < 1) continue;
      est_vector_set_seed(svmap, svec, vnum);
      for(j = i + 1; j < max; j++){
        tvmap = (CBMAP *)(scores[j].value);
        if(!tvmap || cbmaprnum(tvmap) < 1) continue;
        est_vector_set_target(svmap, tvmap, tvec, vnum);
        dval = est_vector_cosine(svec, tvec, vnum);
        if(dval > 0.01 && suri &&
           (turi = cbmapget((CBMAP *)scores[j].value, "", -1, NULL)) != NULL){
          /* both URIs are known: adjust the similarity with a formula chosen by the URL
             sameness.  NOTE(review): the constants look empirically tuned -- confirm */
          switch(est_url_sameness(suri, turi)){
          default:
            dval = pow(cos(acos(dval) * (1.0 - pow(dval, 9.9))), 1.07);
            break;
          case 1:
            dval = pow(cos(acos(dval) * (1.0 - pow(dval, 4.1))), 1.05);
            break;
          case 2:
            dval = pow(cos(acos(dval) * (1.0 - pow(dval, 2.9))), 1.03);
            break;
          case 3:
            dval = pow(cos(acos(dval) * (1.0 - pow(dval, 2.1))), 1.01);
            break;
          }
        }
        if(dval > limit){
          /* hide the lower candidate and record it as a shadow of the upper one, with the
             similarity multiplied by 10000 */
          cbmapclose(tvmap);
          scores[j].value = NULL;
          pair[0] = scores[j].id;
          pair[1] = (int)(dval * 10000.0);
          cbmapputcat(shadows, (char *)&(scores[i].id), sizeof(int),
                      (char *)pair, sizeof(int) * 2);
        }
      }
    }
    /* compact the survivors; examined candidates whose work area is `NULL', either hidden or
       lacking a vector, are dropped.  The copied `value' members are left pointing to closed
       maps */
    for(i = 0; i < max; i++){
      if(scores[i].value){
        cbmapclose((CBMAP *)(scores[i].value));
        scores[nnum++] = scores[i];
      }
    }
    for(i = max; i < snum; i++){
      scores[nnum++] = scores[i];
    }
    free(tvec);
    free(svec);
  }
  return nnum;
}
9994 
9995 
9996 /* Check whether a score matches an attribute condition.
9997    `tval' specifies the target value;
9998    `tsiz' specifies the size of the target value
9999    `cop' specifies the pointer to the operator.
10000    `sign' specifies the sign of operation.
10001    `oval' specifies the operation value.
10002    `osiz' specifies the size of the operation value
10003    `sval' specifies the operation value of small cases.
10004    `ssiz' specifies the size of the operation value of small cases.
10005    `regex' specifies the regular expressions.
10006    `onum' specifies the numeric value.
10007    The return value is true if it does match, else it is false. */
est_match_attr(const char * tval,int tsiz,const char * cop,int sign,const char * oval,int osiz,const char * sval,int ssiz,const void * regex,int onum)10008 static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
10009                           const char *oval, int osiz, const char *sval, int ssiz,
10010                           const void *regex, int onum){
10011   unsigned char *eval;
10012   char *cval;
10013   int csiz, esiz, hit;
10014   assert(tval && tsiz >= 0 && oval && osiz >= 0);
10015   cval = NULL;
10016   if(sval){
10017     eval = (unsigned char *)est_uconv_in(tval, tsiz, &esiz);
10018     est_normalize_text(eval, esiz, &esiz);
10019     est_canonicalize_text(eval, esiz, FALSE);
10020     cval = (char *)est_uconv_out((char *)eval, esiz, &csiz);
10021     free(eval);
10022     tval = cval;
10023     tsiz = csiz;
10024     oval = sval;
10025     osiz = ssiz;
10026   }
10027   if(cop == ESTOPSTREQ){
10028     hit = !strcmp(tval, oval);
10029   } else if(cop == ESTOPSTRNE){
10030     hit = strcmp(tval, oval) != 0;
10031   } else if(cop == ESTOPSTRINC){
10032     hit = strstr(tval, oval) != NULL;
10033   } else if(cop == ESTOPSTRBW){
10034     hit = cbstrfwmatch(tval, oval);
10035   } else if(cop == ESTOPSTREW){
10036     hit = cbstrbwmatch(tval, oval);
10037   } else if(cop == ESTOPSTRAND){
10038     hit = est_check_strand(tval, oval);
10039   } else if(cop == ESTOPSTROR){
10040     hit = est_check_stror(tval, oval);
10041   } else if(cop == ESTOPSTROREQ){
10042     hit = est_check_stroreq(tval, oval);
10043   } else if(cop == ESTOPSTRRX){
10044     hit = regex ? est_regex_match(regex, tval) : FALSE;
10045   } else if(cop == ESTOPNUMEQ){
10046     hit = cbstrmktime(tval) == onum;
10047   } else if(cop == ESTOPNUMNE){
10048     hit = cbstrmktime(tval) != onum;
10049   } else if(cop == ESTOPNUMGT){
10050     hit = cbstrmktime(tval) > onum;
10051   } else if(cop == ESTOPNUMGE){
10052     hit = cbstrmktime(tval) >= onum;
10053   } else if(cop == ESTOPNUMLT){
10054     hit = cbstrmktime(tval) < onum;
10055   } else if(cop == ESTOPNUMLE){
10056     hit = cbstrmktime(tval) <= onum;
10057   } else if(cop == ESTOPNUMBT){
10058     hit = est_check_numbt(tval, oval);
10059   } else if(cop == ESTOPDUMMY){
10060     hit = TRUE;
10061   } else {
10062     hit = FALSE;
10063   }
10064   free(cval);
10065   return sign ? hit : !hit;
10066 }
10067 
10068 
/* Check whether a string includes all tokens in another string.
   `tval' specifies the target value.
   `oval' specifies the operation value, whose tokens are separated by spaces or commas.
   The return value is true if every token occurs in the target value immediately followed by
   a space, a comma, or the end of the string, else it is false.  It is true if there is no
   token at all. */
static int est_check_strand(const char *tval, const char *oval){
  const char *tok, *cur;
  size_t tlen;
  int found;
  assert(tval && oval);
  tok = oval;
  for(;;){
    /* step over the delimiters in front of the next token */
    tok += strspn(tok, " ,");
    if(*tok == '\0') return TRUE;
    tlen = strcspn(tok, " ,");
    /* try every position of the target as the start of the token */
    found = FALSE;
    for(cur = tval; *cur != '\0'; cur++){
      if(strncmp(cur, tok, tlen) != 0) continue;
      if(cur[tlen] == '\0' || cur[tlen] == ' ' || cur[tlen] == ','){
        found = TRUE;
        break;
      }
    }
    if(!found) return FALSE;
    tok += tlen;
  }
}
10103 
10104 
/* Check whether a string includes at least one token in another string.
   `tval' specifies the target value.
   `oval' specifies the operation value, whose tokens are separated by spaces or commas.
   The return value is true if some token occurs in the target value immediately followed by
   a space, a comma, or the end of the string, else it is false.  It is false if there is no
   token at all. */
static int est_check_stror(const char *tval, const char *oval){
  const char *tok, *cur;
  size_t tlen;
  assert(tval && oval);
  tok = oval;
  for(;;){
    /* step over the delimiters in front of the next token */
    tok += strspn(tok, " ,");
    if(*tok == '\0') return FALSE;
    tlen = strcspn(tok, " ,");
    /* the first position where the token ends on a boundary settles the result */
    for(cur = tval; *cur != '\0'; cur++){
      if(strncmp(cur, tok, tlen) != 0) continue;
      if(cur[tlen] == '\0' || cur[tlen] == ' ' || cur[tlen] == ',') return TRUE;
    }
    tok += tlen;
  }
}
10139 
10140 
/* Check whether a string is equal to at least one token in another string.
   `tval' specifies the target value.
   `oval' specifies the operation value, whose tokens are separated by spaces or commas.
   The return value is true if the whole target value is identical to some token, else it is
   false. */
static int est_check_stroreq(const char *tval, const char *oval){
  const char *tok;
  size_t vlen, tlen;
  assert(tval && oval);
  vlen = strlen(tval);
  tok = oval;
  for(;;){
    /* step over the delimiters in front of the next token */
    tok += strspn(tok, " ,");
    if(*tok == '\0') return FALSE;
    tlen = strcspn(tok, " ,");
    /* equal only when both the length and the content agree */
    if(tlen == vlen && memcmp(tok, tval, tlen) == 0) return TRUE;
    tok += tlen;
  }
}
10168 
10169 
10170 /* Check whether a decimal string is between two tokens in another string.
10171    `tval' specifies the target value;
10172    `oval' specifies the operation value;
10173    The return value is the result of the check. */
est_check_numbt(const char * tval,const char * oval)10174 static int est_check_numbt(const char *tval, const char *oval){
10175   time_t val, lower, upper, swap;
10176   char numbuf[ESTNUMBUFSIZ];
10177   int i;
10178   for(i = 0; i < ESTNUMBUFSIZ && oval[i] != '\0' && oval[i] != ' ' && oval[i] != '\t'; i++){
10179     numbuf[i] = oval[i];
10180   }
10181   numbuf[i] = '\0';
10182   oval += i;
10183   while(*oval == ' ' || *oval == '\t'){
10184     oval++;
10185   }
10186   if(*oval == '\0') return FALSE;
10187   val = cbstrmktime(tval);
10188   lower = cbstrmktime(numbuf);
10189   upper = cbstrmktime(oval);
10190   if(lower > upper){
10191     swap = lower;
10192     lower = upper;
10193     upper = swap;
10194   }
10195   return val >= lower && val <= upper;
10196 }
10197 
10198 
10199 /* Compare two keywords by scores in descending order.
10200    `ap' specifies the pointer to one keyword.
10201    `bp' specifies the pointer to the other keyword.
10202    The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_keysc_compare(const void * ap,const void * bp)10203 static int est_keysc_compare(const void *ap, const void *bp){
10204   assert(ap && bp);
10205   return ((ESTKEYSC *)bp)->pt - ((ESTKEYSC *)ap)->pt;
10206 }
10207 
10208 
10209 /* Get a similar set of documents in a database.
10210    `db' specifies a database object.
10211    `svmap' specifies a map object of a seed vector.
10212    `nump' specifies the pointer to which the number of elements in the result is assigned.
10213    `knum' specifies the number of keywords to get candidates.
10214    `unum' specifies the number of adopted documents for a keyword.
10215    `tfidf' specifies whether to perform TF-IDF tuning.
10216    `nmin' specifies the minimum value for narrowing.
10217    `auxmin' specifies the minimum hits to adopt the auxiliary index.  If it is not more than 0,
10218    the auxiliary index is not used.
10219    `auxwords' specifies a map object where keywords used with the auxiliary index are stored.  If
10220    it is `NULL', it is not used.
10221    The return value is an array of score structures of corresponding documents. */
est_search_similar(ESTDB * db,CBMAP * svmap,int * nump,int knum,int unum,int mnum,int tfidf,double nmin,int auxmin,CBMAP * auxwords)10222 static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
10223                                     int knum, int unum, int mnum, int tfidf,
10224                                     double nmin, int auxmin, CBMAP *auxwords){
10225   ESTSCORE *scores, *tscores;
10226   CBMAP *tvmap;
10227   const char *word;
10228   int i, j, vnum, snum, tmax, tsnum, nnum, lid, *svec, *tvec;
10229   double dval;
10230   assert(db && svmap && nump && knum >= 0 && unum >= 0 && nmin >= 0.0);
10231   CB_MALLOC(scores, sizeof(ESTSCORE) * (unum * knum + CB_LISTNUM(db->pdocs)) + 1);
10232   snum = 0;
10233   if((vnum = cbmaprnum(svmap)) < 1) vnum = 1;
10234   cbmapiterinit(svmap);
10235   tmax = unum;
10236   for(i = 0; (i < knum || (i < knum * 2 && snum < unum * 2)) &&
10237         (word = cbmapiternext(svmap, NULL)) != NULL; i++){
10238     while(*word > '\0' && *word <= ' '){
10239       word++;
10240     }
10241     tscores = est_search_union(db, word, 1, NULL, &tsnum, NULL, TRUE, auxmin, auxwords);
10242     qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
10243     for(j = 0; j < tmax && j < tsnum; j++){
10244       scores[snum].id = tscores[j].id;
10245       scores[snum].score = tscores[j].score * (knum * 2.2 - i);
10246       snum++;
10247     }
10248     free(tscores);
10249     tmax -= unum / knum / 1.25;
10250     if(tmax < unum / 4) tmax = unum / 4;
10251   }
10252   for(i = 0; i < CB_LISTNUM(db->pdocs); i++){
10253     scores[snum].id = ESTPDOCIDMIN + i;
10254     scores[snum].score = 1;
10255     snum++;
10256   }
10257   qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
10258   nnum = 0;
10259   lid = -1;
10260   for(i = 0; i < snum; i++){
10261     if(nnum > 0 && scores[i].id == lid){
10262       scores[nnum-1].score += scores[i].score;
10263       continue;
10264     }
10265     scores[nnum].id = scores[i].id;
10266     scores[nnum].score = scores[i].score;
10267     nnum++;
10268     lid = scores[i].id;
10269   }
10270   snum = nnum;
10271   qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
10272   nnum = 0;
10273   CB_MALLOC(svec, vnum * sizeof(int));
10274   CB_MALLOC(tvec, vnum * sizeof(int));
10275   est_vector_set_seed(svmap, svec, vnum);
10276   for(i = 0; i < snum && nnum < mnum; i++){
10277     tvmap = est_get_tvmap(db, scores[i].id, vnum, tfidf);
10278     if(tvmap){
10279       est_vector_set_target(svmap, tvmap, tvec, vnum);
10280       if((dval = est_vector_cosine(svec, tvec, vnum)) >= nmin){
10281         scores[nnum].id = scores[i].id;
10282         scores[nnum].score = (int)(dval * 10000);
10283         if(scores[nnum].score == 9999) scores[nnum].score = 10000;
10284         scores[nnum].value = NULL;
10285         nnum++;
10286       }
10287       cbmapclose(tvmap);
10288     }
10289   }
10290   free(tvec);
10291   free(svec);
10292   snum = nnum;
10293   *nump = snum;
10294   return scores;
10295 }
10296 
10297 
10298 /* Create a map object of a vector for similar search from a phrase.
10299    `phrase' specifies a search phrase for similar search.
10300    The return value is a map object of the seed vector. */
est_phrase_vector(const char * phrase)10301 static CBMAP *est_phrase_vector(const char *phrase){
10302   ESTKEYSC *scores;
10303   CBMAP *svmap;
10304   CBLIST *list;
10305   const char *pv, *rp;
10306   char *utext, *rtext;
10307   int i, num, len, size;
10308   svmap = cbmapopenex(ESTMINIBNUM);
10309   CB_LISTOPEN(list);
10310   while(*phrase != '\0'){
10311     if(*phrase == ESTOPWITH[0] && cbstrfwmatch(phrase, ESTOPWITH)){
10312       phrase += strlen(ESTOPWITH);
10313       pv = phrase;
10314       while(*phrase != '\0'){
10315         if(*phrase <= ' ' && cbstrfwmatch(phrase + 1, ESTOPWITH)){
10316           phrase++;
10317           break;
10318         }
10319         phrase++;
10320       }
10321       CB_LISTPUSH(list, pv, phrase - pv);
10322     } else {
10323       phrase++;
10324     }
10325   }
10326   for(i = 0; i < CB_LISTNUM(list); i++){
10327     pv = CB_LISTVAL(list, i);
10328     while(*pv > '\0' && *pv <= ' '){
10329       pv++;
10330     }
10331     num = strtol(pv, (char **)&rp, 10);
10332     if(rp && (len = rp - pv) > 0 && num >= 0){
10333       utext = est_uconv_in(rp, strlen(rp), &size);
10334       est_normalize_text((unsigned char *)utext, size, &size);
10335       est_canonicalize_text((unsigned char *)utext, size, FALSE);
10336       rtext = est_uconv_out(utext, size, NULL);
10337       cbstrsqzspc(rtext);
10338       if(rtext[0] != '\0') cbmapput(svmap, rtext, -1, pv, len, FALSE);
10339       free(rtext);
10340       free(utext);
10341     }
10342   }
10343   CB_LISTCLOSE(list);
10344   CB_MALLOC(scores, cbmaprnum(svmap) * sizeof(ESTKEYSC) + 1);
10345   cbmapiterinit(svmap);
10346   for(i = 0; (rp = cbmapiternext(svmap, &len)) != NULL; i++){
10347     scores[i].word = rp;
10348     scores[i].wsiz = len;
10349     scores[i].pt = atoi(cbmapiterval(rp, NULL));
10350   }
10351   qsort(scores, i, sizeof(ESTKEYSC), est_keysc_compare);
10352   for(i--; i >= 0; i--){
10353     cbmapmove(svmap, scores[i].word, scores[i].wsiz, TRUE);
10354   }
10355   free(scores);
10356   return svmap;
10357 }
10358 
10359 
10360 /* Get the target vector of a document dynamically.
10361    `db' specifies a database object.
10362    `id' specifies the ID of a document.
10363    `vnum' specifies the number of dimensions of the vector.
10364    `tfidf' specifies whether to perform TF-IDF tuning.
10365    The return value is a map object of the target vector. */
est_get_tvmap(ESTDB * db,int id,int vnum,int tfidf)10366 static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf){
10367   ESTDOC *doc;
10368   CBMAP *tvmap;
10369   assert(db && id > 0);
10370   if((tvmap = est_db_get_keywords(db, id)) != NULL) return tvmap;
10371   if(!(doc = est_db_get_doc(db, id, 0))) return NULL;
10372   tvmap = est_db_etch_doc(tfidf ? db : NULL, doc, vnum);
10373   est_doc_delete(doc);
10374   if(dpwritable(db->metadb)) est_db_put_keywords(db, id, tvmap, 1.0);
10375   return tvmap;
10376 }
10377 
10378 
/* Calculate sameness of two URLs.
   `aurl' specifies one URL.
   `burl' specifies the other URL.
   The return value is 0 if the both have different servers or either has no "://", 1 if the
   both have the same server, 2 if the both have the same parent directory, 3 if the both have
   the same file.  The scheme and the part after "?" are not compared. */
static int est_url_sameness(const char *aurl, const char *burl){
  const char *ahost, *bhost, *apath, *bpath, *aend, *bend;
  int alen, blen;
  assert(aurl && burl);
  /* the server part begins just after the scheme separator */
  ahost = strstr(aurl, "://");
  if(!ahost) return 0;
  ahost += 3;
  bhost = strstr(burl, "://");
  if(!bhost) return 0;
  bhost += 3;
  /* the server part runs up to the first slash or the end of the string */
  apath = strchr(ahost, '/');
  if(!apath) apath = ahost + strlen(ahost);
  bpath = strchr(bhost, '/');
  if(!bpath) bpath = bhost + strlen(bhost);
  alen = apath - ahost;
  blen = bpath - bhost;
  if(alen != blen || memcmp(ahost, bhost, alen) != 0) return 0;
  /* a URL without any path is regarded as the root */
  if(*apath == '\0') apath = "/";
  if(*bpath == '\0') bpath = "/";
  /* the file part runs up to the query string or the end of the string */
  aend = strchr(apath, '?');
  if(!aend) aend = apath + strlen(apath);
  bend = strchr(bpath, '?');
  if(!bend) bend = bpath + strlen(bpath);
  alen = aend - apath;
  blen = bend - bpath;
  if(alen == blen && memcmp(apath, bpath, alen) == 0) return 3;
  /* shorten each path to the part before its last slash */
  while(alen > 0 && apath[alen-1] != '/'){
    alen--;
  }
  if(alen > 0) alen--;
  while(blen > 0 && bpath[blen-1] != '/'){
    blen--;
  }
  if(blen > 0) blen--;
  if(alen == blen && memcmp(apath, bpath, alen) == 0) return 2;
  return 1;
}
10421 
10422 
10423 /* Close the handle to the file of random number generator. */
est_random_fclose(void)10424 static void est_random_fclose(void){
10425   if(est_random_ifp) fclose(est_random_ifp);
10426 }
10427 
10428 
10429 /* Dispatch a signal to the corresponding handler.
10430    Signum specifies the number of catched signal. */
est_signal_dispatch(int signum)10431 static int est_signal_dispatch(int signum){
10432 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
10433   switch(signum){
10434   case CTRL_C_EVENT: case CTRL_BREAK_EVENT: case CTRL_CLOSE_EVENT:
10435     signum = 2;
10436     break;
10437   case CTRL_LOGOFF_EVENT: case CTRL_SHUTDOWN_EVENT:
10438     signum = 15;
10439     break;
10440   }
10441   if(est_signal_handlers[signum]) est_signal_handlers[signum](signum);
10442   return TRUE;
10443 #else
10444   assert(signum >= 0);
10445   if(est_signal_handlers[signum]) est_signal_handlers[signum](signum);
10446   return TRUE;
10447 #endif
10448 }
10449 
10450 
10451 
10452 /* END OF FILE */
10453