1 /*************************************************************************************************
2 * Implementation of the core API
3 * Copyright (C) 2004-2007 Mikio Hirabayashi
4 * This file is part of Hyper Estraier.
5 * Hyper Estraier is free software; you can redistribute it and/or modify it under the terms of
6 * the GNU Lesser General Public License as published by the Free Software Foundation; either
7 * version 2.1 of the License or any later version. Hyper Estraier is distributed in the hope
8 * that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
10 * License for more details.
11 * You should have received a copy of the GNU Lesser General Public License along with Hyper
12 * Estraier; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330,
13 * Boston, MA 02111-1307 USA.
14 *************************************************************************************************/
15
16
17 #if defined(_MYVISTA)
18 #include <vista.h>
19 #endif
20
21 #include "estraier.h"
22 #include "myconf.h"
23
/* general buffer sizes and limits */
#define ESTNUMBUFSIZ   32                /* size of a buffer for a number */
#define ESTPATHBUFSIZ  4096              /* size of a buffer for a path */
#define ESTIOBUFSIZ    8192              /* size of a buffer for I/O */
#define ESTALLOCUNIT   1024              /* unit number of memory allocation */
#define ESTMINIBNUM    31                /* bucket number of a map for attributes */
#define ESTSCANWNUM    256               /* number of words for the scanning check */
#define ESTSIGNUM      64                /* number of signals */
#define ESTREGSUBMAX   32                /* maximum number of substrings for a regex */
32
/* meta database: its name and the keys of its records */
#define ESTMETADBNAME  "_meta"           /* name of the meta database */
#define ESTKEYIDXNUM   "_idxnum"         /* key: number of inverted indexes */
#define ESTKEYDSEQ     "_dseq"           /* key: sequence for document IDs */
#define ESTKEYDNUM     "_dnum"           /* key: number of documents */
#define ESTKEYMETA     "_meta"           /* key: meta data */
38
/* inverted index: name, node geometry, cache numbers and file size limits */
#define ESTIDXDBNAME   "_idx"            /* name of the inverted index */
#define ESTIDXDBLRM    109               /* records in a leaf node */
#define ESTIDXDBLRMA   17                /* records in a leaf node in APN mode */
#define ESTIDXDBNIM    160               /* records in a non-leaf node */
#define ESTIDXDBLCN    16                /* number of leaf cache */
#define ESTIDXDBNCN    16                /* number of non-leaf cache */
#define ESTIDXDBRLCN   128               /* number of leaf cache of the index reader */
#define ESTIDXDBRLCNA  32                /* number of leaf cache of the reader in APN mode */
#define ESTIDXDBRNCN   256               /* number of non-leaf cache of the index reader */
#define ESTIDXDBFBP    512               /* size of the free block pool */
#define ESTIDXDBMIN    (1048576*512)     /* minimum size of a database file */
#define ESTIDXDBMAX    (1048576*1536)    /* maximum size of a database file */
51
/* database for forward matching */
#define ESTFWMDBNAME   "_fwm"            /* name of the database */
#define ESTFWMDBLRM    251               /* records in a leaf node */
#define ESTFWMDBNIM    110               /* records in a non-leaf node */
#define ESTFWMDBLCN    32                /* number of leaf cache */
#define ESTFWMDBNCN    16                /* number of non-leaf cache */
#define ESTFWMDBFBP    128               /* size of the free block pool */
58
/* auxiliary index */
#define ESTAUXDBNAME   "_aux"            /* name of the auxiliary index */
#define ESTAUXDBLRM    23                /* records in a leaf node */
#define ESTAUXDBNIM    160               /* records in a non-leaf node */
#define ESTAUXDBLCN    16                /* number of leaf cache */
#define ESTAUXDBNCN    16                /* number of non-leaf cache */
#define ESTAUXDBRLCN   256               /* number of leaf cache of the auxiliary reader */
#define ESTAUXDBRNCN   64                /* number of non-leaf cache of the auxiliary reader */
#define ESTAUXDBFBP    256               /* size of the free block pool */
67
/* database for auxiliary forward matching (xfm DB) */
#define ESTXFMDBNAME   "_xfm"            /* name of the database */
#define ESTXFMDBLRM    111               /* records in a leaf node */
#define ESTXFMDBNIM    110               /* records in a non-leaf node */
#define ESTXFMDBLCN    32                /* number of leaf cache */
#define ESTXFMDBNCN    16                /* number of non-leaf cache */
#define ESTXFMDBFBP    128               /* size of the free block pool */
74
/* database for attributes */
#define ESTATTRDBNAME  "_attr"           /* name of the database */
#define ESTATTRDBBNUM  212987            /* bucket number */
#define ESTATTRDBDNUM  3                 /* division number */
#define ESTATTRDBALN   -5                /* alignment */
#define ESTATTRDBFBP   64                /* size of the free block pool */
80
/* database for texts */
#define ESTTEXTDBNAME  "_text"           /* name of the database */
#define ESTTEXTDBBNUM  61417             /* bucket number */
#define ESTTEXTDBDNUM  7                 /* division number */
#define ESTTEXTDBALN   -5                /* alignment */
#define ESTTEXTDBFBP   128               /* size of the free block pool */
86
/* database for keywords */
#define ESTKWDDBNAME   "_kwd"            /* name of the database */
#define ESTKWDDBBNUM   163819            /* bucket number */
#define ESTKWDDBDNUM   3                 /* division number */
#define ESTKWDDBALN    -5                /* alignment */
#define ESTKWDDBFBP    64                /* size of the free block pool */
92
/* database of the document list */
#define ESTLISTDBNAME  "_list"           /* name of the database */
#define ESTLISTDBLRM   99                /* records in a leaf node */
#define ESTLISTDBNIM   200               /* records in a non-leaf node */
#define ESTLISTDBLCN   64                /* number of leaf cache */
#define ESTLISTDBNCN   16                /* number of non-leaf cache */
#define ESTLISTDBFBP   128               /* size of the free block pool */
99
/* attribute indexes for sequential access and narrowing */
#define ESTAISEQPREF   "__seq_"          /* prefix of the database for sequential access */
#define ESTAISTRPREF   "__str_"          /* prefix of the database for string narrowing */
#define ESTAINUMPREF   "__num_"          /* prefix of the database for number narrowing */
#define ESTAIBDIAM     0.8               /* diameter of the bucket number */
#define ESTAIDXLRM     99                /* records in a leaf node of a narrowing index */
#define ESTAIDXNIM     120               /* records in a non-leaf node of a narrowing index */
#define ESTAIDXLCN     1024              /* number of leaf cache of a narrowing index */
#define ESTAIDXNCN     256               /* number of non-leaf cache of a narrowing index */
#define ESTAIDXDPFBP   32                /* size of the free block pool of a sequential DB */
#define ESTAIDXVLFBP   128               /* size of the free block pool of a narrowing DB */
#define ESTAIKBUFSIZ   8192              /* size of a buffer for a key */
#define ESTAISNUMMIN   256               /* minimum number of scores to use a narrowing index */
#define ESTOPDUMMY     "[DUMMY]"         /* dummy operator */
113
/* scaling ratios applied per database size mode */
/* NOTE(review): the two `S' ratios were commented as "large mode", identical to the `L' pair;
   presumably they belong to the small mode -- confirm against the code that uses them */
#define ESTDBSBRAT     0.3               /* ratio of bucket numbers of small mode (presumed) */
#define ESTDBSDRAT     0.4               /* ratio of the division number of small mode (presumed) */
#define ESTDBLBRAT     3.0               /* ratio of bucket numbers of large mode */
#define ESTDBLDRAT     1.0               /* ratio of the division number of large mode */
#define ESTDBHBRAT     5.0               /* ratio of bucket numbers of huge mode */
#define ESTDBHDRAT     2.0               /* ratio of the division number of huge mode */
#define ESTDBH2RAT     1.4               /* ratio of huge mode second */
#define ESTDBH3RAT     2.0               /* ratio of huge mode third */
122
/* division numbers of Villa databases */
#define ESTVLCRDNUM    2                 /* division number of usual Villa databases */
#define ESTVLCRDNAUX   7                 /* division number of the auxiliary index */
125
/* in-memory caches */
#define ESTIDXCCBNUM   524288            /* bucket number of the cache for the inverted index */
#define ESTAUXCCBNUM   65521             /* bucket number of the cache for the auxiliary index */
#define ESTIDXCCMAX    (1048576*64)      /* max size of the cache */
#define ESTOUTCCBNUM   131072            /* bucket number of the cache for deleted documents */
#define ESTKEYCCMNUM   65536             /* bucket number of the cache for keys for TF-IDF */
#define ESTATTRCCMNUM  8192              /* number of cache for attributes */
#define ESTTEXTCCMNUM  1024              /* number of cache for texts */
#define ESTRESCCMNUM   256               /* number of cache for results */
#define ESTCCIRSLOT    256               /* slot timing for interruption */
#define ESTCCCBFREQ    10000             /* frequency of callback for flushing words */
136
/* miscellaneous tuning parameters */
#define ESTDIRMODE     00755             /* permission of a directory being created */
#define ESTICCHECKSIZ  32768             /* size of checking character code */
#define ESTICMISSMAX   256               /* allowance number of missing characters */
#define ESTICALLWRAT   0.001             /* allowance ratio of missing characters */
#define ESTOCPOINT     16                /* point per occurrence */
#define ESTJHASHNUM    251               /* hash number for a junction */
#define ESTWORDMAXLEN  48                /* maximum length of a word */
#define ESTWORDAVGLEN  8                 /* average length of a word */
#define ESTATTRALW     1.5               /* allowance ratio of attribute narrowing */
#define ESTKEYSCALW    3                 /* allowance ratio of TF-IDF for keywords */
#define ESTMEMIRATIO   1.1               /* incremental ratio of memory allocation */
148
/* search tuning parameters */
#define ESTSCOREUNIT   1000              /* unit of standard deviation of scoring */
#define ESTAUXMIN      32                /* minimum hits to adopt the auxiliary index */
#define ESTAUXEXRAT    16                /* ratio of hits of keywords expansion */
#define ESTWILDMAX     256               /* maximum number of expansion of wild cards */
#define ESTECLKNUM     32                /* number of keywords to eclipse candidates */
#define ESTSMLRKNUM    16                /* number of keywords to get candidates */
#define ESTSMLRUNUM    1024              /* number of adopted documents for a keyword */
#define ESTSMLRMNUM    4096              /* maximum number of candidates to be checked */
#define ESTSMLRNMIN    0.5               /* the minimum value for narrowing */
158
/* set a buffer for a variable length number
   `EST_len' specifies the lvalue assigned the number of bytes written.
   `EST_buf' specifies the pointer to the region to write into.
   `EST_num' specifies the number to encode; it is evaluated once.
   The number is emitted in base 128, least significant digit first.  Every digit except
   the last is stored as `-digit - 1' (negative) and the last digit is stored as is
   (non-negative).  Zero takes one byte.  A negative number writes nothing and sets the
   length to 0. */
#define EST_SET_VNUMBUF(EST_len, EST_buf, EST_num) \
  do { \
    int EST_vrest = (EST_num); \
    int EST_vlow; \
    (EST_len) = 0; \
    if(EST_vrest >= 0){ \
      do { \
        EST_vlow = EST_vrest % 128; \
        EST_vrest /= 128; \
        ((signed char *)(EST_buf))[(EST_len)] = \
          (EST_vrest > 0) ? -EST_vlow - 1 : EST_vlow; \
        (EST_len)++; \
      } while(EST_vrest > 0); \
    } \
  } while(FALSE)
181
/* read a variable length buffer
   `EST_buf' specifies the pointer to the region holding the encoded number.
   `EST_num' specifies the lvalue assigned the decoded number.
   `EST_step' specifies the lvalue assigned the number of bytes consumed.
   Negative bytes are continuation digits holding `-digit - 1'; the first non-negative
   byte is the last digit and terminates the scan.  Digits are in base 128, least
   significant first.  No bound is checked, so the region must contain a terminating
   non-negative byte. */
#define EST_READ_VNUMBUF(EST_buf, EST_num, EST_step) \
  do { \
    int EST_vi, EST_vbase; \
    (EST_num) = 0; \
    EST_vbase = 1; \
    for(EST_vi = 0; TRUE; EST_vi++){ \
      if(((signed char *)(EST_buf))[EST_vi] >= 0){ \
        (EST_num) += ((signed char *)(EST_buf))[EST_vi] * EST_vbase; \
        break; \
      } \
      (EST_num) += EST_vbase * (((signed char *)(EST_buf))[EST_vi] + 1) * -1; \
      EST_vbase *= 128; \
    } \
    (EST_step) = EST_vi + 1; \
  } while(FALSE)
198
/* type of structure for an attribute database */
typedef struct {
  void *db;                              /* handle of the database */
  int type;                              /* data type */
} ESTATTRIDX;
203
/* enumeration for character categories */
enum {
  ESTSPACECHR = 0,                       /* space characters */
  ESTDELIMCHR = 1,                       /* delimiter characters */
  ESTWESTALPH = 2,                       /* west alphabets */
  ESTEASTALPH = 3,                       /* east alphabets */
  ESTHIRAGANA = 4,                       /* east alphabets: hiragana */
  ESTKATAKANA = 5,                       /* east alphabets: katakana */
  ESTHANGUL = 6,                         /* east alphabets: hangul */
  ESTKANJI = 7                           /* east alphabets: kanji */
};
214
/* enumeration for flags for databases (bit flags) */
enum {
  ESTDFPERFNG = 1 << 10,                 /* use the perfect N-gram analyzer */
  ESTDFCHRCAT = 1 << 11,                 /* use the character category analyzer */
  ESTDFZLIB = 1 << 15,                   /* compress records with ZLIB */
  ESTDFLZO = 1 << 16,                    /* compress records with LZO */
  ESTDFBZIP = 1 << 17,                   /* compress records with BZIP2 */
  ESTDFSCVOID = 1 << 20,                 /* store scores as void */
  ESTDFSCINT = 1 << 21,                  /* store scores as integer */
  ESTDFSCASIS = 1 << 22                  /* refrain from adjustment of scores */
};
225
/* enumeration for phrase format */
enum {
  ESTPMUSUAL = 0,                        /* usual phrase */
  ESTPMSIMPLE = 1,                       /* simplified phrase */
  ESTPMROUGH = 2,                        /* rough phrase */
  ESTPMUNION = 3,                        /* union phrase */
  ESTPMISECT = 4                         /* intersection phrase */
};
233
/* type of structure for a hitting object */
typedef struct {
  int id;                                /* ID of a document */
  int score;                             /* score tuned by TF-IDF */
  char *value;                           /* value of an attribute for sorting */
} ESTSCORE;
239
240 typedef struct { /* type of structure for a conditional attribute */
241 char *name; /* name */
242 int nsiz; /* size of the name */
243 CBLIST *nlist; /* list of plural names */
244 char *oper; /* operator */
245 char *val; /* value */
246 int vsiz; /* size of the value */
247 const char *cop; /* canonical operator */
248 int sign; /* positive or negative */
249 char *sval; /* value of small cases */
250 int ssiz; /* size of the small value */
251 void *regex; /* compiled regular expressions */
252 time_t num; /* numeric value */
253 } ESTCATTR;
254
/* type of structure for a scored keyword */
typedef struct {
  const char *word;                      /* face of keyword */
  int wsiz;                              /* size of the keyword */
  int pt;                                /* score tuned by TF-IDF */
} ESTKEYSC;
260
/* type of structure for a meta hitting object */
typedef struct {
  int db;                                /* index of a container database */
  int id;                                /* ID of a document */
  int score;                             /* score tuned by TF-IDF */
  char *value;                           /* value of an attribute for sorting */
} ESTMETASCORE;
267
268
269 /* private function prototypes */
270 static void est_set_ecode(int *ecp, int value, int line);
271 static char *est_hex_encode(const char *str);
272 static char *est_hex_decode(const char *str);
273 static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode);
274 static void est_normalize_text(unsigned char *utext, int size, int *sp);
275 static void est_canonicalize_text(unsigned char *utext, int size, int funcspc);
276 static int est_char_category(int c);
277 static int est_char_category_perfng(int c);
278 static int est_char_category_chrcat(int c);
279 static char *est_make_snippet(const char *str, int len, const CBLIST *words,
280 int wwidth, int hwidth, int awidth);
281 static int est_check_cjk_only(const char *str);
282 static char *est_phrase_from_simple(const char *sphrase);
283 static char *est_phrase_from_rough(const char *rphrase);
284 static char *est_phrase_from_union(const char *uphrase);
285 static char *est_phrase_from_isect(const char *iphrase);
286 static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
287 int size, int awsiz, CBDATUM *res, const CBLIST *rwords);
288 static int est_str_fwmatch_wide(const unsigned char *haystack, int hsiz,
289 const unsigned char *needle, int nsiz);
290 static char *est_strstr_sparse(const char *haystack, const char *needle);
291 static int est_idx_rec_last_id(const char *vbuf, int vsiz, int smode);
292 static void est_encode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int lid, int smode);
293 static void est_decode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int smode);
294 static ESTIDX *est_idx_open(const char *name, int omode, int dnum);
295 static int est_idx_close(ESTIDX *idx);
296 static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum,
297 int fbpsiz);
298 static void est_idx_increment(ESTIDX *idx);
299 static int est_idx_dnum(ESTIDX *idx);
300 static int est_idx_add(ESTIDX *idx, const char *word, int wsiz,
301 const char *vbuf, int vsiz, int smode);
302 static int est_idx_put_one(ESTIDX *idx, int inum, const char *word, int wsiz,
303 const char *vbuf, int vsiz);
304 static int est_idx_out(ESTIDX *idx, const char *word, int wsiz);
305 static char *est_idx_scan(ESTIDX *idx, const char *word, int wsiz, int *sp, int smode);
306 static const char *est_idx_get_one(ESTIDX *idx, int inum, const char *word, int wsiz, int *sp);
307 static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz);
308 static int est_idx_num(ESTIDX *idx);
309 static double est_idx_size(ESTIDX *idx);
310 static int est_idx_size_current(ESTIDX *idx);
311 static int est_idx_memflush(ESTIDX *idx);
312 static int est_idx_sync(ESTIDX *idx);
313 static int est_idx_optimize(ESTIDX *idx);
314 static void est_idx_set_current(ESTIDX *idx);
315 static int est_crput(CURIA *curia, int zmode, int id, const char *vbuf, int vsiz, int dmode);
316 static int est_crout(CURIA *curia, int id);
317 static char *est_crget(CURIA *curia, int flags, int id, int *sp);
318 static int est_aidx_seq_put(DEPOT *db, int id, const char *vbuf, int vsiz);
319 static int est_aidx_seq_out(DEPOT *db, int id);
320 static char *est_aidx_seq_get(DEPOT *db, int id, int *sp);
321 static int est_aidx_seq_narrow(DEPOT *db, const CBLIST *pdocs, const char *cop, int sign,
322 const char *oval, int osiz, const char *sval, int ssiz,
323 const void *regex, int onum, ESTSCORE *scores, int snum,
324 int limit, int *restp);
325 static int est_aidx_numcmp(const char *aptr, int asiz, const char *bptr, int bsiz);
326 static int est_aidx_attr_put(VILLA *db, int id, const char *vbuf, int vsiz);
327 static int est_aidx_attr_out(VILLA *db, int id, const char *vbuf, int vsiz);
328 static int est_aidx_attr_narrow(VILLA *db, const CBLIST *pdocs, const char *cop, int sign,
329 const char *oval, int osiz, const char *sval, int ssiz,
330 const void *regex, int onum, ESTSCORE *scores, int snum);
331 static int est_int_compare(const void *ap, const void *bp);
332 static int est_short_compare(const void *ap, const void *bp);
333 static void est_inodes_delete(void *arg);
334 static void est_inodes_delete_informer(const char *msg, void *opaque);
335 static int est_db_write_meta(ESTDB *db);
336 static void est_db_inform(ESTDB *db, const char *info);
337 static void est_db_prepare_meta(ESTDB *db);
338 static int est_db_score_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond, int *scp);
339 static int est_pidx_uri_to_id(ESTDB *db, const char *uri);
340 static CBLIST *est_phrase_terms(const char *phrase);
341 static int est_score_compare_by_id_asc(const void *ap, const void *bp);
342 static int est_score_compare_by_id_desc(const void *ap, const void *bp);
343 static int est_score_compare_by_score_asc(const void *ap, const void *bp);
344 static int est_score_compare_by_score_desc(const void *ap, const void *bp);
345 static int est_score_compare_by_str_asc(const void *ap, const void *bp);
346 static int est_score_compare_by_str_desc(const void *ap, const void *bp);
347 static int est_score_compare_by_num_asc(const void *ap, const void *bp);
348 static int est_score_compare_by_num_desc(const void *ap, const void *bp);
349 static int est_metascore_compare_by_id_asc(const void *ap, const void *bp);
350 static int est_metascore_compare_by_id_desc(const void *ap, const void *bp);
351 static int est_metascore_compare_by_score_asc(const void *ap, const void *bp);
352 static int est_metascore_compare_by_score_desc(const void *ap, const void *bp);
353 static int est_metascore_compare_by_str_asc(const void *ap, const void *bp);
354 static int est_metascore_compare_by_str_desc(const void *ap, const void *bp);
355 static int est_metascore_compare_by_num_asc(const void *ap, const void *bp);
356 static int est_metascore_compare_by_num_desc(const void *ap, const void *bp);
357 static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add);
358 static void est_expand_word_bw(ESTDB *db, const char *word, CBLIST *list);
359 static void est_expand_word_ew(ESTDB *db, const char *word, CBLIST *list);
360 static void est_expand_word_rx(ESTDB *db, const char *word, CBLIST *list);
361 static void est_expand_keyword_bw(ESTDB *db, const char *word, CBLIST *list);
362 static void est_expand_keyword_ew(ESTDB *db, const char *word, CBLIST *list);
363 static void est_expand_keyword_rx(ESTDB *db, const char *word, CBLIST *list);
364 static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
365 void (*xpn)(const char *, CBLIST *),
366 int *nump, CBMAP *hints, int add, int auxmin, CBMAP *auxwords);
367 static const ESTSCORE *est_rescc_get(ESTDB *db, const char *word, int size, int *nump);
368 static void est_rescc_put(ESTDB *db, const char *word, int size, ESTSCORE *scores, int num);
369 static ESTSCORE *est_search_keywords(ESTDB *db, const char *word, int min, int *nump);
370 static void est_weight_keywords(ESTDB *db, const char *word, ESTSCORE *scores, int snum);
371 static ESTSCORE *est_search_rank(ESTDB *db, const char *name, int top, int *nump);
372 static ESTSCORE *est_search_aidx_attr(ESTDB *db, const char *expr, int *nump);
373 static ESTSCORE *est_search_pidxs(ESTDB *db, ESTCOND *cond, ESTSCORE *scores, int *nump,
374 CBMAP *ordattrs);
375 static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, int ign,
376 const char *order, const char *distinct, ESTSCORE *scores, int snum,
377 int limit, int *restp, CBMAP *ordattrs);
378 static ESTCATTR *est_make_cattr_list(const CBLIST *attrs, int *nump);
379 static void est_free_cattr_list(ESTCATTR *list, int anum);
380 static int est_eclipse_scores(ESTDB *db, ESTSCORE *scores, int snum, int num,
381 int vnum, int tfidf, double limit, CBMAP *shadows);
382 static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
383 const char *oval, int osiz, const char *sval, int ssiz,
384 const void *regex, int onum);
385 static int est_check_strand(const char *tval, const char *oval);
386 static int est_check_stror(const char *tval, const char *oval);
387 static int est_check_stroreq(const char *tval, const char *oval);
388 static int est_check_numbt(const char *tval, const char *oval);
389 static int est_keysc_compare(const void *ap, const void *bp);
390 static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
391 int knum, int unum, int mnum, int tfidf,
392 double nmin, int auxmin, CBMAP *auxwords);
393 static CBMAP *est_phrase_vector(const char *phrase);
394 static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf);
395 static int est_url_sameness(const char *aurl, const char *burl);
396 static void est_random_fclose(void);
397 static int est_signal_dispatch(int signum);
398
399
400
401 /*************************************************************************************************
402 * common settings
403 *************************************************************************************************/
404
405
406 /* version of Hyper Estraier */
407 const char *est_version = _EST_VERSION;
408
409
410
411 /*************************************************************************************************
412 * API for document
413 *************************************************************************************************/
414
415
416 /* Create a document object. */
est_doc_new(void)417 ESTDOC *est_doc_new(void){
418 ESTDOC *doc;
419 CB_MALLOC(doc, sizeof(ESTDOC));
420 doc->id = -1;
421 doc->attrs = NULL;
422 doc->dtexts = NULL;
423 doc->kwords = NULL;
424 return doc;
425 }
426
427
428 /* Create a document object made from draft data. */
est_doc_new_from_draft(const char * draft)429 ESTDOC *est_doc_new_from_draft(const char *draft){
430 ESTDOC *doc;
431 CBLIST *lines;
432 const char *line;
433 char *pv, *rp, *ep;
434 int i;
435 assert(draft);
436 doc = est_doc_new();
437 lines = cbsplit(draft, -1, "\n");
438 for(i = 0; i < CB_LISTNUM(lines); i++){
439 line = CB_LISTVAL(lines, i);
440 while(*line > '\0' && *line <= ' '){
441 line++;
442 }
443 if(*line == '\0'){
444 i++;
445 break;
446 }
447 if(*line == '%'){
448 if(cbstrfwmatch(line, ESTDCNTLVECTOR)){
449 if(!doc->kwords) doc->kwords = cbmapopenex(ESTMINIBNUM);
450 if((rp = strchr(line, '\t')) != NULL) rp++;
451 while(rp && (pv = strchr(rp, '\t')) != NULL){
452 pv++;
453 if((ep = strchr(pv, '\t')) != NULL){
454 *ep = '\0';
455 ep++;
456 }
457 if(rp[0] != '\0' && pv[0] != '\0') cbmapput(doc->kwords, rp, pv - rp - 1, pv, -1, TRUE);
458 rp = ep;
459 }
460 } else if(cbstrfwmatch(line, ESTDCNTLSCORE)){
461 if((rp = strchr(line, '\t')) != NULL) est_doc_set_score(doc, atoi(rp + 1));
462 }
463 } else if((pv = strchr(line, '=')) != NULL){
464 *(pv++) = '\0';
465 est_doc_add_attr(doc, line, pv);
466 }
467 }
468 for(; i < CB_LISTNUM(lines); i++){
469 line = CB_LISTVAL(lines, i);
470 if(*line == '\t'){
471 est_doc_add_hidden_text(doc, line + 1);
472 } else {
473 est_doc_add_text(doc, line);
474 }
475 }
476 CB_LISTCLOSE(lines);
477 return doc;
478 }
479
480
481 /* Destroy a document object. */
est_doc_delete(ESTDOC * doc)482 void est_doc_delete(ESTDOC *doc){
483 assert(doc);
484 if(doc->kwords) cbmapclose(doc->kwords);
485 if(doc->dtexts) CB_LISTCLOSE(doc->dtexts);
486 if(doc->attrs) cbmapclose(doc->attrs);
487 free(doc);
488 }
489
490
491 /* Add an attribute to a document object. */
est_doc_add_attr(ESTDOC * doc,const char * name,const char * value)492 void est_doc_add_attr(ESTDOC *doc, const char *name, const char *value){
493 char *rbuf, *wp;
494 int len;
495 assert(doc && name);
496 if(name[0] == '\0' || name[0] == '%') return;
497 if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
498 if(value){
499 rbuf = cbmemdup(value, -1);
500 for(wp = rbuf; *wp != '\0'; wp++){
501 if(*wp > 0 && *wp < ' ') *wp = ' ';
502 }
503 cbstrsqzspc(rbuf);
504 if((len = strlen(name)) > 0) cbmapput(doc->attrs, name, len, rbuf, -1, TRUE);
505 free(rbuf);
506 } else {
507 cbmapout(doc->attrs, name, -1);
508 }
509 }
510
511
512 /* Add a sentence of text to a document object. */
est_doc_add_text(ESTDOC * doc,const char * text)513 void est_doc_add_text(ESTDOC *doc, const char *text){
514 unsigned char *utext;
515 char *rtext, *wp;
516 int size;
517 assert(doc && text);
518 while(*text > '\0' && *text <= ' '){
519 text++;
520 }
521 if(text[0] == '\0') return;
522 if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
523 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
524 est_normalize_text(utext, size, &size);
525 rtext = est_uconv_out((char *)utext, size, NULL);
526 for(wp = rtext; *wp != '\0'; wp++){
527 if(*wp > 0 && *wp < ' ') *wp = ' ';
528 }
529 cbstrsqzspc(rtext);
530 if(rtext[0] != '\0'){
531 CB_LISTPUSHBUF(doc->dtexts, rtext, strlen(rtext));
532 } else {
533 free(rtext);
534 }
535 free(utext);
536 }
537
538
539 /* Add a hidden sentence to a document object. */
est_doc_add_hidden_text(ESTDOC * doc,const char * text)540 void est_doc_add_hidden_text(ESTDOC *doc, const char *text){
541 unsigned char *utext;
542 char *rtext, *wp;
543 int size;
544 assert(doc && text);
545 while(*text > '\0' && *text <= ' '){
546 text++;
547 }
548 if(text[0] == '\0') return;
549 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
550 est_normalize_text(utext, size, &size);
551 rtext = est_uconv_out((char *)utext, size, NULL);
552 for(wp = rtext; *wp != '\0'; wp++){
553 if(*wp > 0 && *wp < ' ') *wp = ' ';
554 }
555 cbstrsqzspc(rtext);
556 if(rtext[0] != '\0'){
557 if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
558 if(cbmapget(doc->attrs, "", 0, NULL)) cbmapputcat(doc->attrs, "", 0, " ", 1);
559 cbmapputcat(doc->attrs, "", 0, rtext, -1);
560 }
561 free(rtext);
562 free(utext);
563 }
564
565
566 /* Attach keywords to a document object. */
est_doc_set_keywords(ESTDOC * doc,CBMAP * kwords)567 void est_doc_set_keywords(ESTDOC *doc, CBMAP *kwords){
568 assert(doc && kwords);
569 if(doc->kwords) cbmapclose(doc->kwords);
570 doc->kwords = cbmapdup(kwords);
571 }
572
573
574 /* Set the substitute score of a document object. */
est_doc_set_score(ESTDOC * doc,int score)575 void est_doc_set_score(ESTDOC *doc, int score){
576 char numbuf[ESTNUMBUFSIZ];
577 assert(doc);
578 if(!doc->attrs) doc->attrs = cbmapopenex(ESTMINIBNUM);
579 if(score >= 0){
580 sprintf(numbuf, "%d", score);
581 cbmapput(doc->attrs, "\t", 1, numbuf, -1, TRUE);
582 } else {
583 cbmapout(doc->attrs, "\t", 1);
584 }
585 }
586
587
588 /* Get the ID number of a document object. */
est_doc_id(ESTDOC * doc)589 int est_doc_id(ESTDOC *doc){
590 assert(doc);
591 return doc->id;
592 }
593
594
595 /* Get a list of attribute names of a document object. */
est_doc_attr_names(ESTDOC * doc)596 CBLIST *est_doc_attr_names(ESTDOC *doc){
597 CBLIST *names;
598 const char *kbuf;
599 int ksiz;
600 assert(doc);
601 if(!doc->attrs){
602 CB_LISTOPEN(names);
603 return names;
604 }
605 CB_LISTOPEN(names);
606 cbmapiterinit(doc->attrs);
607 while((kbuf = cbmapiternext(doc->attrs, &ksiz)) != NULL){
608 if(ksiz > 0 && kbuf[0] != '\t') CB_LISTPUSH(names, kbuf, ksiz);
609 }
610 cblistsort(names);
611 return names;
612 }
613
614
615 /* Get the value of an attribute of a document object. */
est_doc_attr(ESTDOC * doc,const char * name)616 const char *est_doc_attr(ESTDOC *doc, const char *name){
617 assert(doc && name);
618 if(!doc->attrs || name[0] == '\0') return NULL;
619 return cbmapget(doc->attrs, name, -1, NULL);
620 }
621
622
623 /* Get a list of sentences of the text of a document object. */
est_doc_texts(ESTDOC * doc)624 const CBLIST *est_doc_texts(ESTDOC *doc){
625 assert(doc);
626 if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
627 return doc->dtexts;
628 }
629
630
631 /* Concatenate sentences of the text of a document object. */
est_doc_cat_texts(ESTDOC * doc)632 char *est_doc_cat_texts(ESTDOC *doc){
633 CBDATUM *datum;
634 const char *elem;
635 int i, size;
636 if(!doc->dtexts) return cbmemdup("", 0);
637 CB_DATUMOPEN(datum);
638 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
639 elem = CB_LISTVAL2(doc->dtexts, i, size);
640 if(i > 0) CB_DATUMCAT(datum, " ", 1);
641 CB_DATUMCAT(datum, elem, size);
642 }
643 return cbdatumtomalloc(datum, NULL);
644 }
645
646
647 /* Get attached keywords of a document object. */
est_doc_keywords(ESTDOC * doc)648 CBMAP *est_doc_keywords(ESTDOC *doc){
649 assert(doc);
650 return doc->kwords;
651 }
652
653
654 /* Get the substitute score of a document object. */
est_doc_score(ESTDOC * doc)655 int est_doc_score(ESTDOC *doc){
656 const char *vbuf;
657 assert(doc);
658 if(doc->attrs && (vbuf = cbmapget(doc->attrs, "\t", 1, NULL)) != NULL) return atoi(vbuf);
659 return -1;
660 }
661
662
663 /* Dump draft data of a document object. */
est_doc_dump_draft(ESTDOC * doc)664 char *est_doc_dump_draft(ESTDOC *doc){
665 CBLIST *list;
666 CBDATUM *datum;
667 const char *kbuf, *vbuf;
668 int i, ksiz, vsiz;
669 assert(doc);
670 CB_DATUMOPEN(datum);
671 if(doc->attrs){
672 list = est_doc_attr_names(doc);
673 for(i = 0; i < CB_LISTNUM(list); i++){
674 kbuf = CB_LISTVAL2(list, i, ksiz);
675 vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz);
676 CB_DATUMCAT(datum, kbuf, ksiz);
677 CB_DATUMCAT(datum, "=", 1);
678 CB_DATUMCAT(datum, vbuf, vsiz);
679 CB_DATUMCAT(datum, "\n", 1);
680 }
681 CB_LISTCLOSE(list);
682 }
683 if(doc->kwords && cbmaprnum(doc->kwords) > 0){
684 CB_DATUMCAT(datum, ESTDCNTLVECTOR, strlen(ESTDCNTLVECTOR));
685 cbmapiterinit(doc->kwords);
686 while((kbuf = cbmapiternext(doc->kwords, &ksiz)) != NULL){
687 CB_MAPITERVAL(vbuf, kbuf, vsiz);
688 CB_DATUMCAT(datum, "\t", 1);
689 CB_DATUMCAT(datum, kbuf, ksiz);
690 CB_DATUMCAT(datum, "\t", 1);
691 CB_DATUMCAT(datum, vbuf, vsiz);
692 }
693 CB_DATUMCAT(datum, "\n", 1);
694 }
695 if(doc->attrs && (vbuf = cbmapget(doc->attrs, "\t", 1, &vsiz)) != NULL){
696 CB_DATUMCAT(datum, ESTDCNTLSCORE, strlen(ESTDCNTLSCORE));
697 CB_DATUMCAT(datum, "\t", 1);
698 CB_DATUMCAT(datum, vbuf, vsiz);
699 CB_DATUMCAT(datum, "\n", 1);
700 }
701 CB_DATUMCAT(datum, "\n", 1);
702 if(doc->dtexts){
703 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
704 kbuf = CB_LISTVAL2(doc->dtexts, i, ksiz);
705 CB_DATUMCAT(datum, kbuf, ksiz);
706 CB_DATUMCAT(datum, "\n", 1);
707 }
708 }
709 if(doc->attrs && (vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
710 CB_DATUMCAT(datum, "\t", 1);
711 CB_DATUMCAT(datum, vbuf, vsiz);
712 CB_DATUMCAT(datum, "\n", 1);
713 }
714 return cbdatumtomalloc(datum, NULL);
715 }
716
717
718 /* Make a snippet of the body text of a document object. */
est_doc_make_snippet(ESTDOC * doc,const CBLIST * words,int wwidth,int hwidth,int awidth)719 char *est_doc_make_snippet(ESTDOC *doc, const CBLIST *words, int wwidth, int hwidth, int awidth){
720 CBDATUM *sbuf;
721 const char *text;
722 char *snippet;
723 int i, size;
724 assert(doc && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
725 if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
726 CB_DATUMOPEN(sbuf);
727 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
728 text = CB_LISTVAL2(doc->dtexts, i, size);
729 if(i > 0) CB_DATUMCAT(sbuf, " ", 1);
730 CB_DATUMCAT(sbuf, text, size);
731 }
732 snippet = est_make_snippet(CB_DATUMPTR(sbuf), CB_DATUMSIZE(sbuf),
733 words, wwidth, hwidth, awidth);
734 CB_DATUMCLOSE(sbuf);
735 return snippet;
736 }
737
738
739
740 /*************************************************************************************************
741 * API for search conditions
742 *************************************************************************************************/
743
744
745 /* Create a condition object. */
est_cond_new(void)746 ESTCOND *est_cond_new(void){
747 ESTCOND *cond;
748 CB_MALLOC(cond, sizeof(ESTCOND));
749 cond->phrase = NULL;
750 cond->gstep = 2;
751 cond->tfidf = TRUE;
752 cond->pmode = ESTPMUSUAL;
753 cond->cbxpn = NULL;
754 cond->attrs = NULL;
755 cond->order = NULL;
756 cond->max = -1;
757 cond->skip = 0;
758 cond->auxmin = ESTAUXMIN;
759 cond->auxwords = NULL;
760 cond->scfb = FALSE;
761 cond->scores = NULL;
762 cond->snum = 0;
763 cond->nscores = NULL;
764 cond->nsnum = -1;
765 cond->opts = 0;
766 cond->ecllim = -1.0;
767 cond->shadows = NULL;
768 cond->distinct = NULL;
769 cond->mask = 0;
770 return cond;
771 }
772
773
774 /* Destroy a condition object. */
est_cond_delete(ESTCOND * cond)775 void est_cond_delete(ESTCOND *cond){
776 assert(cond);
777 if(cond->distinct) free(cond->distinct);
778 if(cond->shadows) cbmapclose(cond->shadows);
779 if(cond->auxwords) cbmapclose(cond->auxwords);
780 if(cond->scores) free(cond->scores);
781 if(cond->order) free(cond->order);
782 if(cond->attrs) CB_LISTCLOSE(cond->attrs);
783 if(cond->phrase) free(cond->phrase);
784 free(cond);
785 }
786
787
788 /* Set a search phrase to a condition object. */
est_cond_set_phrase(ESTCOND * cond,const char * phrase)789 void est_cond_set_phrase(ESTCOND *cond, const char *phrase){
790 assert(cond && phrase);
791 if(cond->phrase) free(cond->phrase);
792 while(*phrase > '\0' && *phrase <= ' '){
793 phrase++;
794 }
795 cond->phrase = cbmemdup(phrase, -1);
796 }
797
798
799 /* Add a condition of an attribute fo a condition object. */
est_cond_add_attr(ESTCOND * cond,const char * expr)800 void est_cond_add_attr(ESTCOND *cond, const char *expr){
801 assert(cond && expr);
802 while(*expr > '\0' && *expr <= ' '){
803 expr++;
804 }
805 if(*expr == '\0') return;
806 if(!cond->attrs) CB_LISTOPEN(cond->attrs);
807 CB_LISTPUSH(cond->attrs, expr, strlen(expr));
808 }
809
810
811 /* Set the order of a condition object. */
est_cond_set_order(ESTCOND * cond,const char * expr)812 void est_cond_set_order(ESTCOND *cond, const char *expr){
813 assert(cond && expr);
814 while(*expr > '\0' && *expr <= ' '){
815 expr++;
816 }
817 if(*expr == '\0') return;
818 if(cond->order) free(cond->order);
819 cond->order = cbmemdup(expr, -1);
820 }
821
822
823 /* Set the maximum number of retrieval of a condition object. */
est_cond_set_max(ESTCOND * cond,int max)824 void est_cond_set_max(ESTCOND *cond, int max){
825 assert(cond && max >= 0);
826 cond->max = max;
827 }
828
829
830 /* Set the number of skipped documents of a condition object. */
est_cond_set_skip(ESTCOND * cond,int skip)831 void est_cond_set_skip(ESTCOND *cond, int skip){
832 assert(cond && skip >= 0);
833 cond->skip = skip;
834 }
835
836
837 /* Set options of retrieval of a condition object. */
est_cond_set_options(ESTCOND * cond,int options)838 void est_cond_set_options(ESTCOND *cond, int options){
839 assert(cond);
840 if(options & ESTCONDSURE) cond->gstep = 1;
841 if(options & ESTCONDUSUAL) cond->gstep = 2;
842 if(options & ESTCONDFAST) cond->gstep = 3;
843 if(options & ESTCONDAGITO) cond->gstep = 4;
844 if(options & ESTCONDNOIDF) cond->tfidf = FALSE;
845 if(options & ESTCONDSIMPLE) cond->pmode = ESTPMSIMPLE;
846 if(options & ESTCONDROUGH) cond->pmode = ESTPMROUGH;
847 if(options & ESTCONDUNION) cond->pmode = ESTPMUNION;
848 if(options & ESTCONDISECT) cond->pmode = ESTPMISECT;
849 if(options & ESTCONDSCFB) cond->scfb = TRUE;
850 cond->opts |= options;
851 }
852
853
854 /* Set permission to adopt result of the auxiliary index. */
est_cond_set_auxiliary(ESTCOND * cond,int min)855 void est_cond_set_auxiliary(ESTCOND *cond, int min){
856 assert(cond);
857 cond->auxmin = min;
858 }
859
860
861 /* Set the upper limit of similarity for document eclipse. */
est_cond_set_eclipse(ESTCOND * cond,double limit)862 void est_cond_set_eclipse(ESTCOND *cond, double limit){
863 assert(cond);
864 if(limit > 0.0) cond->ecllim = limit;
865 }
866
867
868 /* Set the attribute distinction filter. */
est_cond_set_distinct(ESTCOND * cond,const char * name)869 void est_cond_set_distinct(ESTCOND *cond, const char *name){
870 assert(cond && name);
871 while(*name > '\0' && *name <= ' '){
872 name++;
873 }
874 if(*name == '\0') return;
875 if(cond->distinct) free(cond->distinct);
876 cond->distinct = cbmemdup(name, -1);
877 }
878
879
880 /* Set the mask of targets of meta search. */
est_cond_set_mask(ESTCOND * cond,int mask)881 void est_cond_set_mask(ESTCOND *cond, int mask){
882 assert(cond);
883 cond->mask = mask & INT_MAX;
884 }
885
886
887
888 /*************************************************************************************************
889 * API for database
890 *************************************************************************************************/
891
892
893 /* Inode map for duplication check. */
894 CBMAP *est_inodes = NULL;
895
896
897 /* Get the string of an error code. */
est_err_msg(int ecode)898 const char *est_err_msg(int ecode){
899 switch(ecode){
900 case ESTENOERR: return "no error";
901 case ESTEINVAL: return "invalid argument";
902 case ESTEACCES: return "access forbidden";
903 case ESTELOCK: return "lock failure";
904 case ESTEDB: return "database problem";
905 case ESTEIO: return "I/O problem";
906 case ESTENOITEM: return "no such item";
907 default: break;
908 }
909 return "miscellaneous";
910 }
911
912
913 /* Open a database. */
est_db_open(const char * name,int omode,int * ecp)914 ESTDB *est_db_open(const char *name, int omode, int *ecp){
915 ESTDB *db;
916 DEPOT *metadb;
917 ESTIDX *idxdb;
918 CURIA *attrdb, *textdb, *kwddb;
919 VILLA *fwmdb, *auxdb, *xfmdb, *listdb;
920 CBMAP *aidxs;
921 CBLIST *list;
922 ESTATTRIDX attridx;
923 void *aidxdb;
924 const char *elem;
925 char path[ESTPATHBUFSIZ], vbuf[ESTNUMBUFSIZ], *dec;
926 int i, inode, domode, comode, vomode, flags, idxnum, dseq, dnum;
927 int amode, zmode, smode, vsiz, type, crdnum;
928 double bdiam, ddiam;
929 assert(name && ecp);
930 if(!est_inodes){
931 est_inodes = cbmapopenex(ESTMINIBNUM);
932 cbglobalgc(est_inodes, est_inodes_delete);
933 }
934 est_set_ecode(ecp, ESTENOERR, __LINE__);
935 if((omode & ESTDBWRITER) && (omode & ESTDBCREAT) && !est_mkdir(name)){
936 switch(errno){
937 case EACCES:
938 est_set_ecode(ecp, ESTEACCES, __LINE__);
939 return NULL;
940 case EEXIST:
941 break;
942 default:
943 est_set_ecode(ecp, ESTEIO, __LINE__);
944 return NULL;
945 }
946 }
947 if((inode = est_inode(name)) < 1){
948 est_set_ecode(ecp, ESTEIO, __LINE__);
949 return NULL;
950 }
951 if(cbmapget(est_inodes, (char *)&inode, sizeof(int), NULL) && !(omode & ESTDBNOLCK)){
952 est_set_ecode(ecp, ESTEACCES, __LINE__);
953 return NULL;
954 }
955 domode = DP_OREADER;
956 comode = CR_OREADER;
957 vomode = VL_OREADER;
958 if(omode & ESTDBWRITER){
959 domode = DP_OWRITER;
960 comode = CR_OWRITER;
961 vomode = VL_OWRITER;
962 if(ESTUSEBZIP){
963 vomode |= VL_OXCOMP;
964 } else if(ESTUSELZO){
965 vomode |= VL_OYCOMP;
966 } else if(ESTUSEZLIB){
967 vomode |= VL_OZCOMP;
968 }
969 if(omode & ESTDBCREAT){
970 domode |= DP_OCREAT;
971 comode |= CR_OCREAT;
972 vomode |= VL_OCREAT;
973 }
974 if(omode & ESTDBTRUNC){
975 domode |= DP_OTRUNC;
976 comode |= CR_OTRUNC;
977 vomode |= VL_OTRUNC;
978 }
979 }
980 if(omode & ESTDBNOLCK){
981 domode |= DP_ONOLCK;
982 comode |= CR_ONOLCK;
983 vomode |= VL_ONOLCK;
984 }
985 if(omode & ESTDBLCKNB){
986 domode |= DP_OLCKNB;
987 comode |= CR_OLCKNB;
988 vomode |= VL_OLCKNB;
989 }
990 flags = 0;
991 idxnum = 0;
992 dseq = 0;
993 dnum = 0;
994 amode = 0;
995 zmode = 0;
996 smode = 0;
997 if(omode & ESTDBSMALL){
998 bdiam = ESTDBSBRAT;
999 ddiam = ESTDBSDRAT;
1000 } else if(omode & ESTDBLARGE){
1001 bdiam = ESTDBLBRAT;
1002 ddiam = ESTDBLDRAT;
1003 } else if(omode & ESTDBHUGE){
1004 bdiam = ESTDBHBRAT;
1005 ddiam = ESTDBHDRAT;
1006 } else if(omode & ESTDBHUGE2){
1007 bdiam = ESTDBHBRAT * ESTDBH2RAT;
1008 ddiam = ESTDBHDRAT * ESTDBH2RAT;
1009 } else if(omode & ESTDBHUGE3){
1010 bdiam = ESTDBHBRAT * ESTDBH3RAT;
1011 ddiam = ESTDBHDRAT * ESTDBH3RAT;
1012 } else {
1013 bdiam = 1.0;
1014 ddiam = 1.0;
1015 }
1016 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
1017 if((metadb = dpopen(path, domode, ESTMINIBNUM)) != NULL){
1018 flags = dpgetflags(metadb);
1019 if(dprnum(metadb) < 1){
1020 if(omode & ESTDBPERFNG){
1021 flags |= ESTDFPERFNG;
1022 } else if(omode & ESTDBCHRCAT){
1023 flags |= ESTDFCHRCAT;
1024 }
1025 if(ESTUSEBZIP){
1026 flags |= ESTDFBZIP;
1027 } else if(ESTUSELZO){
1028 flags |= ESTDFLZO;
1029 } else if(ESTUSEZLIB){
1030 flags |= ESTDFZLIB;
1031 }
1032 if(omode & ESTDBSCVOID){
1033 flags |= ESTDFSCVOID;
1034 } else if(omode & ESTDBSCINT){
1035 flags |= ESTDFSCINT;
1036 } else if(omode & ESTDBSCASIS){
1037 flags |= ESTDFSCASIS;
1038 }
1039 dpsetflags(metadb, flags);
1040 }
1041 if((vsiz = dpgetwb(metadb, ESTKEYIDXNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
1042 vbuf[vsiz] = '\0';
1043 idxnum = atoi(vbuf);
1044 }
1045 if((vsiz = dpgetwb(metadb, ESTKEYDSEQ, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
1046 vbuf[vsiz] = '\0';
1047 dseq = atoi(vbuf);
1048 }
1049 if((vsiz = dpgetwb(metadb, ESTKEYDNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
1050 vbuf[vsiz] = '\0';
1051 dnum = atoi(vbuf);
1052 }
1053 if(flags & ESTDFPERFNG){
1054 amode = ESTDFPERFNG;
1055 } else if(flags & ESTDFCHRCAT){
1056 amode = ESTDFCHRCAT;
1057 }
1058 if(flags & ESTDFZLIB){
1059 zmode = ESTDFZLIB;
1060 } else if(flags & ESTDFLZO){
1061 zmode = ESTDFLZO;
1062 } else if(flags & ESTDFBZIP){
1063 zmode = ESTDFBZIP;
1064 }
1065 if(flags & ESTDFSCVOID){
1066 smode = ESTDFSCVOID;
1067 } else if(flags & ESTDFSCINT){
1068 smode = ESTDFSCINT;
1069 } else if(flags & ESTDFSCASIS){
1070 smode = ESTDFSCASIS;
1071 }
1072 } else {
1073 est_set_ecode(ecp, dpecode == DP_ELOCK ? ESTELOCK : ESTEDB, __LINE__);
1074 return NULL;
1075 }
1076 if(idxnum < 1) idxnum = 1;
1077 if(dseq < 0) dseq = 0;
1078 if(dnum < 0) dnum = 0;
1079 crdnum = vlcrdnum;
1080 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME);
1081 idxdb = est_idx_open(path, vomode, idxnum);
1082 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTFWMDBNAME);
1083 vlcrdnum = ESTVLCRDNUM;
1084 fwmdb = vlopen(path, vomode, VL_CMPLEX);
1085 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTAUXDBNAME);
1086 vlcrdnum = ESTVLCRDNAUX;
1087 auxdb = vlopen(path, vomode, VL_CMPLEX);
1088 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTXFMDBNAME);
1089 vlcrdnum = ESTVLCRDNUM;
1090 xfmdb = vlopen(path, vomode, VL_CMPLEX);
1091 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
1092 attrdb = cropen(path, comode, ESTATTRDBBNUM * bdiam, ESTATTRDBDNUM * ddiam);
1093 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
1094 textdb = cropen(path, comode, ESTTEXTDBBNUM * bdiam, ESTTEXTDBDNUM * ddiam);
1095 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTKWDDBNAME);
1096 kwddb = cropen(path, comode, ESTKWDDBBNUM * bdiam, ESTKWDDBDNUM * ddiam);
1097 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
1098 vlcrdnum = ESTVLCRDNUM;
1099 listdb = vlopen(path, vomode, VL_CMPLEX);
1100 vlcrdnum = crdnum;
1101 if(!idxdb || !fwmdb || !auxdb || !xfmdb || !attrdb ||!textdb || !kwddb || !listdb){
1102 if(listdb) vlclose(listdb);
1103 if(kwddb) crclose(kwddb);
1104 if(textdb) crclose(textdb);
1105 if(attrdb) crclose(attrdb);
1106 if(xfmdb) vlclose(xfmdb);
1107 if(auxdb) vlclose(auxdb);
1108 if(fwmdb) vlclose(fwmdb);
1109 if(idxdb) est_idx_close(idxdb);
1110 dpclose(metadb);
1111 est_set_ecode(ecp, ESTEDB, __LINE__);
1112 return NULL;
1113 }
1114 if(omode & ESTDBWRITER){
1115 est_idx_set_tuning(idxdb, amode == ESTDFPERFNG ? ESTIDXDBLRMA : ESTIDXDBLRM, ESTIDXDBNIM,
1116 ESTIDXDBLCN, ESTIDXDBNCN, ESTIDXDBFBP);
1117 est_idx_set_current(idxdb);
1118 vlsettuning(fwmdb, ESTFWMDBLRM, ESTFWMDBNIM, ESTFWMDBLCN, ESTFWMDBNCN);
1119 vlsetfbpsiz(fwmdb, ESTFWMDBFBP);
1120 vlsettuning(auxdb, ESTAUXDBLRM, ESTAUXDBNIM, ESTAUXDBLCN, ESTAUXDBNCN);
1121 vlsetfbpsiz(auxdb, ESTAUXDBFBP);
1122 vlsettuning(xfmdb, ESTXFMDBLRM, ESTXFMDBNIM, ESTXFMDBLCN, ESTXFMDBNCN);
1123 vlsetfbpsiz(xfmdb, ESTXFMDBFBP);
1124 crsetalign(attrdb, ESTATTRDBALN);
1125 crsetfbpsiz(attrdb, ESTATTRDBFBP);
1126 crsetalign(textdb, ESTTEXTDBALN);
1127 crsetfbpsiz(textdb, ESTTEXTDBFBP);
1128 crsetalign(kwddb, ESTKWDDBALN);
1129 crsetfbpsiz(kwddb, ESTKWDDBFBP);
1130 vlsettuning(listdb, ESTLISTDBLRM, ESTLISTDBNIM, ESTLISTDBLCN, ESTLISTDBNCN);
1131 vlsetfbpsiz(listdb, ESTLISTDBFBP);
1132 } else {
1133 est_idx_set_tuning(idxdb, -1, -1,
1134 amode == ESTDFPERFNG ? ESTIDXDBRLCNA : ESTIDXDBRLCN, ESTIDXDBRNCN, -1);
1135 vlsettuning(fwmdb, -1, -1, ESTFWMDBLCN, ESTFWMDBNCN);
1136 vlsettuning(auxdb, -1, -1, ESTAUXDBRLCN, ESTAUXDBRNCN);
1137 vlsettuning(xfmdb, -1, -1, ESTXFMDBLCN, ESTXFMDBNCN);
1138 vlsettuning(listdb, -1, -1, ESTLISTDBLCN, ESTLISTDBNCN);
1139 }
1140 if((omode & ESTDBWRITER) && (omode & ESTDBTRUNC) && (list = cbdirlist(name)) != NULL){
1141 for(i = 0; i < CB_LISTNUM(list); i++){
1142 elem = CB_LISTVAL(list, i);
1143 if(cbstrfwmatch(elem, ESTAISEQPREF) || cbstrfwmatch(elem, ESTAISTRPREF) ||
1144 cbstrfwmatch(elem, ESTAINUMPREF)){
1145 sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
1146 if(unlink(path) == -1) est_rmdir_rec(path);
1147 }
1148 }
1149 CB_LISTCLOSE(list);
1150 }
1151 aidxs = cbmapopenex(ESTMINIBNUM);
1152 if((list = cbdirlist(name)) != NULL){
1153 for(i = 0; i < CB_LISTNUM(list); i++){
1154 elem = CB_LISTVAL(list, i);
1155 dec = NULL;
1156 type = -1;
1157 if(cbstrfwmatch(elem, ESTAISEQPREF)){
1158 dec = est_hex_decode(elem + strlen(ESTAISEQPREF));
1159 type = ESTIDXATTRSEQ;
1160 } else if(cbstrfwmatch(elem, ESTAISTRPREF)){
1161 dec = est_hex_decode(elem + strlen(ESTAISTRPREF));
1162 type = ESTIDXATTRSTR;
1163 } else if(cbstrfwmatch(elem, ESTAINUMPREF)){
1164 dec = est_hex_decode(elem + strlen(ESTAINUMPREF));
1165 type = ESTIDXATTRNUM;
1166 }
1167 if(dec){
1168 sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
1169 switch(type){
1170 case ESTIDXATTRSTR:
1171 if((aidxdb = vlopen(path, vomode, VL_CMPLEX)) != NULL){
1172 vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1173 vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1174 attridx.db = aidxdb;
1175 attridx.type = type;
1176 cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1177 }
1178 break;
1179 case ESTIDXATTRNUM:
1180 if((aidxdb = vlopen(path, vomode, est_aidx_numcmp)) != NULL){
1181 vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1182 vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1183 attridx.db = aidxdb;
1184 attridx.type = type;
1185 cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1186 }
1187 break;
1188 default:
1189 if((aidxdb = dpopen(path, domode, crbnum(attrdb) / ESTAIBDIAM)) != NULL){
1190 dpsetfbpsiz(aidxdb, ESTAIDXDPFBP);
1191 attridx.db = aidxdb;
1192 attridx.type = type;
1193 cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1194 }
1195 break;
1196 }
1197 free(dec);
1198 }
1199 }
1200 CB_LISTCLOSE(list);
1201 }
1202 CB_MALLOC(db, sizeof(ESTDB));
1203 db->name = cbmemdup(name, -1);
1204 db->inode = inode;
1205 db->metadb = metadb;
1206 db->idxdb = idxdb;
1207 db->fwmdb = fwmdb;
1208 db->auxdb = auxdb;
1209 db->xfmdb = xfmdb;
1210 db->attrdb = attrdb;
1211 db->textdb = textdb;
1212 db->kwddb = kwddb;
1213 db->listdb = listdb;
1214 db->aidxs = aidxs;
1215 CB_LISTOPEN(db->pdocs);
1216 db->puris = NULL;
1217 est_set_ecode(&(db->ecode), ESTENOERR, __LINE__);
1218 db->fatal = FALSE;
1219 db->dseq = dseq;
1220 db->dnum = dnum;
1221 db->amode = amode;
1222 db->zmode = zmode;
1223 db->smode = smode;
1224 if(omode & ESTDBWRITER){
1225 db->idxcc = cbmapopenex(ESTIDXCCBNUM);
1226 db->auxcc = cbmapopenex(ESTAUXCCBNUM);
1227 db->icsiz = 0;
1228 db->icmax = ESTIDXCCMAX;
1229 db->outcc = cbmapopenex(ESTOUTCCBNUM);
1230 } else {
1231 db->idxcc = cbmapopenex(1);
1232 db->auxcc = cbmapopenex(1);
1233 db->icsiz = 0;
1234 db->icmax = 0;
1235 db->outcc = cbmapopenex(1);
1236 }
1237 db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
1238 db->kcmnum = ESTKEYCCMNUM;
1239 db->attrcc = cbmapopenex(ESTATTRCCMNUM + 1);
1240 db->acmnum = ESTATTRCCMNUM;
1241 db->textcc = cbmapopenex(ESTTEXTCCMNUM + 1);
1242 db->tcmnum = ESTTEXTCCMNUM;
1243 db->veccc = cbmapopenex(ESTATTRCCMNUM / 2 + 1);
1244 db->vcmnum = ESTATTRCCMNUM / 2;
1245 db->rescc = cbmapopenex(ESTRESCCMNUM * 2 + 1);
1246 db->rcmnum = ESTRESCCMNUM;
1247 db->spacc = NULL;
1248 db->scmnum = 0;
1249 db->scname = NULL;
1250 db->infocb = NULL;
1251 db->infoop = NULL;
1252 db->dfdb = NULL;
1253 db->metacc = NULL;
1254 db->wildmax = ESTWILDMAX;
1255 db->flsflag = FALSE;
1256 db->intflag = FALSE;
1257 cbmapput(est_inodes, (char *)&inode, sizeof(int), (char *)&db, sizeof(ESTDB *), FALSE);
1258 return db;
1259 }
1260
1261
1262 /* Close a database. */
est_db_close(ESTDB * db,int * ecp)1263 int est_db_close(ESTDB *db, int *ecp){
1264 ESTATTRIDX *attridx;
1265 const char *kbuf;
1266 int err;
1267 assert(db && ecp);
1268 est_set_ecode(ecp, ESTENOERR, __LINE__);
1269 err = FALSE;
1270 cbmapout(est_inodes, (char *)&(db->inode), sizeof(int));
1271 if(dpwritable(db->metadb)){
1272 if(!est_db_flush(db, -1)) err = TRUE;
1273 if(!est_db_write_meta(db)) err = TRUE;
1274 }
1275 est_db_inform(db, "closing");
1276 if(db->metacc) cbmapclose(db->metacc);
1277 if(db->spacc){
1278 free(db->scname);
1279 cbmapclose(db->spacc);
1280 }
1281 cbmapclose(db->rescc);
1282 cbmapclose(db->veccc);
1283 cbmapclose(db->textcc);
1284 cbmapclose(db->attrcc);
1285 cbmapclose(db->keycc);
1286 cbmapclose(db->outcc);
1287 cbmapclose(db->auxcc);
1288 cbmapclose(db->idxcc);
1289 if(db->puris) cbmapclose(db->puris);
1290 CB_LISTCLOSE(db->pdocs);
1291 cbmapiterinit(db->aidxs);
1292 while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
1293 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1294 switch(attridx->type){
1295 case ESTIDXATTRSTR:
1296 case ESTIDXATTRNUM:
1297 if(!vlclose(attridx->db)) err = TRUE;
1298 break;
1299 default:
1300 if(!dpclose(attridx->db)) err = TRUE;
1301 break;
1302 }
1303 }
1304 cbmapclose(db->aidxs);
1305 if(!vlclose(db->listdb)) err = TRUE;
1306 if(!crclose(db->kwddb)) err = TRUE;
1307 if(!crclose(db->textdb)) err = TRUE;
1308 if(!crclose(db->attrdb)) err = TRUE;
1309 if(!vlclose(db->xfmdb)) err = TRUE;
1310 if(!vlclose(db->auxdb)) err = TRUE;
1311 if(!vlclose(db->fwmdb)) err = TRUE;
1312 if(!est_idx_close(db->idxdb)) err = TRUE;
1313 if(!dpclose(db->metadb)) err = TRUE;
1314 free(db->name);
1315 if(db->fatal){
1316 est_set_ecode(ecp, db->ecode, __LINE__);
1317 err = TRUE;
1318 } else if(err){
1319 est_set_ecode(ecp, ESTEDB, __LINE__);
1320 }
1321 free(db);
1322 return err ? FALSE : TRUE;
1323 }
1324
1325
1326 /* Get the last happended error code of a database. */
est_db_error(ESTDB * db)1327 int est_db_error(ESTDB *db){
1328 assert(db);
1329 return db->ecode;
1330 }
1331
1332
1333 /* Check whether a database has a fatal error. */
est_db_fatal(ESTDB * db)1334 int est_db_fatal(ESTDB *db){
1335 assert(db);
1336 return db->fatal;
1337 }
1338
1339
1340 /* Add an index for narrowing or sorting with document attributes. */
est_db_add_attr_index(ESTDB * db,const char * name,int type)1341 int est_db_add_attr_index(ESTDB *db, const char *name, int type){
1342 ESTATTRIDX attridx;
1343 ESTSCORE *scores;
1344 void *aidxdb;
1345 char path[ESTPATHBUFSIZ], *enc, *vbuf;
1346 int i, domode, vomode, crdnum, err, snum;
1347 assert(db && name);
1348 if(!dpwritable(db->metadb)){
1349 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1350 return FALSE;
1351 }
1352 if(cbmapget(db->aidxs, name, -1, NULL)){
1353 est_set_ecode(&(db->ecode), ESTEMISC, __LINE__);
1354 return FALSE;
1355 }
1356 enc = est_hex_encode(name);
1357 switch(type){
1358 case ESTIDXATTRSEQ:
1359 sprintf(path, "%s%c%s%s", db->name, ESTPATHCHR, ESTAISEQPREF, enc);
1360 break;
1361 case ESTIDXATTRSTR:
1362 sprintf(path, "%s%c%s%s", db->name, ESTPATHCHR, ESTAISTRPREF, enc);
1363 break;
1364 case ESTIDXATTRNUM:
1365 sprintf(path, "%s%c%s%s", db->name, ESTPATHCHR, ESTAINUMPREF, enc);
1366 break;
1367 default:
1368 free(enc);
1369 est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
1370 return FALSE;
1371 }
1372 free(enc);
1373 domode = DP_OWRITER | DP_OCREAT | DP_OTRUNC;
1374 vomode = VL_OWRITER | VL_OCREAT | VL_OTRUNC;
1375 if(ESTUSEBZIP){
1376 vomode |= VL_OXCOMP;
1377 } else if(ESTUSELZO){
1378 vomode |= VL_OYCOMP;
1379 } else if(ESTUSEZLIB){
1380 vomode |= VL_OZCOMP;
1381 }
1382 err = FALSE;
1383 crdnum = vlcrdnum;
1384 switch(type){
1385 case ESTIDXATTRSTR:
1386 vlcrdnum = ESTVLCRDNUM;
1387 if(!(aidxdb = vlopen(path, vomode, VL_CMPLEX))){
1388 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1389 vlcrdnum = crdnum;
1390 return FALSE;
1391 }
1392 vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1393 vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1394 if(est_db_doc_num(db) > 0){
1395 scores = est_search_uvset(db, &snum, NULL, TRUE);
1396 for(i = 0; i < snum; i++){
1397 if((vbuf = est_db_get_doc_attr(db, scores[i].id, name)) != NULL){
1398 if(!est_aidx_attr_put(aidxdb, scores[i].id, vbuf, strlen(vbuf))){
1399 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1400 db->fatal = TRUE;
1401 err = TRUE;
1402 }
1403 free(vbuf);
1404 }
1405 if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "entering existing attributes");
1406 }
1407 free(scores);
1408 }
1409 break;
1410 case ESTIDXATTRNUM:
1411 vlcrdnum = ESTVLCRDNUM;
1412 if(!(aidxdb = vlopen(path, vomode, est_aidx_numcmp))){
1413 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1414 vlcrdnum = crdnum;
1415 return FALSE;
1416 }
1417 vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
1418 vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
1419 if(est_db_doc_num(db) > 0){
1420 scores = est_search_uvset(db, &snum, NULL, TRUE);
1421 for(i = 0; i < snum; i++){
1422 if((vbuf = est_db_get_doc_attr(db, scores[i].id, name)) != NULL){
1423 if(!est_aidx_attr_put(aidxdb, scores[i].id, vbuf, strlen(vbuf))){
1424 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1425 db->fatal = TRUE;
1426 err = TRUE;
1427 }
1428 free(vbuf);
1429 }
1430 if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "entering existing attributes");
1431 }
1432 free(scores);
1433 }
1434 break;
1435 default:
1436 if(!(aidxdb = dpopen(path, domode, crbnum(db->attrdb) * ESTAIBDIAM))){
1437 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1438 vlcrdnum = crdnum;
1439 return FALSE;
1440 }
1441 dpsetfbpsiz(aidxdb, ESTAIDXDPFBP);
1442 if(est_db_doc_num(db) > 0){
1443 scores = est_search_uvset(db, &snum, NULL, TRUE);
1444 for(i = 0; i < snum; i++){
1445 if((vbuf = est_db_get_doc_attr(db, scores[i].id, name)) != NULL){
1446 if(!est_aidx_seq_put(aidxdb, scores[i].id, vbuf, strlen(vbuf))){
1447 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1448 db->fatal = TRUE;
1449 err = TRUE;
1450 }
1451 free(vbuf);
1452 }
1453 if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "entering existing attributes");
1454 }
1455 free(scores);
1456 }
1457 break;
1458 }
1459 vlcrdnum = crdnum;
1460 attridx.db = aidxdb;
1461 attridx.type = type;
1462 cbmapput(db->aidxs, name, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
1463 return err ? FALSE : TRUE;
1464 }
1465
1466
1467 /* Flush index words in the cache of a database. */
est_db_flush(ESTDB * db,int max)1468 int est_db_flush(ESTDB *db, int max){
1469 ESTATTRIDX *attridx;
1470 CBMAP *ids;
1471 CBLIST *keys;
1472 CBDATUM *nval;
1473 const char *kbuf, *vbuf, *rp, *pv, *ep;
1474 char *tbuf, *wp, numbuf[ESTNUMBUFSIZ];
1475 int i, j, inc, err, ksiz, vsiz, rnum, len, id, sum, cid, vnum, lid, dnum, tsiz, vstep;
1476 assert(db);
1477 if(!dpwritable(db->metadb)){
1478 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1479 return FALSE;
1480 }
1481 if(max < 1 || max >= INT_MAX){
1482 if(!est_db_write_meta(db)) err = TRUE;
1483 if(!dpmemflush(db->metadb)) err = TRUE;
1484 if(!crmemflush(db->attrdb)) err = TRUE;
1485 if(!crmemflush(db->textdb)) err = TRUE;
1486 if(!crmemflush(db->kwddb)) err = TRUE;
1487 if(!vlmemflush(db->listdb)) err = TRUE;
1488 cbmapiterinit(db->aidxs);
1489 while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
1490 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1491 switch(attridx->type){
1492 case ESTIDXATTRSTR:
1493 case ESTIDXATTRNUM:
1494 if(!vlmemflush(attridx->db)) err = TRUE;
1495 break;
1496 default:
1497 if(!dpmemflush(attridx->db)) err = TRUE;
1498 break;
1499 }
1500 }
1501 }
1502 if(cbmaprnum(db->idxcc) < 1 && cbmaprnum(db->auxcc) < 1 && cbmaprnum(db->outcc) < 1)
1503 return TRUE;
1504 db->flsflag = TRUE;
1505 db->intflag = FALSE;
1506 inc = est_db_used_cache_size(db) > db->icmax;
1507 err = FALSE;
1508 CB_LISTOPEN(keys);
1509 cbmapiterinit(db->idxcc);
1510 while((kbuf = cbmapiternext(db->idxcc, &ksiz)) != NULL){
1511 CB_LISTPUSH(keys, kbuf, ksiz);
1512 }
1513 rnum = CB_LISTNUM(keys);
1514 cblistsort(keys);
1515 if(max > 0){
1516 while(CB_LISTNUM(keys) > max){
1517 CB_LISTDROP(keys);
1518 }
1519 }
1520 for(i = 0; i < CB_LISTNUM(keys); i++){
1521 kbuf = CB_LISTVAL2(keys, i, ksiz);
1522 vbuf = cbmapget(db->idxcc, kbuf, ksiz, &vsiz);
1523 if(!est_idx_add(db->idxdb, kbuf, ksiz, vbuf, vsiz, db->smode) ||
1524 (!vlput(db->fwmdb, kbuf, ksiz, "", 0, VL_DKEEP) && dpecode != DP_EKEEP)){
1525 err = TRUE;
1526 break;
1527 }
1528 cbmapout(db->idxcc, kbuf, ksiz);
1529 db->icsiz -= vsiz;
1530 if(i % ESTCCCBFREQ == 0){
1531 est_db_inform(db, "flushing index words");
1532 if(est_idx_size_current(db->idxdb) >= ESTIDXDBMAX){
1533 est_db_inform(db, "adding a new database file");
1534 est_idx_increment(db->idxdb);
1535 inc = FALSE;
1536 }
1537 }
1538 if(max > 0 && db->intflag && i > 0 && i % ESTCCIRSLOT == 0) break;
1539 }
1540 CB_LISTCLOSE(keys);
1541 if(cbmaprnum(db->idxcc) < 1){
1542 cbmapclose(db->idxcc);
1543 db->idxcc = cbmapopenex(rnum > ESTIDXCCBNUM ? rnum * 1.5 : ESTIDXCCBNUM);
1544 if(cbmaprnum(db->auxcc) > 0){
1545 CB_LISTOPEN(keys);
1546 cbmapiterinit(db->auxcc);
1547 while((kbuf = cbmapiternext(db->auxcc, &ksiz)) != NULL){
1548 CB_LISTPUSH(keys, kbuf, ksiz);
1549 }
1550 cblistsort(keys);
1551 for(i = 0; i < CB_LISTNUM(keys); i++){
1552 kbuf = CB_LISTVAL2(keys, i, ksiz);
1553 vbuf = cbmapget(db->auxcc, kbuf, ksiz, &vsiz);
1554 if(!vlput(db->auxdb, kbuf, ksiz, vbuf, vsiz, VL_DCAT)){
1555 err = TRUE;
1556 break;
1557 }
1558 len = sprintf(numbuf, "%d", vlvsiz(db->auxdb, kbuf, ksiz) / (int)(sizeof(int) * 2));
1559 if(!vlput(db->xfmdb, kbuf, ksiz, numbuf, len, VL_DOVER)){
1560 err = TRUE;
1561 break;
1562 }
1563 cbmapout(db->auxcc, kbuf, ksiz);
1564 db->icsiz -= vsiz;
1565 if(i % ESTCCCBFREQ == 0) est_db_inform(db, "flushing auxiliary keywords");
1566 if(max > 0 && db->intflag && i > 0 && i % ESTCCIRSLOT == 0) break;
1567 }
1568 CB_LISTCLOSE(keys);
1569 if(cbmaprnum(db->auxcc) < 1){
1570 cbmapclose(db->auxcc);
1571 db->auxcc = cbmapopenex(ESTAUXCCBNUM);
1572 }
1573 }
1574 }
1575 if(max < 1 && cbmaprnum(db->outcc) > 0){
1576 ids = cbmapopen();
1577 CB_LISTOPEN(keys);
1578 cbmapiterinit(db->outcc);
1579 while((kbuf = cbmapiternext(db->outcc, &ksiz)) != NULL){
1580 if(*kbuf == '\t'){
1581 id = atoi(kbuf + 1);
1582 cbmapput(ids, (char *)&id, sizeof(int), "", 0, FALSE);
1583 } else {
1584 CB_LISTPUSH(keys, kbuf, ksiz);
1585 }
1586 }
1587 cblistsort(keys);
1588 dnum = est_idx_dnum(db->idxdb);
1589 for(i = 0; i < CB_LISTNUM(keys); i++){
1590 kbuf = CB_LISTVAL2(keys, i, ksiz);
1591 if(kbuf[0] == ' '){
1592 if((tbuf = vlget(db->auxdb, kbuf + 1, ksiz - 1, &tsiz)) != NULL){
1593 rp = tbuf;
1594 wp = tbuf;
1595 ep = tbuf + tsiz;
1596 while(rp < ep){
1597 if(!cbmapget(ids, rp, sizeof(int), NULL)){
1598 memmove(wp, rp, sizeof(int) * 2);
1599 wp += sizeof(int) * 2;
1600 }
1601 rp += sizeof(int) * 2;
1602 }
1603 if(wp > tbuf){
1604 if(!vlput(db->auxdb, kbuf + 1, ksiz - 1, tbuf, wp - tbuf, VL_DOVER)) err = TRUE;
1605 len = sprintf(numbuf, "%d", (int)((wp - tbuf) / (sizeof(int) * 2)));
1606 if(!vlput(db->xfmdb, kbuf + 1, ksiz - 1, numbuf, len, VL_DOVER)) err = TRUE;
1607 } else {
1608 if(!vlout(db->auxdb, kbuf + 1, ksiz - 1)) err = TRUE;
1609 if(!vlout(db->xfmdb, kbuf + 1, ksiz - 1) && dpecode != DP_ENOITEM) err = TRUE;
1610 }
1611 free(tbuf);
1612 }
1613 } else {
1614 sum = 0;
1615 for(j = 0; j < dnum; j++){
1616 if((vbuf = est_idx_get_one(db->idxdb, j, kbuf, ksiz, &tsiz)) != NULL){
1617 CB_DATUMOPEN(nval);
1618 rp = vbuf;
1619 ep = vbuf + tsiz;
1620 lid = 0;
1621 cid = 0;
1622 while(rp < ep){
1623 EST_READ_VNUMBUF(rp, vnum, vstep);
1624 cid += vnum + 1;
1625 rp += vstep;
1626 pv = rp;
1627 switch(db->smode){
1628 case ESTDFSCVOID:
1629 break;
1630 default:
1631 rp++;
1632 break;
1633 case ESTDFSCINT:
1634 case ESTDFSCASIS:
1635 rp += sizeof(int);
1636 break;
1637 }
1638 while(*rp != 0x0){
1639 rp += 2;
1640 }
1641 rp++;
1642 if(!cbmapget(ids, (char *)&cid, sizeof(int), NULL)){
1643 EST_SET_VNUMBUF(vstep, numbuf, cid - lid - 1);
1644 CB_DATUMCAT(nval, numbuf, vstep);
1645 CB_DATUMCAT(nval, pv, rp - pv);
1646 lid = cid;
1647 }
1648 }
1649 if(!est_idx_put_one(db->idxdb, j, kbuf, ksiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval)))
1650 err = TRUE;
1651 sum += CB_DATUMSIZE(nval);
1652 CB_DATUMCLOSE(nval);
1653 }
1654 }
1655 if(sum < 1 && !vlout(db->fwmdb, kbuf, ksiz) && dpecode != DP_ENOITEM) err = TRUE;
1656 }
1657 cbmapout(db->outcc, kbuf, ksiz);
1658 if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
1659 if(max > 0 && db->intflag && i > 0 && i % ESTCCIRSLOT == 0) break;
1660 }
1661 if(cbmaprnum(db->outcc) <= cbmaprnum(ids)){
1662 cbmapclose(db->outcc);
1663 db->outcc = cbmapopenex(ESTOUTCCBNUM);
1664 }
1665 CB_LISTCLOSE(keys);
1666 cbmapclose(ids);
1667 }
1668 cbmapclose(db->keycc);
1669 db->keycc = cbmapopenex(ESTKEYCCMNUM + 1);
1670 db->kcmnum = ESTKEYCCMNUM;
1671 if(!(max > 0 && db->intflag) && inc && est_idx_size_current(db->idxdb) >= ESTIDXDBMIN){
1672 est_db_inform(db, "adding a new database file");
1673 est_idx_increment(db->idxdb);
1674 }
1675 if(max < 1 || max >= INT_MAX){
1676 if(!vlmemflush(db->auxdb)) err = TRUE;
1677 if(!est_idx_memflush(db->idxdb)) err = TRUE;
1678 }
1679 if(max > 0 && db->intflag) est_db_inform(db, "flushing interrupted");
1680 db->flsflag = FALSE;
1681 db->intflag = FALSE;
1682 if(err){
1683 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1684 db->fatal = TRUE;
1685 return FALSE;
1686 }
1687 return TRUE;
1688 }
1689
1690
1691 /* Synchronize updating contents of a database. */
est_db_sync(ESTDB * db)1692 int est_db_sync(ESTDB *db){
1693 ESTATTRIDX *attridx;
1694 const char *kbuf;
1695 int err;
1696 assert(db);
1697 if(!dpwritable(db->metadb)){
1698 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1699 return FALSE;
1700 }
1701 err = FALSE;
1702 if(!est_db_flush(db, -1) || !est_db_write_meta(db)) err = TRUE;
1703 est_db_inform(db, "synchronizing the database for meta information");
1704 if(!dpsync(db->metadb)) err = TRUE;
1705 est_db_inform(db, "synchronizing the inverted index");
1706 if(!est_idx_sync(db->idxdb)) err = TRUE;
1707 est_db_inform(db, "synchronizing the database for forward matching");
1708 if(!vlsync(db->fwmdb)) err = TRUE;
1709 est_db_inform(db, "synchronizing the database for attributes");
1710 if(!crsync(db->attrdb)) err = TRUE;
1711 est_db_inform(db, "synchronizing the database for texts");
1712 if(!crsync(db->textdb)) err = TRUE;
1713 est_db_inform(db, "synchronizing the database for keywords");
1714 if(!crsync(db->kwddb)) err = TRUE;
1715 est_db_inform(db, "synchronizing the database for document list");
1716 if(!vlsync(db->listdb)) err = TRUE;
1717 if(cbmaprnum(db->aidxs) > 0){
1718 est_db_inform(db, "synchronizing the databases for attribute narrowing");
1719 cbmapiterinit(db->aidxs);
1720 while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
1721 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1722 switch(attridx->type){
1723 case ESTIDXATTRSTR:
1724 case ESTIDXATTRNUM:
1725 if(!vlsync(attridx->db)) err = TRUE;
1726 break;
1727 default:
1728 if(!dpsync(attridx->db)) err = TRUE;
1729 break;
1730 }
1731 }
1732 }
1733 if(err){
1734 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1735 db->fatal = TRUE;
1736 }
1737 return err ? FALSE : TRUE;
1738 }
1739
1740
1741 /* Optimize a database. */
est_db_optimize(ESTDB * db,int options)1742 int est_db_optimize(ESTDB *db, int options){
1743 CBMAP *dmap;
1744 CBLIST *words;
1745 CBDATUM *nval;
1746 ESTATTRIDX *attridx;
1747 const char *word, *rp, *pv, *ep;
1748 char *kbuf, *vbuf, *wp, numbuf[ESTNUMBUFSIZ];
1749 int i, err, id, ksiz, vsiz, wsiz, len, vstep;
1750 assert(db);
1751 if(!dpwritable(db->metadb)){
1752 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1753 return FALSE;
1754 }
1755 if(!est_db_flush(db, -1)) return FALSE;
1756 err = FALSE;
1757 if(!(options & ESTOPTNOPURGE)){
1758 dmap = cbmapopenex(vlrnum(db->listdb) + 1);
1759 vlcurfirst(db->listdb);
1760 while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
1761 id = atoi(vbuf);
1762 cbmapput(dmap, (char *)&id, sizeof(int), "", 0, FALSE);
1763 free(vbuf);
1764 vlcurnext(db->listdb);
1765 }
1766 CB_LISTOPEN(words);
1767 vlcurfirst(db->fwmdb);
1768 while((kbuf = vlcurkey(db->fwmdb, &ksiz)) != NULL){
1769 CB_LISTPUSHBUF(words, kbuf, ksiz);
1770 vlcurnext(db->fwmdb);
1771 }
1772 for(i = 0; i < CB_LISTNUM(words); i++){
1773 if(i % (ESTIDXDBLRM * 4) == 0) est_idx_set_current(db->idxdb);
1774 word = CB_LISTVAL2(words, i, wsiz);
1775 vbuf = est_idx_scan(db->idxdb, word, wsiz, &vsiz, db->smode);
1776 CB_DATUMOPEN(nval);
1777 rp = vbuf;
1778 ep = vbuf + vsiz;
1779 while(rp < ep){
1780 pv = rp;
1781 EST_READ_VNUMBUF(rp, id, vstep);
1782 rp += vstep;
1783 switch(db->smode){
1784 case ESTDFSCVOID:
1785 break;
1786 default:
1787 rp++;
1788 break;
1789 case ESTDFSCINT:
1790 case ESTDFSCASIS:
1791 rp += sizeof(int);
1792 break;
1793 }
1794 while(*rp != 0x00){
1795 rp += 2;
1796 }
1797 rp++;
1798 if(cbmapget(dmap, (char *)&id, sizeof(int), NULL)) CB_DATUMCAT(nval, pv, rp - pv);
1799 }
1800 if(!est_idx_out(db->idxdb, word, wsiz)) err = TRUE;
1801 if(CB_DATUMSIZE(nval) > 0){
1802 if(!est_idx_add(db->idxdb, word, wsiz, CB_DATUMPTR(nval), CB_DATUMSIZE(nval), db->smode))
1803 err = TRUE;
1804 } else {
1805 if(!vlout(db->fwmdb, word, wsiz)) err = TRUE;
1806 }
1807 CB_DATUMCLOSE(nval);
1808 free(vbuf);
1809 free(kbuf);
1810 if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable keys");
1811 }
1812 CB_LISTCLOSE(words);
1813 CB_LISTOPEN(words);
1814 vlcurfirst(db->auxdb);
1815 while((kbuf = vlcurkey(db->auxdb, &ksiz)) != NULL){
1816 CB_LISTPUSHBUF(words, kbuf, ksiz);
1817 vlcurnext(db->auxdb);
1818 }
1819 for(i = 0; i < CB_LISTNUM(words); i++){
1820 word = CB_LISTVAL2(words, i, wsiz);
1821 if(!(vbuf = vlget(db->auxdb, word, wsiz, &vsiz))) continue;
1822 rp = vbuf;
1823 wp = vbuf;
1824 ep = vbuf + vsiz;
1825 while(rp < ep){
1826 if(cbmapget(dmap, rp, sizeof(int), NULL)){
1827 memmove(wp, rp, sizeof(int) * 2);
1828 wp += sizeof(int) * 2;
1829 }
1830 rp += sizeof(int) * 2;
1831 }
1832 if(wp > vbuf){
1833 if(!vlput(db->auxdb, word, wsiz, vbuf, wp - vbuf, VL_DOVER)) err = TRUE;
1834 len = sprintf(numbuf, "%d", (int)((wp - vbuf) / (sizeof(int) * 2)));
1835 if(!vlput(db->xfmdb, word, wsiz, numbuf, len, VL_DOVER)) err = TRUE;
1836 } else {
1837 if(!vlout(db->auxdb, word, wsiz)) err = TRUE;
1838 if(!vlout(db->xfmdb, word, wsiz) && dpecode != DP_ENOITEM) err = TRUE;
1839 }
1840 free(vbuf);
1841 if(i % ESTCCCBFREQ == 0) est_db_inform(db, "cleaning dispensable auxiliary keys");
1842 }
1843 CB_LISTCLOSE(words);
1844 cbmapclose(dmap);
1845 }
1846 if(!(options & ESTOPTNODBOPT)){
1847 est_db_inform(db, "optimizing the inverted index");
1848 if(!est_idx_optimize(db->idxdb)) err = TRUE;
1849 est_db_inform(db, "optimizing the database for forward matching");
1850 if(!vloptimize(db->fwmdb)) err = TRUE;
1851 est_db_inform(db, "optimizing the auxiliary index");
1852 if(!vloptimize(db->auxdb)) err = TRUE;
1853 est_db_inform(db, "optimizing the database for auxiliary forward matching");
1854 if(!vloptimize(db->xfmdb)) err = TRUE;
1855 est_db_inform(db, "optimizing the database for attributes");
1856 if(!croptimize(db->attrdb, -1)) err = TRUE;
1857 est_db_inform(db, "optimizing the database for texts");
1858 if(!croptimize(db->textdb, -1)) err = TRUE;
1859 est_db_inform(db, "optimizing the database for keywords");
1860 if(!croptimize(db->kwddb, -1)) err = TRUE;
1861 est_db_inform(db, "optimizing the database for document list");
1862 if(!vloptimize(db->listdb)) err = TRUE;
1863 if(cbmaprnum(db->aidxs) > 0){
1864 est_db_inform(db, "optimizing the databases for attribute narrowing");
1865 cbmapiterinit(db->aidxs);
1866 while((rp = cbmapiternext(db->aidxs, NULL)) != NULL){
1867 attridx = (ESTATTRIDX *)cbmapiterval(rp, NULL);
1868 switch(attridx->type){
1869 case ESTIDXATTRSTR:
1870 case ESTIDXATTRNUM:
1871 if(!vloptimize(attridx->db)) err = TRUE;
1872 break;
1873 default:
1874 if(!dpoptimize(attridx->db, -1)) err = TRUE;
1875 break;
1876 }
1877 }
1878 }
1879 }
1880 cbmapclose(db->rescc);
1881 db->rescc = cbmapopenex(db->rcmnum * 2 + 1);
1882 if(err){
1883 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1884 db->fatal = TRUE;
1885 }
1886 return err ? FALSE : TRUE;
1887 }
1888
1889
1890 /* Merge another database. */
est_db_merge(ESTDB * db,const char * name,int options)1891 int est_db_merge(ESTDB *db, const char *name, int options){
1892 ESTDB *tgdb;
1893 ESTATTRIDX *attridx;
1894 CBMAP *idmap, *seqmap, *attrs;
1895 CBLIST *words;
1896 CBDATUM *rbuf;
1897 const char *kbuf, *vbuf, *rp, *ep, *sp;
1898 char *tbuf, numbuf[ESTNUMBUFSIZ];
1899 int i, j, ecode, err, ksiz, vsiz, tsiz, oid, nid, len, vstep, anum, *ary;
1900 assert(db && name);
1901 if(!dpwritable(db->metadb)){
1902 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
1903 return FALSE;
1904 }
1905 est_db_inform(db, "opening the target database");
1906 if(!(tgdb = est_db_open(name, ESTDBREADER, &ecode))){
1907 est_set_ecode(&(db->ecode), ecode, __LINE__);
1908 return FALSE;
1909 }
1910 if(dpgetflags(db->metadb) != dpgetflags(tgdb->metadb)){
1911 est_db_close(tgdb, &ecode);
1912 est_set_ecode(&(db->ecode), ESTEMISC, __LINE__);
1913 return FALSE;
1914 }
1915 err = FALSE;
1916 idmap = cbmapopenex(est_db_doc_num(tgdb) + 1);
1917 vlcurfirst(tgdb->listdb);
1918 for(i = 0; (kbuf = vlcurkeycache(tgdb->listdb, &ksiz)) != NULL; i++){
1919 if((vbuf = vlgetcache(db->listdb, kbuf, ksiz, NULL)) != NULL &&
1920 !est_db_out_doc(db, atoi(vbuf), options & ESTMGCLEAN ? ESTODCLEAN : 0)) err = TRUE;
1921 oid = atoi(vlcurvalcache(tgdb->listdb, NULL));
1922 db->dseq++;
1923 db->dnum++;
1924 cbmapput(idmap, (char *)&oid, sizeof(int), (char *)&(db->dseq), sizeof(int), FALSE);
1925 vlcurnext(tgdb->listdb);
1926 if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "calculating ID mapping");
1927 }
1928 if(!est_db_flush(db, -1)){
1929 cbmapclose(idmap);
1930 est_db_close(tgdb, &ecode);
1931 return FALSE;
1932 }
1933 cbmapiterinit(idmap);
1934 for(i = 0; (kbuf = cbmapiternext(idmap, &ksiz)) != NULL; i++){
1935 CB_MAPITERVAL(vbuf, kbuf, vsiz);
1936 oid = *(int *)kbuf;
1937 nid = *(int *)vbuf;
1938 if((tbuf = est_crget(tgdb->attrdb, tgdb->zmode, oid, &tsiz)) != NULL){
1939 attrs = cbmapload(tbuf, tsiz);
1940 len = sprintf(numbuf, "%d", nid);
1941 cbmapput(attrs, ESTDATTRID, -1, numbuf, len, TRUE);
1942 free(tbuf);
1943 tbuf = cbmapdump(attrs, &tsiz);
1944 if((vbuf = cbmapget(attrs, ESTDATTRURI, -1, &vsiz)) != NULL){
1945 if(!vlput(db->listdb, vbuf, vsiz, numbuf, len, VL_DKEEP)){
1946 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1947 db->fatal = TRUE;
1948 err = TRUE;
1949 }
1950 } else {
1951 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1952 db->fatal = TRUE;
1953 err = TRUE;
1954 }
1955 if(!est_crput(db->attrdb, db->zmode, nid, tbuf, tsiz, CR_DKEEP)){
1956 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1957 db->fatal = TRUE;
1958 err = TRUE;
1959 }
1960 if(cbmaprnum(db->aidxs) > 0){
1961 cbmapiterinit(db->aidxs);
1962 while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
1963 if(!(vbuf = cbmapget(attrs, kbuf, ksiz, &vsiz))) continue;
1964 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
1965 switch(attridx->type){
1966 case ESTIDXATTRSTR:
1967 case ESTIDXATTRNUM:
1968 if(!est_aidx_attr_put(attridx->db, nid, vbuf, vsiz)){
1969 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1970 db->fatal = TRUE;
1971 err = TRUE;
1972 }
1973 break;
1974 default:
1975 if(!est_aidx_seq_put(attridx->db, nid, vbuf, vsiz)){
1976 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1977 db->fatal = TRUE;
1978 err = TRUE;
1979 }
1980 break;
1981 }
1982 }
1983 }
1984 cbmapclose(attrs);
1985 free(tbuf);
1986 } else {
1987 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1988 err = TRUE;
1989 }
1990 if((tbuf = est_crget(tgdb->textdb, tgdb->zmode, oid, &tsiz)) != NULL){
1991 if(!est_crput(db->textdb, db->zmode, nid, tbuf, tsiz, CR_DKEEP)){
1992 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1993 db->fatal = TRUE;
1994 err = TRUE;
1995 }
1996 free(tbuf);
1997 } else {
1998 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
1999 err = TRUE;
2000 }
2001 if((tbuf = est_crget(tgdb->kwddb, tgdb->zmode, oid, &tsiz)) != NULL){
2002 if(!est_crput(db->kwddb, db->zmode, nid, tbuf, tsiz, CR_DKEEP)){
2003 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2004 db->fatal = TRUE;
2005 err = TRUE;
2006 }
2007 free(tbuf);
2008 } else if(dpecode != DP_ENOITEM){
2009 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2010 db->fatal = TRUE;
2011 err = TRUE;
2012 }
2013 if(i % (ESTCCCBFREQ / 10) == 0) est_db_inform(db, "importing documents");
2014 }
2015 CB_LISTOPEN(words);
2016 vlcurfirst(tgdb->fwmdb);
2017 while((kbuf = vlcurkeycache(tgdb->fwmdb, &ksiz)) != NULL){
2018 CB_LISTPUSH(words, kbuf, ksiz);
2019 vlcurnext(tgdb->fwmdb);
2020 }
2021 for(i = 0; i < CB_LISTNUM(words); i++){
2022 kbuf = CB_LISTVAL2(words, i, ksiz);
2023 seqmap = cbmapopenex(tsiz / sizeof(int) + 1);
2024 tbuf = est_idx_scan(tgdb->idxdb, kbuf, ksiz, &tsiz, tgdb->smode);
2025 rp = tbuf;
2026 ep = tbuf + tsiz;
2027 while(rp < ep){
2028 EST_READ_VNUMBUF(rp, oid, vstep);
2029 rp += vstep;
2030 vbuf = cbmapget(idmap, (char *)&oid, sizeof(int), NULL);
2031 nid = vbuf ? *(int *)vbuf : -1;
2032 sp = rp;
2033 switch(tgdb->smode){
2034 case ESTDFSCVOID:
2035 break;
2036 default:
2037 rp++;
2038 break;
2039 case ESTDFSCINT:
2040 case ESTDFSCASIS:
2041 rp += sizeof(int);
2042 break;
2043 }
2044 while(*rp != 0x00){
2045 rp += 2;
2046 }
2047 rp++;
2048 if(nid > 0) cbmapputcat(seqmap, (char *)&nid, sizeof(int), sp, rp - sp);
2049 }
2050 anum = cbmaprnum(seqmap);
2051 CB_MALLOC(ary, anum * sizeof(int) + 1);
2052 cbmapiterinit(seqmap);
2053 for(j = 0; (rp = cbmapiternext(seqmap, NULL)) != NULL; j++){
2054 ary[j] = *(int *)rp;
2055 }
2056 qsort(ary, anum, sizeof(int), est_int_compare);
2057 CB_DATUMOPEN(rbuf);
2058 for(j = 0; j < anum; j++){
2059 EST_SET_VNUMBUF(vstep, numbuf, ary[j]);
2060 CB_DATUMCAT(rbuf, numbuf, vstep);
2061 vbuf = cbmapget(seqmap, (char *)(ary + j), sizeof(int), &vsiz);
2062 CB_DATUMCAT(rbuf, vbuf, vsiz);
2063 }
2064 if(!est_idx_add(db->idxdb, kbuf, ksiz, CB_DATUMPTR(rbuf), CB_DATUMSIZE(rbuf), db->smode)){
2065 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2066 db->fatal = TRUE;
2067 err = TRUE;
2068 }
2069 CB_DATUMCLOSE(rbuf);
2070 free(ary);
2071 cbmapclose(seqmap);
2072 free(tbuf);
2073 vlput(db->fwmdb, kbuf, ksiz, "", 0, VL_DKEEP);
2074 if(i % ESTCCCBFREQ == 0){
2075 est_db_inform(db, "importing words");
2076 if(est_idx_size_current(db->idxdb) >= ESTIDXDBMAX){
2077 est_db_inform(db, "adding a new database file");
2078 est_idx_increment(db->idxdb);
2079 }
2080 }
2081 }
2082 CB_LISTCLOSE(words);
2083 CB_LISTOPEN(words);
2084 vlcurfirst(tgdb->auxdb);
2085 while((kbuf = vlcurkeycache(tgdb->auxdb, &ksiz)) != NULL){
2086 CB_LISTPUSH(words, kbuf, ksiz);
2087 vlcurnext(tgdb->auxdb);
2088 }
2089 for(i = 0; i < CB_LISTNUM(words); i++){
2090 kbuf = CB_LISTVAL2(words, i, ksiz);
2091 vbuf = vlgetcache(tgdb->auxdb, kbuf, ksiz, &vsiz);
2092 CB_DATUMOPEN(rbuf);
2093 rp = vbuf;
2094 ep = vbuf + vsiz;
2095 while(rp < ep){
2096 oid = *(int *)rp;
2097 vbuf = cbmapget(idmap, rp, sizeof(int), NULL);
2098 nid = vbuf ? *(int *)vbuf : -1;
2099 if(nid > 0){
2100 CB_DATUMCAT(rbuf, (char *)&nid, sizeof(int));
2101 CB_DATUMCAT(rbuf, rp + sizeof(int), sizeof(int));
2102 }
2103 rp += sizeof(int) * 2;
2104 }
2105 if(!vlput(db->auxdb, kbuf, ksiz, CB_DATUMPTR(rbuf), CB_DATUMSIZE(rbuf), VL_DCAT)){
2106 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2107 db->fatal = TRUE;
2108 err = TRUE;
2109 }
2110 CB_DATUMCLOSE(rbuf);
2111 anum = 0;
2112 if((vbuf = vlgetcache(tgdb->xfmdb, kbuf, ksiz, NULL)) != NULL) anum += atoi(vbuf);
2113 len = sprintf(numbuf, "%d", anum);
2114 vlput(db->xfmdb, kbuf, ksiz, numbuf, len, VL_DOVER);
2115 if(i % ESTCCCBFREQ == 0) est_db_inform(db, "importing auxiliary words");
2116 }
2117 CB_LISTCLOSE(words);
2118 cbmapclose(idmap);
2119 est_db_inform(db, "closing the target database");
2120 if(!est_db_close(tgdb, &ecode)){
2121 est_set_ecode(&(db->ecode), ecode, __LINE__);
2122 return FALSE;
2123 }
2124 if(!est_db_flush(db, -1)) err = TRUE;
2125 return err ? FALSE : TRUE;
2126 }
2127
2128
2129 /* Add a document to a database. */
est_db_put_doc(ESTDB * db,ESTDOC * doc,int options)2130 int est_db_put_doc(ESTDB *db, ESTDOC *doc, int options){
2131 CBMAP *ocmap, *fmap, *qmap;
2132 CBLIST *words;
2133 CBDATUM *ocbuf;
2134 ESTATTRIDX *attridx;
2135 md5_state_t ms;
2136 const char *uri, *ndig, *text, *word, *fnext, *snext, *kbuf, *vbuf;
2137 unsigned char junc[2], c;
2138 char dobuf[32], dsbuf[64], *wp, *odig, wbuf[ESTWORDMAXLEN+3], *sbuf, nbuf[ESTNUMBUFSIZ];
2139 int i, j, id, err, wnum, wsiz, fnsiz, snsiz, *np, score, num, ksiz, vsiz, ssiz;
2140 double tune, weight;
2141 assert(db && doc);
2142 if(!dpwritable(db->metadb)){
2143 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
2144 return FALSE;
2145 }
2146 if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL)) || uri[0] == '\0'){
2147 est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
2148 return FALSE;
2149 }
2150 if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
2151 if(!(ndig = cbmapget(doc->attrs, ESTDATTRDIGEST, -1, NULL))){
2152 md5_init(&ms);
2153 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
2154 vbuf = CB_LISTVAL2(doc->dtexts, i, vsiz);
2155 md5_append(&ms, (md5_byte_t *)vbuf, vsiz);
2156 md5_append(&ms, (md5_byte_t *)"\n", 1);
2157 }
2158 if((vbuf = cbmapget(doc->attrs, "", 0, &vsiz)) != NULL){
2159 md5_append(&ms, (md5_byte_t *)"\t", 1);
2160 md5_append(&ms, (md5_byte_t *)vbuf, vsiz);
2161 md5_append(&ms, (md5_byte_t *)"\n", 1);
2162 }
2163 md5_finish(&ms, (md5_byte_t *)dobuf);
2164 wp = dsbuf;
2165 for(i = 0; i < 16; i++){
2166 wp += sprintf(wp, "%02x", ((unsigned char *)dobuf)[i]);
2167 }
2168 ndig = dsbuf;
2169 cbmapput(doc->attrs, ESTDATTRDIGEST, -1, ndig, -1, FALSE);
2170 }
2171 if((id = est_db_uri_to_id(db, uri)) > 0){
2172 if((odig = est_db_get_doc_attr(db, id, ESTDATTRDIGEST)) != NULL){
2173 if(!strcmp(odig, ndig)){
2174 free(odig);
2175 doc->id = id;
2176 sprintf(nbuf, "%d", id);
2177 cbmapput(doc->attrs, ESTDATTRID, -1, nbuf, -1, TRUE);
2178 return est_db_edit_doc(db, doc);
2179 }
2180 free(odig);
2181 }
2182 if(!est_db_out_doc(db, id, (options & ESTPDCLEAN) ? ESTODCLEAN : 0)) return FALSE;
2183 }
2184 doc->id = ++(db->dseq);
2185 sprintf(nbuf, "%d", doc->id);
2186 cbmapput(doc->attrs, ESTDATTRID, -1, nbuf, -1, TRUE);
2187 ocmap = cbmapopen();
2188 fmap = cbmapopen();
2189 qmap = cbmapopen();
2190 wnum = 0;
2191 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
2192 if(i < 0){
2193 if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
2194 } else {
2195 text = CB_LISTVAL(doc->dtexts, i);
2196 }
2197 CB_LISTOPEN(words);
2198 switch(db->amode){
2199 case ESTDFPERFNG:
2200 est_break_text_perfng(text, words, FALSE, TRUE);
2201 break;
2202 case ESTDFCHRCAT:
2203 est_break_text_chrcat(text, words, FALSE);
2204 break;
2205 default:
2206 est_break_text(text, words, FALSE, TRUE);
2207 break;
2208 }
2209 wnum += CB_LISTNUM(words);
2210 for(j = 0; j < CB_LISTNUM(words); j++){
2211 word = CB_LISTVAL2(words, j, wsiz);
2212 if(wsiz > ESTWORDMAXLEN) continue;
2213 fnext = cblistval(words, j + 1, &fnsiz);
2214 snext = cblistval(words, j + 2, &snsiz);
2215 junc[0] = fnext ? dpinnerhash(fnext, fnsiz) % ESTJHASHNUM + 1: 0xff;
2216 junc[1] = snext ? dpouterhash(snext, snsiz) % ESTJHASHNUM + 1: 0xff;
2217 memcpy(wbuf, word, wsiz);
2218 memcpy(wbuf + wsiz, "\t", 1);
2219 memcpy(wbuf + wsiz + 1, junc, 2);
2220 np = (int *)cbmapget(fmap, word, wsiz, NULL);
2221 num = np ? *(int *)np : 0;
2222 num += ESTOCPOINT;
2223 cbmapput(fmap, word, wsiz, (char *)&num, sizeof(int), TRUE);
2224 if(cbmapput(qmap, wbuf, wsiz + 3, "", 0, FALSE))
2225 cbmapputcat(ocmap, word, wsiz, (char *)junc, fnext ? 2 : 0);
2226 }
2227 CB_LISTCLOSE(words);
2228 }
2229 score = (vbuf = cbmapget(doc->attrs, "\t", 1, NULL)) ? atoi(vbuf) : -1;
2230 weight = 1.0;
2231 if(score < 0 && (options & ESTPDWEIGHT) &&
2232 (vbuf = cbmapget(doc->attrs, ESTDATTRWEIGHT, -1, NULL)) != NULL){
2233 weight = strtod(vbuf, NULL);
2234 weight = weight >= 0.01 ? weight : 0.01;
2235 }
2236 tune = sqrt(wnum + 128) / 16.0 / weight;
2237 cbmapiterinit(ocmap);
2238 while((kbuf = cbmapiternext(ocmap, &ksiz)) != NULL){
2239 CB_MAPITERVAL(vbuf, kbuf, vsiz);
2240 if(vsiz > 2) qsort((void *)vbuf, vsiz / 2, 2, est_short_compare);
2241 CB_DATUMOPEN(ocbuf);
2242 EST_SET_VNUMBUF(wsiz, wbuf, doc->id);
2243 CB_DATUMCAT(ocbuf, wbuf, wsiz);
2244 switch(db->smode){
2245 case ESTDFSCVOID:
2246 break;
2247 default:
2248 num = score < 0 ? *(int *)cbmapget(fmap, kbuf, ksiz, NULL) / tune : score;
2249 if(num >= 0x80) num += (0x80 - num) * 0.75;
2250 if(num >= 0xc0) num += (0xc0 - num) * 0.75;
2251 c = num < 0xff ? num : 0xff;
2252 CB_DATUMCAT(ocbuf, (char *)&c, 1);
2253 break;
2254 case ESTDFSCINT:
2255 case ESTDFSCASIS:
2256 num = score < 0 ? *(int *)cbmapget(fmap, kbuf, ksiz, NULL) * 10 / tune : score;
2257 CB_DATUMCAT(ocbuf, (char *)&num, sizeof(int));
2258 break;
2259 }
2260 CB_DATUMCAT(ocbuf, vbuf, vsiz);
2261 c = 0x00;
2262 CB_DATUMCAT(ocbuf, (char *)&c, 1);
2263 cbmapputcat(db->idxcc, kbuf, ksiz, CB_DATUMPTR(ocbuf), CB_DATUMSIZE(ocbuf));
2264 db->icsiz += CB_DATUMSIZE(ocbuf);
2265 CB_DATUMCLOSE(ocbuf);
2266 }
2267 cbmapclose(qmap);
2268 cbmapclose(fmap);
2269 cbmapclose(ocmap);
2270 err = FALSE;
2271 sbuf = cbmapdump(doc->attrs, &ssiz);
2272 if(!est_crput(db->attrdb, db->zmode, doc->id, sbuf, ssiz, CR_DKEEP)){
2273 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2274 db->fatal = TRUE;
2275 err = TRUE;
2276 }
2277 free(sbuf);
2278 sbuf = cblistdump(doc->dtexts, &ssiz);
2279 if(!est_crput(db->textdb, db->zmode, doc->id, sbuf, ssiz, CR_DKEEP)){
2280 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2281 db->fatal = TRUE;
2282 err = TRUE;
2283 }
2284 free(sbuf);
2285 if(doc->kwords && !est_db_put_keywords(db, doc->id, doc->kwords, weight)) err = TRUE;
2286 sprintf(nbuf, "%d", doc->id);
2287 if(!vlput(db->listdb, uri, -1, nbuf, -1, VL_DKEEP)){
2288 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2289 db->fatal = TRUE;
2290 err = TRUE;
2291 }
2292 if(cbmaprnum(db->aidxs) > 0){
2293 cbmapiterinit(db->aidxs);
2294 while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
2295 if(!(vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz))) continue;
2296 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2297 switch(attridx->type){
2298 case ESTIDXATTRSTR:
2299 case ESTIDXATTRNUM:
2300 if(!est_aidx_attr_put(attridx->db, doc->id, vbuf, vsiz)){
2301 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2302 db->fatal = TRUE;
2303 err = TRUE;
2304 }
2305 break;
2306 default:
2307 if(!est_aidx_seq_put(attridx->db, doc->id, vbuf, vsiz)){
2308 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2309 db->fatal = TRUE;
2310 err = TRUE;
2311 }
2312 break;
2313 }
2314 }
2315 }
2316 db->dnum++;
2317 if(est_db_used_cache_size(db) > db->icmax && !est_db_flush(db, INT_MAX)) err = TRUE;
2318 return err ? FALSE : TRUE;
2319 }
2320
2321
2322 /* Remove a document from a database. */
est_db_out_doc(ESTDB * db,int id,int options)2323 int est_db_out_doc(ESTDB *db, int id, int options){
2324 ESTDOC *doc;
2325 CBLIST *words;
2326 ESTATTRIDX *attridx;
2327 const char *uri, *kbuf, *vbuf, *text, *word;
2328 char numbuf[ESTNUMBUFSIZ];
2329 int i, j, ksiz, vsiz, len, wsiz;
2330 assert(db && id > 0);
2331 if(!dpwritable(db->metadb)){
2332 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
2333 return FALSE;
2334 }
2335 if(id >= ESTPDOCIDMIN){
2336 est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
2337 return FALSE;
2338 }
2339 if(!(doc = est_db_get_doc(db, id, ESTGDNOKWD))) return FALSE;
2340 if(!doc->attrs || !(uri = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL))){
2341 est_doc_delete(doc);
2342 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2343 db->fatal = TRUE;
2344 return FALSE;
2345 }
2346 if(!est_crout(db->attrdb, id) || !est_crout(db->textdb, id) || !vlout(db->listdb, uri, -1)){
2347 est_doc_delete(doc);
2348 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2349 db->fatal = TRUE;
2350 return FALSE;
2351 }
2352 cbmapout(db->attrcc, (char *)&id, sizeof(int));
2353 cbmapout(db->textcc, (char *)&id, sizeof(int));
2354 if(db->spacc) cbmapout(db->spacc, (char *)&id, sizeof(int));
2355 if(cbmaprnum(db->aidxs) > 0){
2356 cbmapiterinit(db->aidxs);
2357 while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
2358 if(!(vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz))) continue;
2359 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2360 switch(attridx->type){
2361 case ESTIDXATTRSTR:
2362 case ESTIDXATTRNUM:
2363 if(!est_aidx_attr_out(attridx->db, doc->id, vbuf, vsiz)){
2364 est_doc_delete(doc);
2365 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2366 db->fatal = TRUE;
2367 return FALSE;
2368 }
2369 break;
2370 default:
2371 if(!est_aidx_seq_out(attridx->db, doc->id)){
2372 est_doc_delete(doc);
2373 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2374 db->fatal = TRUE;
2375 return FALSE;
2376 }
2377 break;
2378 }
2379 }
2380 }
2381 if(options & ESTODCLEAN){
2382 len = sprintf(numbuf, "\t%d", doc->id);
2383 cbmapput(db->outcc, numbuf, len, "", 0, FALSE);
2384 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
2385 if(i < 0){
2386 if(!(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
2387 } else {
2388 text = CB_LISTVAL(doc->dtexts, i);
2389 }
2390 CB_LISTOPEN(words);
2391 switch(db->amode){
2392 case ESTDFPERFNG:
2393 est_break_text_perfng(text, words, FALSE, TRUE);
2394 break;
2395 case ESTDFCHRCAT:
2396 est_break_text_chrcat(text, words, FALSE);
2397 break;
2398 default:
2399 est_break_text(text, words, FALSE, TRUE);
2400 break;
2401 }
2402 for(j = 0; j < CB_LISTNUM(words); j++){
2403 word = CB_LISTVAL2(words, j, wsiz);
2404 cbmapput(db->outcc, word, wsiz, "", 0, FALSE);
2405 }
2406 CB_LISTCLOSE(words);
2407 }
2408 if(!est_db_out_keywords(db, id) && db->ecode != ESTENOITEM){
2409 est_doc_delete(doc);
2410 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2411 db->fatal = TRUE;
2412 return FALSE;
2413 }
2414 } else {
2415 if(!est_crout(db->kwddb, id) && dpecode != DP_ENOITEM){
2416 est_doc_delete(doc);
2417 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2418 db->fatal = TRUE;
2419 return FALSE;
2420 }
2421 cbmapout(db->veccc, (char *)&id, sizeof(int));
2422 }
2423 est_doc_delete(doc);
2424 if(!est_db_set_doc_entity(db, id, NULL, -1) && db->ecode != ESTENOITEM) return FALSE;
2425 db->dnum--;
2426 return TRUE;
2427 }
2428
2429
2430 /* Edit attributes of a document object in a database. */
est_db_edit_doc(ESTDB * db,ESTDOC * doc)2431 int est_db_edit_doc(ESTDB *db, ESTDOC *doc){
2432 ESTDOC *odoc;
2433 ESTATTRIDX *attridx;
2434 const char *uri, *tmp, *kbuf, *vbuf;
2435 char *ouri, numbuf[ESTNUMBUFSIZ], *text, *sbuf;
2436 int err, id, oid, ksiz, vsiz, ssiz;
2437 assert(db && doc);
2438 if(!dpwritable(db->metadb)){
2439 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
2440 return FALSE;
2441 }
2442 id = -1;
2443 uri = NULL;
2444 if(doc->attrs){
2445 if((tmp = cbmapget(doc->attrs, ESTDATTRID, -1, NULL)) != NULL) id = atoi(tmp);
2446 if((tmp = cbmapget(doc->attrs, ESTDATTRURI, -1, NULL)) != NULL) uri = tmp;
2447 }
2448 if(id < 1 || id >= ESTPDOCIDMIN || (doc->id > 0 && doc->id != id) || !uri || uri[0] == '\0'){
2449 est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
2450 return FALSE;
2451 }
2452 err = FALSE;
2453 if((oid = est_db_uri_to_id(db, uri)) == -1){
2454 if(!(ouri = est_db_get_doc_attr(db, id, ESTDATTRURI))){
2455 est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
2456 return FALSE;
2457 }
2458 sprintf(numbuf, "%d", id);
2459 if(!vlout(db->listdb, ouri, -1) || !vlput(db->listdb, uri, -1, numbuf, -1, VL_DKEEP))
2460 err = TRUE;
2461 free(ouri);
2462 } else if(oid != id){
2463 est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
2464 return FALSE;
2465 }
2466 doc->id = id;
2467 if(cbmaprnum(db->aidxs) > 0 && (odoc = est_db_get_doc(db, id, ESTGDNOTEXT))){
2468 if(!odoc->attrs) odoc->attrs = cbmapopenex(ESTMINIBNUM);
2469 cbmapiterinit(db->aidxs);
2470 while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
2471 if(!(vbuf = cbmapget(odoc->attrs, kbuf, ksiz, &vsiz))) continue;
2472 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2473 switch(attridx->type){
2474 case ESTIDXATTRSTR:
2475 case ESTIDXATTRNUM:
2476 if(!est_aidx_attr_out(attridx->db, id, vbuf, vsiz)){
2477 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2478 db->fatal = TRUE;
2479 err = TRUE;
2480 }
2481 break;
2482 default:
2483 if(!est_aidx_seq_out(attridx->db, id)){
2484 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2485 db->fatal = TRUE;
2486 err = TRUE;
2487 }
2488 break;
2489 }
2490 }
2491 cbmapiterinit(db->aidxs);
2492 while((kbuf = cbmapiternext(db->aidxs, &ksiz)) != NULL){
2493 if(!(vbuf = cbmapget(doc->attrs, kbuf, ksiz, &vsiz))) continue;
2494 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2495 switch(attridx->type){
2496 case ESTIDXATTRSTR:
2497 case ESTIDXATTRNUM:
2498 if(!est_aidx_attr_put(attridx->db, id, vbuf, vsiz)){
2499 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2500 db->fatal = TRUE;
2501 err = TRUE;
2502 }
2503 break;
2504 default:
2505 if(!est_aidx_seq_put(attridx->db, id, vbuf, vsiz)){
2506 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2507 db->fatal = TRUE;
2508 err = TRUE;
2509 }
2510 break;
2511 }
2512 }
2513 est_doc_delete(odoc);
2514 }
2515 if((text = est_db_get_doc_attr(db, id, "")) != NULL){
2516 cbmapput(doc->attrs, "", 0, text, -1, TRUE);
2517 free(text);
2518 }
2519 sbuf = cbmapdump(doc->attrs, &ssiz);
2520 if(!est_crput(db->attrdb, db->zmode, id, sbuf, ssiz, CR_DOVER)){
2521 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2522 db->fatal = TRUE;
2523 err = TRUE;
2524 }
2525 free(sbuf);
2526 cbmapout(db->attrcc, (char *)&id, sizeof(int));
2527 if(db->spacc) cbmapout(db->spacc, (char *)&id, sizeof(int));
2528 return err ? FALSE : TRUE;
2529 }
2530
2531
2532 /* Retrieve a document in a database. */
est_db_get_doc(ESTDB * db,int id,int options)2533 ESTDOC *est_db_get_doc(ESTDB *db, int id, int options){
2534 ESTDOC *doc;
2535 const char *cbuf;
2536 char *vbuf, numbuf[ESTNUMBUFSIZ];
2537 int i, csiz, vsiz, num;
2538 assert(db && id > 0);
2539 if(id >= ESTPDOCIDMIN){
2540 if((num = id - ESTPDOCIDMIN) >= CB_LISTNUM(db->pdocs)){
2541 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2542 return NULL;
2543 }
2544 if((vbuf = cbreadfile(CB_LISTVAL(db->pdocs, num), NULL)) != NULL){
2545 doc = est_doc_new_from_draft(vbuf);
2546 free(vbuf);
2547 } else {
2548 doc = est_doc_new();
2549 }
2550 doc->id = id;
2551 sprintf(numbuf, "%d", id);
2552 est_doc_add_attr(doc, ESTDATTRID, numbuf);
2553 if(!est_doc_attr(doc, ESTDATTRURI))
2554 est_doc_add_attr(doc, ESTDATTRURI, CB_LISTVAL(db->pdocs, num));
2555 return doc;
2556 }
2557 cbuf = NULL;
2558 if(options & ESTGDNOATTR){
2559 if(crvsiz(db->attrdb, (char *)&id, sizeof(int)) == -1){
2560 if(dpecode == DP_ENOITEM){
2561 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2562 return NULL;
2563 } else {
2564 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2565 db->fatal = TRUE;
2566 return NULL;
2567 }
2568 }
2569 vbuf = NULL;
2570 } else if((cbuf = cbmapget(db->attrcc, (char *)&id, sizeof(int), &csiz)) != NULL){
2571 cbmapmove(db->attrcc, (char *)&id, sizeof(int), FALSE);
2572 vbuf = NULL;
2573 } else if(!(vbuf = est_crget(db->attrdb, db->zmode, id, &vsiz))){
2574 if(dpecode == DP_ENOITEM){
2575 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2576 return NULL;
2577 } else {
2578 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2579 db->fatal = TRUE;
2580 return NULL;
2581 }
2582 }
2583 doc = est_doc_new();
2584 doc->id = id;
2585 if(cbuf){
2586 doc->attrs = cbmapload(cbuf, csiz);
2587 } else if(vbuf){
2588 doc->attrs = cbmapload(vbuf, vsiz);
2589 if(db->acmnum > 0) cbmapput(db->attrcc, (char *)&id, sizeof(int), vbuf, vsiz, TRUE);
2590 free(vbuf);
2591 if(cbmaprnum(db->attrcc) > db->acmnum){
2592 num = cbmaprnum(db->attrcc) * 0.1 + 1;
2593 cbmapiterinit(db->attrcc);
2594 for(i = 0; i < num && (cbuf = cbmapiternext(db->attrcc, NULL)) != NULL; i++){
2595 cbmapout(db->attrcc, cbuf, sizeof(int));
2596 }
2597 }
2598 } else {
2599 doc->attrs = NULL;
2600 }
2601 if(!(options & ESTGDNOTEXT)){
2602 if((cbuf = cbmapget(db->textcc, (char *)&id, sizeof(int), &csiz)) != NULL){
2603 cbmapmove(db->textcc, (char *)&id, sizeof(int), FALSE);
2604 doc->dtexts = cblistload(cbuf, csiz);
2605 } else {
2606 if(!(vbuf = est_crget(db->textdb, db->zmode, id, &vsiz))){
2607 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
2608 db->fatal = TRUE;
2609 est_doc_delete(doc);
2610 return NULL;
2611 }
2612 doc->dtexts = cblistload(vbuf, vsiz);
2613 if(db->tcmnum > 0) cbmapput(db->textcc, (char *)&id, sizeof(int), vbuf, vsiz, TRUE);
2614 free(vbuf);
2615 if(cbmaprnum(db->textcc) > db->tcmnum){
2616 num = cbmaprnum(db->textcc) * 0.1 + 1;
2617 cbmapiterinit(db->textcc);
2618 for(i = 0; i < num &&(cbuf = cbmapiternext(db->textcc, NULL)) != NULL; i++){
2619 cbmapout(db->textcc, cbuf, sizeof(int));
2620 }
2621 }
2622 }
2623 }
2624 if(!(options & ESTGDNOKWD)) doc->kwords = est_db_get_keywords(db, id);
2625 return doc;
2626 }
2627
2628
2629 /* Retrieve the value of an attribute of a document in a database. */
est_db_get_doc_attr(ESTDB * db,int id,const char * name)2630 char *est_db_get_doc_attr(ESTDB *db, int id, const char *name){
2631 ESTATTRIDX *attridx;
2632 ESTDOC *doc;
2633 const char *cbuf;
2634 char *mbuf, *vbuf;
2635 int cb, csiz, msiz, vsiz;
2636 assert(db && id > 0 && name);
2637 if(id >= ESTPDOCIDMIN){
2638 if(!(doc = est_db_get_doc(db, id, 0))){
2639 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2640 return NULL;
2641 }
2642 if(!(cbuf = est_doc_attr(doc, name))){
2643 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2644 est_doc_delete(doc);
2645 return NULL;
2646 }
2647 vbuf = cbmemdup(cbuf, -1);
2648 est_doc_delete(doc);
2649 return vbuf;
2650 }
2651 cb = db->spacc && !strcmp(name, db->scname);
2652 if(cb && (cbuf = cbmapget(db->spacc, (char *)&id, sizeof(int), &csiz)) != NULL){
2653 cbmapmove(db->spacc, (char *)&id, sizeof(int), FALSE);
2654 return cbmemdup(cbuf, csiz);
2655 }
2656 if((attridx = (ESTATTRIDX *)cbmapget(db->aidxs, name, -1, NULL)) != NULL &&
2657 attridx->type == ESTIDXATTRSEQ){
2658 if(!(vbuf = est_aidx_seq_get(attridx->db, id, &vsiz))){
2659 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2660 return NULL;
2661 }
2662 if(cb) cbmapput(db->spacc, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
2663 return vbuf;
2664 }
2665 if(!(mbuf = est_crget(db->attrdb, db->zmode, id, &msiz))){
2666 est_set_ecode(&(db->ecode), dpecode == DP_ENOITEM ? ESTENOITEM : ESTEDB, __LINE__);
2667 return NULL;
2668 }
2669 if(!(vbuf = cbmaploadone(mbuf, msiz, name, -1, &vsiz))){
2670 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2671 free(mbuf);
2672 return NULL;
2673 }
2674 if(cb) cbmapput(db->spacc, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
2675 free(mbuf);
2676 return vbuf;
2677 }
2678
2679
2680 /* Get the ID of a document spacified by URI. */
est_db_uri_to_id(ESTDB * db,const char * uri)2681 int est_db_uri_to_id(ESTDB *db, const char *uri){
2682 const char *vbuf;
2683 int id;
2684 assert(db && uri);
2685 if(!(vbuf = vlgetcache(db->listdb, uri, -1, NULL))){
2686 if(CB_LISTNUM(db->pdocs) > 0 && (id = est_pidx_uri_to_id(db, uri)) > 0) return id;
2687 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
2688 return -1;
2689 }
2690 return atoi(vbuf);
2691 }
2692
2693
2694 /* Get the name of a database. */
est_db_name(ESTDB * db)2695 const char *est_db_name(ESTDB *db){
2696 assert(db);
2697 return db->name;
2698 }
2699
2700
2701 /* Get the number of documents in a database. */
est_db_doc_num(ESTDB * db)2702 int est_db_doc_num(ESTDB *db){
2703 assert(db);
2704 return db->dnum;
2705 }
2706
2707
2708 /* Get the number of words in a database. */
est_db_word_num(ESTDB * db)2709 int est_db_word_num(ESTDB *db){
2710 int wnum;
2711 assert(db);
2712 wnum = vlrnum(db->fwmdb);
2713 return wnum > 0 ? wnum : 0;
2714 }
2715
2716
2717 /* Get the size of a database. */
est_db_size(ESTDB * db)2718 double est_db_size(ESTDB *db){
2719 ESTATTRIDX *attridx;
2720 const char *kbuf;
2721 double size;
2722 assert(db);
2723 size = (double)dpfsiz(db->metadb) + est_idx_size(db->idxdb) + vlfsiz(db->fwmdb) +
2724 vlfsiz(db->auxdb) + vlfsiz(db->xfmdb) + crfsizd(db->attrdb) + crfsizd(db->textdb) +
2725 crfsizd(db->kwddb) + vlfsiz(db->listdb);
2726 if(cbmaprnum(db->aidxs) > 0){
2727 cbmapiterinit(db->aidxs);
2728 while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
2729 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
2730 switch(attridx->type){
2731 case ESTIDXATTRSTR:
2732 case ESTIDXATTRNUM:
2733 size += vlfsiz(attridx->db);
2734 break;
2735 default:
2736 size += dpfsiz(attridx->db);
2737 break;
2738 }
2739 }
2740 }
2741 return size;
2742 }
2743
2744
/* Search a database for documents corresponding to a condition.
   `db' specifies a database object.
   `cond' specifies a condition object.  Its `auxwords' map is replaced by a fresh one, its
   `shadows' map is rebuilt (or set to `NULL' when `ecllim' is negative) and, when `scfb' is
   set, its `scores' array is refreshed with the scores of the returned documents.
   `nump' specifies the pointer to a variable to which the number of elements of the return
   value is assigned.
   `hints' specifies a map object which, if not `NULL', receives under the empty key the
   number (possibly estimated) of corresponding documents; it is also handed to the inner
   search routines.
   The return value is a newly allocated array of document IDs which the caller should release
   with `free'.  When the array is empty the error code of the database is set to
   `ESTENOITEM'. */
int *est_db_search(ESTDB *db, ESTCOND *cond, int *nump, CBMAP *hints){
  ESTSCORE *scores, *tscores;
  CBMAP *svmap, *ordattrs;
  CBLIST *terms;
  const char *term, *rp;
  char *tmp, numbuf[ESTNUMBUFSIZ];
  const int *nscores;
  int i, j, snum, ign, nsnum, unum, knum, mnum, top, pcnum, ncnum, tsnum, add;
  int nnum, id, score, hnum, len, rest, *rval, rnum;
  double tune;
  assert(db && cond && nump);
  /* start every search with an empty map of auxiliary words */
  if(cond->auxwords) cbmapclose(cond->auxwords);
  cond->auxwords = cbmapopenex(ESTMINIBNUM);
  scores = NULL;
  snum = 0;
  /* `ign' is the index of the attribute expression already resolved by an attribute index,
     or -1 when none was; it is passed on to est_narrow_scores */
  ign = -1;
  nscores = cond->nscores;
  nsnum = cond->nsnum;
  ordattrs = cbmapopenex(cond->order ? (CB_LISTNUM(db->pdocs) + ESTMINIBNUM) : 1);
  if(cond->phrase){
    if(cbstrfwmatch(cond->phrase, ESTOPID)){
      /* ID operator: the rest of the phrase is read as a document ID */
      if((id = atoi(cond->phrase + strlen(ESTOPID))) > 0){
        CB_MALLOC(scores, sizeof(ESTSCORE));
        scores[0].id = id;
        scores[0].score = 0;
        scores[0].value = NULL;
        snum = 1;
      } else {
        CB_MALLOC(scores, 1);
        snum = 0;
      }
    } else if(cbstrfwmatch(cond->phrase, ESTOPURI)){
      /* URI operator: skip leading blanks and resolve the rest as a URI */
      rp = cond->phrase + strlen(ESTOPURI);
      while(*rp > '\0' && *rp <= ' '){
        rp++;
      }
      if((id = est_db_uri_to_id(db, rp)) > 0){
        CB_MALLOC(scores, sizeof(ESTSCORE));
        scores[0].id = id;
        scores[0].score = 0;
        scores[0].value = NULL;
        snum = 1;
      } else {
        CB_MALLOC(scores, 1);
        snum = 0;
      }
    } else if(cbstrfwmatch(cond->phrase, ESTOPSIMILAR)){
      /* similarity operator: up to three optional decimal numbers (knum, unum, mnum) separated
         by blanks precede the text handed to est_phrase_vector */
      rp = cond->phrase + strlen(ESTOPSIMILAR);
      while(*rp > '\0' && *rp <= ' '){
        rp++;
      }
      knum = -1;
      unum = -1;
      mnum = -1;
      if(*rp >= '0' && *rp <= '9'){
        knum = atoi(rp);
        while(*rp >= '0' && *rp <= '9'){
          rp++;
        }
        while(*rp > '\0' && *rp <= ' '){
          rp++;
        }
        if(*rp >= '0' && *rp <= '9'){
          unum = atoi(rp);
          while(*rp >= '0' && *rp <= '9'){
            rp++;
          }
          while(*rp > '\0' && *rp <= ' '){
            rp++;
          }
          if(*rp >= '0' && *rp <= '9'){
            mnum = atoi(rp);
            while(*rp >= '0' && *rp <= '9'){
              rp++;
            }
            while(*rp > '\0' && *rp <= ' '){
              rp++;
            }
          }
        }
      }
      /* missing or non-positive numbers fall back to the built-in defaults */
      if(knum < 1) knum = ESTSMLRKNUM;
      if(unum < 1) unum = ESTSMLRUNUM;
      if(mnum < 1) mnum = ESTSMLRMNUM;
      svmap = est_phrase_vector(rp);
      scores = est_search_similar(db, svmap, &snum, knum, unum, mnum, cond->tfidf,
                                  cond->order ? ESTSMLRNMIN : 0.0, cond->auxmin, cond->auxwords);
      cbmapclose(svmap);
    } else if(cbstrfwmatch(cond->phrase, ESTOPRANK)){
      /* rank operator: an optionally negative number followed by the rest of the expression */
      rp = cond->phrase + strlen(ESTOPRANK);
      while(*rp > '\0' && *rp <= ' '){
        rp++;
      }
      top = atoi(rp);
      while((*rp >= '0' && *rp <= '9') || *rp == '-'){
        rp++;
      }
      while(*rp > '\0' && *rp <= ' '){
        rp++;
      }
      scores = est_search_rank(db, rp, top, &snum);
    } else {
      /* ordinary phrase: convert it according to the phrase mode and split it into terms */
      switch(cond->pmode){
      default:
        terms = est_phrase_terms(cond->phrase);
        break;
      case ESTPMSIMPLE:
        tmp = est_phrase_from_simple(cond->phrase);
        terms = est_phrase_terms(tmp);
        free(tmp);
        break;
      case ESTPMROUGH:
        tmp = est_phrase_from_rough(cond->phrase);
        terms = est_phrase_terms(tmp);
        free(tmp);
        break;
      case ESTPMUNION:
        tmp = est_phrase_from_union(cond->phrase);
        terms = est_phrase_terms(tmp);
        free(tmp);
        break;
      case ESTPMISECT:
        tmp = est_phrase_from_isect(cond->phrase);
        terms = est_phrase_terms(tmp);
        free(tmp);
        break;
      }
      /* pcnum counts the terms searched in adding mode, ncnum those in subtracting mode */
      pcnum = 0;
      ncnum = 0;
      add = TRUE;
      for(i = 0; i < CB_LISTNUM(terms); i++){
        term = CB_LISTVAL(terms, i);
        if(!strcmp(term, ESTOPISECT)){
          add = TRUE;
        } else if(!strcmp(term, ESTOPDIFF)){
          add = FALSE;
        } else {
          if(!strcmp(term, ESTOPUVSET)){
            tscores = est_search_uvset(db, &tsnum, hints, add);
          } else {
            tscores = est_search_union(db, term, cond->gstep, cond->cbxpn, &tsnum, hints, add,
                                       add && !cond->order ? cond->auxmin : -1, cond->auxwords);
          }
          if(add){
            /* rescale the scores of an added term unless the score mode is ESTDFSCASIS */
            if(db->smode != ESTDFSCASIS){
              if(cond->tfidf){
                tune = pow(tsnum + 64, 0.4);
                for(j = 0; j < tsnum; j++){
                  tscores[j].score *= 100.0 / tune;
                }
              } else {
                for(j = 0; j < tsnum; j++){
                  tscores[j].score *= 10;
                }
              }
            }
            pcnum++;
          } else {
            ncnum++;
          }
          if(scores){
            /* append the hits of this term; hits of a subtracted term are marked with -1 */
            CB_REALLOC(scores, (snum + tsnum) * sizeof(ESTSCORE) + 1);
            for(j = 0; j < tsnum; j++){
              scores[snum+j].id = tscores[j].id;
              scores[snum+j].score = add ? tscores[j].score : -1;
              scores[snum+j].value = NULL;
            }
            snum += tsnum;
            free(tscores);
          } else {
            /* the first term simply becomes the accumulated array */
            scores = tscores;
            snum = tsnum;
          }
        }
      }
      if(scores){
        if(pcnum > 1 || ncnum > 0){
          /* merge the entries sharing one ID: a document survives only if none of its entries
             is negative and it was hit by at least pcnum entries */
          qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
          nnum = 0;
          for(i = 0; i < snum; i++){
            id = scores[i].id;
            score = scores[i].score;
            hnum = score >= 0 ? 1 : 0;
            for(j = i + 1; j < snum && scores[j].id == id; j++){
              if(score >= 0 && scores[j].score >= 0){
                if(db->smode != ESTDFSCASIS) score += scores[j].score;
                hnum++;
              } else {
                score = -1;
              }
            }
            if(score >= 0 && hnum >= pcnum){
              scores[nnum].id = id;
              scores[nnum].score = score;
              scores[nnum].value = NULL;
              nnum++;
            }
            /* continue after the run of equal IDs (the loop increment adds one) */
            i = j - 1;
          }
          snum = nnum;
        }
      } else {
        CB_MALLOC(scores, 1);
        snum = 0;
      }
      CB_LISTCLOSE(terms);
    }
  } else if(cond->attrs){
    /* no phrase but attribute expressions */
    if(nscores && nsnum < ESTAISNUMMIN * 4){
      /* a short narrowing array: each element is printed as a decimal string and looked up
         with est_db_uri_to_id; elements that resolve become the candidates and the array is
         not consulted again */
      CB_MALLOC(scores, nsnum * sizeof(ESTSCORE) + 1);
      nnum = 0;
      for(i = 0; i < nsnum; i++){
        sprintf(numbuf, "%d", nscores[i]);
        if((id = est_db_uri_to_id(db, numbuf)) > 0){
          scores[nnum].id = id;
          scores[nnum].score = nscores[i];
          scores[nnum].value = NULL;
          nnum++;
        }
      }
      snum = nnum;
      nscores = NULL;
      nsnum = -1;
    } else {
      /* use the first attribute expression an attribute index can resolve, otherwise start
         from the result of est_search_uvset */
      scores = NULL;
      for(i = 0; i < CB_LISTNUM(cond->attrs); i++){
        if((scores = est_search_aidx_attr(db, CB_LISTVAL(cond->attrs, i), &snum)) != NULL){
          ign = i;
          break;
        }
      }
      if(!scores) scores = est_search_uvset(db, &snum, hints, TRUE);
    }
  } else {
    /* neither a phrase nor attribute expressions: empty result */
    CB_MALLOC(scores, 1);
    snum = 0;
  }
  /* let the pseudo indexes contribute when some are registered */
  if(CB_LISTNUM(db->pdocs) > 0) scores = est_search_pidxs(db, cond, scores, &snum, ordattrs);
  if(nscores && cond->phrase && cond->phrase[0] != '\0'){
    /* keep only the candidates whose score also occurs in the narrowing array; the merge walks
       both sequences downwards, so it presumably expects `nscores' in descending order --
       NOTE(review): confirm against the code which fills `cond->nscores' */
    qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
    nnum = 0;
    j = 0;
    for(i = 0; i < snum; i++){
      id = scores[i].id;
      score = scores[i].score;
      while(j < nsnum && nscores[j] > score){
        j++;
      }
      if(j < nsnum && nscores[j] == score){
        scores[nnum].id = id;
        scores[nnum].score = score;
        scores[nnum].value = NULL;
        nnum++;
        j++;
      }
    }
    snum = nnum;
  }
  if(cbmaprnum(db->outcc) > 0){
    /* drop the candidates whose key "\t<id>" is present in the `outcc' map */
    tsnum = 0;
    for(i = 0; i < snum; i++){
      len = sprintf(numbuf, "\t%d", scores[i].id);
      if(cbmapget(db->outcc, numbuf, len, NULL)) continue;
      scores[tsnum++] = scores[i];
    }
    snum = tsnum;
  }
  if(cond->max > 0 && cond->max * ESTATTRALW + 1 < snum && cond->attrs &&
     !cond->order && !cond->distinct){
    /* many candidates and only attribute filtering left: narrow the best-scored candidates up
       to a limit and report an extrapolated hit count */
    qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
    nnum = est_narrow_scores(db, cond->attrs, ign, cond->order, cond->distinct, scores, snum,
                             cond->max * ESTATTRALW + 1, &rest, ordattrs);
    if(hints){
      sprintf(numbuf, "%d",
              rest > cond->max / 2 ? (int)(snum * (nnum / (double)(snum - rest))) : nnum);
      cbmapput(hints, "", 0, numbuf, -1, TRUE);
    }
    snum = nnum;
  } else {
    /* otherwise narrow without a limit (if there is anything to narrow by) and report the
       exact count */
    if(cond->attrs || cond->order || cond->distinct)
      snum = est_narrow_scores(db, cond->attrs, ign, cond->order, cond->distinct, scores, snum,
                               INT_MAX, &rest, ordattrs);
    if(!cond->order) qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
    if(hints){
      sprintf(numbuf, "%d", snum);
      cbmapput(hints, "", 0, numbuf, -1, TRUE);
    }
  }
  /* rebuild the map of shadows; it is only filled when an eclipse limit is given */
  if(cond->shadows) cbmapclose(cond->shadows);
  if(cond->ecllim >= 0.0){
    cond->shadows = cbmapopenex(snum + 1);
    snum = est_eclipse_scores(db, scores, snum, cond->max > 0 ? cond->max : snum,
                              ESTECLKNUM, cond->tfidf, cond->ecllim, cond->shadows);
  } else {
    cond->shadows = NULL;
  }
  /* apply `skip' and `max' and copy the IDs into the result array */
  rnum = snum - cond->skip;
  if(rnum < 0) rnum = 0;
  if(cond->max >= 0 && cond->max < rnum) rnum = cond->max;
  CB_MALLOC(rval, rnum * sizeof(int) + 1);
  tscores = scores + cond->skip;
  for(i = 0; i < rnum; i++){
    rval[i] = tscores[i].id;
  }
  if(cond->scfb){
    /* score feedback: expose the scores of the returned documents through the condition */
    if(rnum > 0){
      CB_REALLOC(cond->scores, rnum * sizeof(int) + 1);
      for(i = 0; i < rnum; i++){
        cond->scores[i] = tscores[i].score;
      }
      cond->snum = rnum;
    } else {
      free(cond->scores);
      cond->scores = NULL;
      cond->snum = 0;
    }
  }
  *nump = rnum;
  if(*nump < 1) est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
  cbmapclose(ordattrs);
  free(scores);
  return rval;
}
3069
3070
3071 /* Search documents of plural databases. */
est_db_search_meta(ESTDB ** dbs,int dbnum,ESTCOND * cond,int * nump,CBMAP * hints)3072 int *est_db_search_meta(ESTDB **dbs, int dbnum, ESTCOND *cond, int *nump, CBMAP *hints){
3073 ESTMETASCORE *scores, *tscores;
3074 ESTCOND *tcond;
3075 CBMAP *thints, *umap;
3076 const char *kbuf, *otype, *rp;
3077 char *distinct, numbuf[ESTNUMBUFSIZ], *oname, *wp, *vbuf;
3078 int i, j, max, skip, smax, snum, *res, rnum, ksiz, num;
3079 time_t tval;
3080 assert(dbs && dbnum >= 0 && cond && nump);
3081 max = cond->max;
3082 if(cond->distinct) cond->max = -1;
3083 skip = cond->skip;
3084 cond->skip = 0;
3085 distinct = cond->distinct;
3086 cond->distinct = NULL;
3087 smax = ESTALLOCUNIT;
3088 CB_MALLOC(scores, smax * sizeof(ESTMETASCORE));
3089 snum = 0;
3090 for(i = 0; i < dbnum; i++){
3091 if(cond->mask & (1 << i)) continue;
3092 tcond = est_cond_dup(cond);
3093 est_cond_set_options(tcond, ESTCONDSCFB);
3094 thints = cbmapopenex(ESTMINIBNUM);
3095 res = est_db_search(dbs[i], tcond, &rnum, thints);
3096 for(j = 0; j < rnum; j++){
3097 if(snum >= smax){
3098 smax *= 2;
3099 CB_REALLOC(scores, smax * sizeof(ESTMETASCORE));
3100 }
3101 scores[snum].db = i;
3102 scores[snum].id = res[j];
3103 scores[snum].score = est_cond_score(tcond, j);
3104 scores[snum].value = NULL;
3105 snum++;
3106 }
3107 if(hints){
3108 cbmapiterinit(thints);
3109 while((kbuf = cbmapiternext(thints, &ksiz)) != NULL){
3110 num = atoi(cbmapiterval(kbuf, NULL));
3111 if((rp = cbmapget(hints, kbuf, ksiz, NULL)) != NULL) num += atoi(rp);
3112 sprintf(numbuf, "%d", num);
3113 cbmapput(hints, kbuf, ksiz, numbuf, -1, TRUE);
3114 }
3115 }
3116 free(res);
3117 cbmapclose(thints);
3118 est_cond_delete(tcond);
3119 }
3120 oname = NULL;
3121 otype = NULL;
3122 if(cond->order){
3123 oname = cbmemdup(cond->order, -1);
3124 cbstrtrim(oname);
3125 otype = ESTORDSTRA;
3126 if((wp = strchr(oname, ' ')) != NULL){
3127 *wp = '\0';
3128 rp = wp + 1;
3129 while(*rp == ' '){
3130 rp++;
3131 }
3132 otype = rp;
3133 }
3134 }
3135 if(oname){
3136 if(!cbstricmp(oname, ESTORDIDA)){
3137 qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_id_asc);
3138 } else if(!cbstricmp(oname, ESTORDIDD)){
3139 qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_id_desc);
3140 } else if(!cbstricmp(oname, ESTORDSCA)){
3141 qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_score_asc);
3142 } else if(!cbstricmp(oname, ESTORDSCD)){
3143 qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_score_desc);
3144 } else {
3145 for(i = 0; i < snum; i++){
3146 scores[i].value = est_db_get_doc_attr(dbs[scores[i].db], scores[i].id, oname);
3147 if(!scores[i].value) scores[i].value = cbmemdup("", 0);
3148 }
3149 if(!cbstricmp(otype, ESTORDSTRA)){
3150 qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_str_asc);
3151 } else if(!cbstricmp(otype, ESTORDSTRD)){
3152 qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_str_desc);
3153 } else if(!cbstricmp(otype, ESTORDNUMA)){
3154 for(i = 0; i < snum; i++){
3155 tval = cbstrmktime(scores[i].value);
3156 free(scores[i].value);
3157 scores[i].value = (void *)tval;
3158 }
3159 qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_num_asc);
3160 for(i = 0; i < snum; i++){
3161 scores[i].value = NULL;
3162 }
3163 } else if(!cbstricmp(otype, ESTORDNUMD)){
3164 for(i = 0; i < snum; i++){
3165 tval = cbstrmktime(scores[i].value);
3166 free(scores[i].value);
3167 scores[i].value = (void *)tval;
3168 }
3169 qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_num_desc);
3170 for(i = 0; i < snum; i++){
3171 scores[i].value = NULL;
3172 }
3173 }
3174 for(i = 0; i < snum; i++){
3175 free(scores[i].value);
3176 }
3177 }
3178 free(oname);
3179 } else {
3180 qsort(scores, snum, sizeof(ESTMETASCORE), est_metascore_compare_by_score_desc);
3181 }
3182 if(distinct){
3183 umap = cbmapopenex(snum + 1);
3184 rnum = 0;
3185 for(i = 0; i < snum; i++){
3186 vbuf = est_db_get_doc_attr(dbs[scores[i].db], scores[i].id, distinct);
3187 if(!vbuf) vbuf = cbmemdup("", 0);
3188 if(cbmapput(umap, vbuf, -1, "", 0, FALSE)) scores[rnum++] = scores[i];
3189 free(vbuf);
3190 }
3191 snum = rnum;
3192 cbmapclose(umap);
3193 }
3194 rnum = snum - skip;
3195 if(rnum < 0) rnum = 0;
3196 if(cond->max >= 0 && cond->max < rnum) rnum = cond->max;
3197 CB_MALLOC(res, rnum * sizeof(int) * 2 + 1);
3198 tscores = scores + skip;
3199 for(i = 0; i < rnum; i++){
3200 res[i*2] = tscores[i].db;
3201 res[i*2+1] = tscores[i].id;
3202 }
3203 if(cond->scfb){
3204 if(rnum > 0){
3205 CB_REALLOC(cond->scores, rnum * sizeof(int) + 1);
3206 for(i = 0; i < rnum; i++){
3207 cond->scores[i] = tscores[i].score;
3208 }
3209 cond->snum = rnum;
3210 } else {
3211 free(cond->scores);
3212 cond->scores = NULL;
3213 cond->snum = 0;
3214 }
3215 }
3216 *nump = rnum * 2;
3217 free(scores);
3218 cond->max = max;
3219 cond->skip = skip;
3220 cond->distinct = distinct;
3221 return res;
3222 }
3223
3224
3225 /* Check whether a document object matches the phrase of a search condition object definitely. */
est_db_scan_doc(ESTDB * db,ESTDOC * doc,ESTCOND * cond)3226 int est_db_scan_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond){
3227 struct { char *word; int num; } wsets[ESTSCANWNUM];
3228 CBLIST *terms, *words;
3229 const char *term, *text;
3230 unsigned char *rbuf;
3231 char *tmp;
3232 int i, j, k, wsnum, add, rsiz, hit;
3233 assert(db && doc && cond);
3234 if(!cond->phrase || cbstrfwmatch(cond->phrase, ESTOPSIMILAR) ||
3235 cbstrfwmatch(cond->phrase, ESTOPID) || cbstrfwmatch(cond->phrase, ESTOPURI)) return FALSE;
3236 if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
3237 switch(cond->pmode){
3238 default:
3239 terms = est_phrase_terms(cond->phrase);
3240 break;
3241 case ESTPMSIMPLE:
3242 tmp = est_phrase_from_simple(cond->phrase);
3243 terms = est_phrase_terms(tmp);
3244 free(tmp);
3245 break;
3246 case ESTPMROUGH:
3247 tmp = est_phrase_from_rough(cond->phrase);
3248 terms = est_phrase_terms(tmp);
3249 free(tmp);
3250 break;
3251 case ESTPMUNION:
3252 tmp = est_phrase_from_union(cond->phrase);
3253 terms = est_phrase_terms(tmp);
3254 free(tmp);
3255 break;
3256 case ESTPMISECT:
3257 tmp = est_phrase_from_isect(cond->phrase);
3258 terms = est_phrase_terms(tmp);
3259 free(tmp);
3260 break;
3261 }
3262 wsnum = 0;
3263 add = TRUE;
3264 for(i = 0; i < CB_LISTNUM(terms); i++){
3265 term = CB_LISTVAL(terms, i);
3266 if(!strcmp(term, ESTOPISECT)){
3267 add = TRUE;
3268 } else if(!strcmp(term, ESTOPDIFF)){
3269 add = FALSE;
3270 } else if(add && strcmp(term, ESTOPUVSET)){
3271 if(term[0] == ' '){
3272 term++;
3273 if(term[0] == 'b'){
3274 term++;
3275 } else if(term[0] == 'e'){
3276 term++;
3277 }
3278 }
3279 words = cbsplit(term, -1, "\t");
3280 while(wsnum < ESTSCANWNUM && CB_LISTNUM(words) > 0){
3281 wsets[wsnum].word = cblistshift(words, NULL);
3282 wsets[wsnum].num = i;
3283 wsnum++;
3284 }
3285 CB_LISTCLOSE(words);
3286 }
3287 }
3288 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
3289 if(i < 0){
3290 if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
3291 } else {
3292 text = CB_LISTVAL(doc->dtexts, i);
3293 }
3294 rbuf = (unsigned char *)est_uconv_in(text, strlen(text), &rsiz);
3295 est_canonicalize_text(rbuf, rsiz, FALSE);
3296 tmp = est_uconv_out((char *)rbuf, rsiz, &rsiz);
3297 for(j = 0; j < wsnum; j++){
3298 if(!wsets[j].word) continue;
3299 if(est_strstr_sparse(tmp, wsets[j].word)){
3300 for(k = 0; k < wsnum; k++){
3301 if(!wsets[k].word) continue;
3302 if(wsets[k].num == wsets[j].num){
3303 free(wsets[k].word);
3304 wsets[k].word = NULL;
3305 }
3306 }
3307 }
3308 }
3309 free(tmp);
3310 free(rbuf);
3311 }
3312 hit = TRUE;
3313 for(i = 0; i < wsnum; i++){
3314 if(!wsets[i].word) continue;
3315 free(wsets[i].word);
3316 hit = FALSE;
3317 }
3318 CB_LISTCLOSE(terms);
3319 return hit;
3320 }
3321
3322
3323 /* Set the maximum size of the cache memory of a database. */
est_db_set_cache_size(ESTDB * db,size_t size,int anum,int tnum,int rnum)3324 void est_db_set_cache_size(ESTDB *db, size_t size, int anum, int tnum, int rnum){
3325 assert(db);
3326 if(dpwritable(db->metadb) && size >= 0) db->icmax = size;
3327 if(anum >= 0) db->acmnum = anum;
3328 if(tnum >= 0) db->tcmnum = tnum;
3329 if(rnum >= 0) db->rcmnum = rnum;
3330 db->vcmnum = db->acmnum / 2;
3331 }
3332
3333
3334 /* Add a pseudo index directory to a database. */
est_db_add_pseudo_index(ESTDB * db,const char * path)3335 int est_db_add_pseudo_index(ESTDB *db, const char *path){
3336 CBLIST *files;
3337 const char *file;
3338 char pbuf[ESTPATHBUFSIZ];
3339 int i, len;
3340 assert(db && path);
3341 if(!(files = cbdirlist(path))){
3342 est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
3343 return FALSE;
3344 }
3345 cblistsort(files);
3346 for(i = 0; i < CB_LISTNUM(files); i++){
3347 file = CB_LISTVAL(files, i);
3348 if(!strcmp(file, ESTCDIRSTR) || !strcmp(file, ESTPDIRSTR)) continue;
3349 len = sprintf(pbuf, "%s%c%s", path, ESTPATHCHR, file);
3350 CB_LISTPUSH(db->pdocs, pbuf, len);
3351 }
3352 CB_LISTCLOSE(files);
3353 if(db->puris){
3354 cbmapclose(db->puris);
3355 db->puris = NULL;
3356 }
3357 return TRUE;
3358 }
3359
3360
3361
3362 /*************************************************************************************************
3363 * features for experts
3364 *************************************************************************************************/
3365
3366
/* Handle to the file of random number generator.  It is `NULL' at program start. */
FILE *est_random_ifp = NULL;


/* POSIX signal handlers: a table of `ESTSIGNUM' pointers to functions which take one `int'
   argument and return nothing. */
void (*est_signal_handlers[ESTSIGNUM])(int);
3373
3374
3375 /* Break a sentence of text and extract words. */
est_break_text(const char * text,CBLIST * list,int norm,int tail)3376 void est_break_text(const char *text, CBLIST *list, int norm, int tail){
3377 CBLIST *words;
3378 const unsigned char *word, *next;
3379 unsigned char *utext;
3380 char *tmp;
3381 int i, j, k, size, cc, wsiz, nsiz, tsiz;
3382 assert(text && list);
3383 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
3384 if(norm) est_normalize_text(utext, size, &size);
3385 est_canonicalize_text(utext, size, FALSE);
3386 CB_LISTOPEN(words);
3387 for(i = 0; i < size; i += 2){
3388 cc = est_char_category(utext[i] * 0x100 + utext[i+1]);
3389 for(j = i + 2; j < size; j += 2){
3390 if(est_char_category(utext[j] * 0x100 + utext[j+1]) != cc) break;
3391 }
3392 switch(cc){
3393 case ESTDELIMCHR:
3394 case ESTWESTALPH:
3395 CB_LISTPUSH(words, (char *)(utext + i), j - i);
3396 break;
3397 case ESTEASTALPH:
3398 for(k = i; k < j; k += 2){
3399 if(j - k >= 4){
3400 CB_LISTPUSH(words, (char *)(utext + k), 4);
3401 } else {
3402 CB_LISTPUSH(words, (char *)(utext + k), 2);
3403 }
3404 }
3405 break;
3406 default:
3407 break;
3408 }
3409 i = j - 2;
3410 }
3411 for(i = 0; i < CB_LISTNUM(words); i++){
3412 word = (unsigned char *)CB_LISTVAL2(words, i, wsiz);
3413 if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
3414 i < CB_LISTNUM(words) - 1){
3415 next = (unsigned char *)cblistval(words, i + 1, &nsiz);
3416 if(nsiz > 4) nsiz = 4;
3417 if(est_char_category(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
3418 CB_MALLOC(tmp, wsiz + nsiz + 1);
3419 memcpy(tmp, word, wsiz);
3420 memcpy(tmp + wsiz, next, nsiz);
3421 cblistover(words, i, tmp, wsiz + nsiz);
3422 free(tmp);
3423 }
3424 }
3425 for(i = 0; i < CB_LISTNUM(words); i++){
3426 word = (unsigned char *)CB_LISTVAL2(words, i, wsiz);
3427 if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
3428 if(est_char_category(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
3429 }
3430 tmp = est_uconv_out((char *)word, wsiz, &tsiz);
3431 CB_LISTPUSHBUF(list, tmp, tsiz);
3432 }
3433 CB_LISTCLOSE(words);
3434 free(utext);
3435 }
3436
3437
3438 /* Break a sentence of text and extract words using perfect N-gram analyzer. */
est_break_text_perfng(const char * text,CBLIST * list,int norm,int tail)3439 void est_break_text_perfng(const char *text, CBLIST *list, int norm, int tail){
3440 CBLIST *words;
3441 const unsigned char *word, *next;
3442 unsigned char *utext;
3443 char *tmp;
3444 int i, j, k, size, cc, wsiz, nsiz, tsiz;
3445 assert(text && list);
3446 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
3447 if(norm) est_normalize_text(utext, size, &size);
3448 est_canonicalize_text(utext, size, FALSE);
3449 CB_LISTOPEN(words);
3450 for(i = 0; i < size; i += 2){
3451 cc = est_char_category_perfng(utext[i] * 0x100 + utext[i+1]);
3452 for(j = i + 2; j < size; j += 2){
3453 if(est_char_category_perfng(utext[j] * 0x100 + utext[j+1]) != cc) break;
3454 }
3455 switch(cc){
3456 case ESTEASTALPH:
3457 for(k = i; k < j; k += 2){
3458 if(j - k >= 4){
3459 CB_LISTPUSH(words, (char *)(utext + k), 4);
3460 } else {
3461 CB_LISTPUSH(words, (char *)(utext + k), 2);
3462 }
3463 }
3464 break;
3465 default:
3466 break;
3467 }
3468 i = j - 2;
3469 }
3470 for(i = 0; i < CB_LISTNUM(words); i++){
3471 word = (unsigned char *)CB_LISTVAL2(words, i, wsiz);
3472 if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH && wsiz == 2 &&
3473 i < CB_LISTNUM(words) - 1){
3474 next = (unsigned char *)cblistval(words, i + 1, &nsiz);
3475 if(nsiz > 4) nsiz = 4;
3476 if(est_char_category_perfng(next[0] * 0x100 + next[1]) == ESTEASTALPH && nsiz > 2) nsiz = 2;
3477 CB_MALLOC(tmp, wsiz + nsiz + 1);
3478 memcpy(tmp, word, wsiz);
3479 memcpy(tmp + wsiz, next, nsiz);
3480 cblistover(words, i, tmp, wsiz + nsiz);
3481 free(tmp);
3482 }
3483 }
3484 for(i = 0; i < CB_LISTNUM(words); i++){
3485 word = (unsigned char *)CB_LISTVAL2(words, i, wsiz);
3486 if(!tail && wsiz == 2 && i == CB_LISTNUM(words) - 1){
3487 if(est_char_category_perfng(word[0] * 0x100 + word[1]) == ESTEASTALPH) continue;
3488 }
3489 tmp = est_uconv_out((char *)word, wsiz, &tsiz);
3490 CB_LISTPUSHBUF(list, tmp, tsiz);
3491 }
3492 CB_LISTCLOSE(words);
3493 free(utext);
3494 }
3495
3496
3497 /* Make a snippet of an arbitrary string. */
est_str_make_snippet(const char * str,const CBLIST * words,int wwidth,int hwidth,int awidth)3498 char *est_str_make_snippet(const char *str, const CBLIST *words,
3499 int wwidth, int hwidth, int awidth){
3500 assert(str && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
3501 return est_make_snippet(str, strlen(str), words, wwidth, hwidth, awidth);
3502 }
3503
3504
3505 /* Break a sentence of text and extract words, using character category analyzer.
3506 `text' specifies a sentence of text.
3507 `list' specifies a list object to which extract words are added.
3508 `norm' specifies whether to normalize the text. */
est_break_text_chrcat(const char * text,CBLIST * list,int norm)3509 void est_break_text_chrcat(const char *text, CBLIST *list, int norm){
3510 unsigned char *utext;
3511 char *tmp;
3512 int i, j, size, cc, tsiz;
3513 assert(text && list);
3514 utext = (unsigned char *)est_uconv_in(text, strlen(text), &size);
3515 if(norm) est_normalize_text(utext, size, &size);
3516 est_canonicalize_text(utext, size, FALSE);
3517 for(i = 0; i < size; i += 2){
3518 cc = est_char_category_chrcat(utext[i] * 0x100 + utext[i+1]);
3519 for(j = i + 2; j < size; j += 2){
3520 if(est_char_category_chrcat(utext[j] * 0x100 + utext[j+1]) != cc &&
3521 (cc != ESTWESTALPH || utext[j] != 0x00 || utext[j+1] != 0x2d) &&
3522 (cc != ESTHIRAGANA || utext[j] != 0x30 || utext[j+1] != 0xfc)) break;
3523 }
3524 if(cc != ESTSPACECHR){
3525 tmp = est_uconv_out((char *)(utext + i), j - i, &tsiz);
3526 CB_LISTPUSHBUF(list, tmp, tsiz);
3527 }
3528 i = j - 2;
3529 }
3530 free(utext);
3531 }
3532
3533
/* Convert the character encoding of a string.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region.  If it is negative, the size is assigned with
   `strlen(ptr)'.
   `icode' specifies the name of encoding of the input string.
   `ocode' specifies the name of encoding of the output string.
   `sp' specifies the pointer to a variable to which the size of the region of the return
   value is assigned.  If it is `NULL', it is not used.
   `mp' specifies the pointer to a variable to which the number of skipped (unconvertible)
   bytes is assigned.  If it is `NULL', it is not used.
   The return value is a newly allocated, zero terminated region holding the result, or `NULL'
   if the converter can not be opened or closed.  The caller should release it with `free'.
   The aliases "x-sjis", "x-ujis", "x-euc-jp" and "windows-31j" are accepted in any letter
   case and mapped to "Shift_JIS", "EUC-JP", "EUC-JP" and "CP932". */
char *est_iconv(const char *ptr, int size,
                const char *icode, const char *ocode, int *sp, int *mp){
  iconv_t ic;
  char *obuf, *wp, *rp;
  size_t isiz, osiz;
  int miss;
  assert(ptr && icode && ocode);
  if(size < 0) size = strlen(ptr);
  if((icode[0] == 'x' || icode[0] == 'X') && icode[1] == '-'){
    if(!cbstricmp(icode, "x-sjis")){
      icode = "Shift_JIS";
    } else if(!cbstricmp(icode, "x-ujis")){
      icode = "EUC-JP";
    } else if(!cbstricmp(icode, "x-euc-jp")){
      icode = "EUC-JP";
    }
  } else if(icode[0] == 'w' || icode[0] == 'W'){
    if(!cbstricmp(icode, "windows-31j")){
      icode = "CP932";
    }
  }
  if((ocode[0] == 'x' || ocode[0] == 'X') && ocode[1] == '-'){
    if(!cbstricmp(ocode, "x-sjis")){
      ocode = "Shift_JIS";
    } else if(!cbstricmp(ocode, "x-ujis")){
      ocode = "EUC-JP";
    } else if(!cbstricmp(ocode, "x-euc-jp")){
      ocode = "EUC-JP";
    }
  } else if(ocode[0] == 'w' || ocode[0] == 'W'){
    if(!cbstricmp(ocode, "windows-31j")){
      ocode = "CP932";
    }
  }
  if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return NULL;
  isiz = size;
  /* five output bytes per input byte, plus one byte for the terminator */
  osiz = isiz * 5;
  CB_MALLOC(obuf, osiz + 1);
  wp = obuf;
  rp = (char *)ptr;
  miss = 0;
  while(isiz > 0){
    if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == (size_t)-1){
      if(errno == EILSEQ && (*rp == 0x5c || *rp == 0x7e)){
        /* pass 0x5c and 0x7e through as they are; the byte written by hand must be taken off
           the room left, or iconv would later be told of space which does not exist */
        if(osiz < 1) break;
        *wp = *rp;
        wp++;
        osiz--;
        rp++;
        isiz--;
      } else if(errno == EILSEQ || errno == EINVAL){
        /* skip one byte which can not be converted and count it */
        rp++;
        isiz--;
        miss++;
      } else {
        break;
      }
    }
  }
  /* let a stateful output encoding return to its initial shift state; a failure here only
     means that there was no room left for the sequence */
  iconv(ic, NULL, NULL, &wp, &osiz);
  *wp = '\0';
  if(sp) *sp = wp - obuf;
  if(mp) *mp = miss;
  if(iconv_close(ic) == -1){
    free(obuf);
    return NULL;
  }
  return obuf;
}
3601
3602
3603 /* Detect the encoding of a string automatically. */
est_enc_name(const char * ptr,int size,int plang)3604 const char *est_enc_name(const char *ptr, int size, int plang){
3605 const char *hypo;
3606 int i, lim, miss, ascii, cr;
3607 assert(ptr);
3608 if(size < 0) size = strlen(ptr);
3609 if(size > ESTICCHECKSIZ) size = ESTICCHECKSIZ;
3610 if(size >= 2 && (!memcmp(ptr, "\xfe\xff", 2) || !memcmp(ptr, "\xff\xfe", 2))) return "UTF-16";
3611 ascii = TRUE;
3612 cr = FALSE;
3613 lim = size - 1;
3614 for(i = 0; i < lim; i += 2){
3615 if(ptr[i] == 0x0) return "UTF-16BE";
3616 if(ptr[i+1] == 0x0) return "UTF-16LE";
3617 if(ptr[i] < 0x0 || ptr[i] == 0x1b){
3618 ascii = FALSE;
3619 } else if(ptr[i] == 0xd){
3620 cr = TRUE;
3621 }
3622 }
3623 if(ascii) return "US-ASCII";
3624 switch(plang){
3625 case ESTLANGEN:
3626 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
3627 return "ISO-8859-1";
3628 case ESTLANGJA:
3629 lim = size - 3;
3630 for(i = 0; i < lim; i++){
3631 if(ptr[i] == 0x1b){
3632 i++;
3633 if(ptr[i] == '(' && strchr("BJHI", ptr[i+1])) return "ISO-2022-JP";
3634 if(ptr[i] == '$' && strchr("@B(", ptr[i+1])) return "ISO-2022-JP";
3635 }
3636 }
3637 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
3638 hypo = NULL;
3639 if(cr){
3640 if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
3641 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
3642 if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
3643 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
3644 } else {
3645 if((miss = est_enc_miss(ptr, size, "EUC-JP", "UTF-16BE")) < 1) return "EUC-JP";
3646 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "EUC-JP";
3647 if((miss = est_enc_miss(ptr, size, "Shift_JIS", "EUC-JP")) < 1) return "Shift_JIS";
3648 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "Shift_JIS";
3649 }
3650 if((miss = est_enc_miss(ptr, size, "UTF-8", "UTF-16BE")) < 1) return "UTF-8";
3651 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "UTF-8";
3652 if((miss = est_enc_miss(ptr, size, "CP932", "UTF-16BE")) < 1) return "CP932";
3653 if(!hypo && miss / (double)size <= ESTICALLWRAT) hypo = "CP932";
3654 return hypo ? hypo : "ISO-8859-1";
3655 case ESTLANGZH:
3656 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
3657 if(est_enc_miss(ptr, size, "EUC-CN", "UTF-16BE") < 1) return "EUC-CN";
3658 if(est_enc_miss(ptr, size, "BIG5", "UTF-16BE") < 1) return "BIG5";
3659 return "ISO-8859-1";
3660 case ESTLANGKO:
3661 if(est_enc_miss(ptr, size, "UTF-8", "UTF-16BE") < 1) return "UTF-8";
3662 if(est_enc_miss(ptr, size, "EUC-KR", "UTF-16BE") < 1) return "EUC-KR";
3663 return "ISO-8859-1";
3664 default:
3665 break;
3666 }
3667 return "ISO-8859-1";
3668 }
3669
3670
/* Convert a UTF-8 string into UTF-16BE.
   `ptr' specifies the pointer to a region.
   `size' specifies the size of the region; it must not be negative.
   `sp' specifies the pointer to a variable to which the size of the region of the return
   value is assigned.
   The return value is a newly allocated, zero terminated region holding the result; the
   caller should release it with `free'.
   Sequences of one to three bytes are decoded into one 16-bit unit each.  Sequences of four
   to six bytes can not be expressed in one unit and are replaced by `?'.  The conversion
   stops at a truncated sequence and at the bytes 0xfe and 0xff. */
char *est_uconv_in(const char *ptr, int size, int *sp){
  const unsigned char *rp, *ep;
  char *rbuf, *wp;
  assert(ptr && size >= 0 && sp);
  rp = (unsigned char *)ptr;
  ep = rp + size;
  /* no sequence yields more than two output bytes per input byte */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < ep){
    if(*rp < 0x80){
      /* one byte, 0x00 to 0x7f inclusive */
      *(wp++) = 0x00;
      *(wp++) = *rp;
      rp += 1;
    } else if(*rp < 0xe0){
      /* two bytes, lead byte up to 0xdf inclusive */
      if(ep - rp < 2) break;
      *(wp++) = (rp[0] & 0x1f) >> 2;
      *(wp++) = (rp[0] << 6) | (rp[1] & 0x3f);
      rp += 2;
    } else if(*rp < 0xf0){
      /* three bytes, lead byte 0xe0 to 0xef */
      if(ep - rp < 3) break;
      *(wp++) = (rp[0] << 4) | ((rp[1] & 0x3f) >> 2);
      *(wp++) = (rp[1] << 6) | (rp[2] & 0x3f);
      rp += 3;
    } else if(*rp < 0xf8){
      /* four bytes, lead byte 0xf0 to 0xf7 */
      if(ep - rp < 4) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 4;
    } else if(*rp < 0xfc){
      /* five bytes, lead byte 0xf8 to 0xfb */
      if(ep - rp < 5) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 5;
    } else if(*rp < 0xfe){
      /* six bytes, lead byte 0xfc or 0xfd */
      if(ep - rp < 6) break;
      *(wp++) = 0x00;
      *(wp++) = '?';
      rp += 6;
    } else {
      break;
    }
  }
  *wp = '\0';
  *sp = wp - rbuf;
  return rbuf;
}
3717
3718
/* Convert a UTF-16BE string into UTF-8.
   `ptr' specifies the pointer to the UTF-16BE byte sequence.
   `size' specifies the number of bytes of the sequence.  An odd trailing byte is ignored.
   `sp' specifies the pointer to a variable which receives the byte size of the result.  If it
   is `NULL', it is not used.
   The return value is a newly allocated, zero-terminated region owned by the caller.
   Each 16-bit unit is encoded on its own; surrogate pairs are not combined. */
char *est_uconv_out(const char *ptr, int size, int *sp){
  const unsigned char *rp, *ep;
  char *rbuf, *wp;
  int c;
  assert(ptr && size >= 0);
  if(size % 2 != 0) size--;
  rp = (const unsigned char *)ptr;
  ep = rp + size;
  /* two input bytes yield at most three output bytes, so twice the input always fits */
  CB_MALLOC(rbuf, size * 2 + 1);
  wp = rbuf;
  while(rp < ep){
    c = rp[0] * 0x100 + rp[1];
    if(c < 0x0080){
      /* U+0000 - U+007F: one byte */
      *(wp++) = rp[1];
    } else if(c < 0x0800){
      /* U+0080 - U+07FF: two bytes; anything from U+0800 needs the three byte form */
      *(wp++) = 0xc0 | (rp[0] << 2) | ((rp[1] >> 6) & 0x03);
      *(wp++) = 0x80 | (rp[1] & 0x3f);
    } else {
      /* U+0800 - U+FFFF: three bytes */
      *(wp++) = 0xe0 | ((rp[0] >> 4) & 0x0f);
      *(wp++) = 0x80 | ((rp[0] & 0x0f) << 2) | ((rp[1] >> 6) & 0x03);
      *(wp++) = 0x80 | (rp[1] & 0x3f);
    }
    rp += 2;
  }
  *wp = '\0';
  if(sp) *sp = wp - rbuf;
  return rbuf;
}
3747
3748
/* Compress a serial object with ZLIB.
   `ptr' specifies the pointer to the region to compress.
   `size' specifies the size of the region.  If it is negative, `strlen(ptr)' is used.
   `sp' specifies the pointer to a variable which receives the size of the result.
   `mode' selects the stream settings passed to `deflateInit2': -1 uses window bits -15 with
   level 5 and memory level 7, 1 uses window bits 15 + 16 with level 6 and memory level 9, and
   any other value uses window bits 15 with level 6 and memory level 8.
   The return value is a newly allocated region, or `NULL' on failure.  A zero byte is stored
   after the data; in mode -1 that byte is counted in `*sp'.
   When ZLIB support is disabled, the input is merely duplicated. */
char *est_deflate(const char *ptr, int size, int *sp, int mode){
#if ESTUSEZLIB
  z_stream strm;
  unsigned char chunk[ESTIOBUFSIZ];
  char *out;
  int level, wbits, memlv, status, cap, used, got;
  if(size < 0) size = strlen(ptr);
  strm.zalloc = Z_NULL;
  strm.zfree = Z_NULL;
  strm.opaque = Z_NULL;
  if(mode == -1){
    level = 5;
    wbits = -15;
    memlv = 7;
  } else if(mode == 1){
    level = 6;
    wbits = 15 + 16;
    memlv = 9;
  } else {
    level = 6;
    wbits = 15;
    memlv = 8;
  }
  if(deflateInit2(&strm, level, Z_DEFLATED, wbits, memlv, Z_DEFAULT_STRATEGY) != Z_OK)
    return NULL;
  cap = size + 16;
  if(cap < ESTIOBUFSIZ) cap = ESTIOBUFSIZ;
  CB_MALLOC(out, cap);
  used = 0;
  strm.next_in = (unsigned char *)ptr;
  strm.avail_in = size;
  strm.next_out = chunk;
  strm.avail_out = ESTIOBUFSIZ;
  /* drain the stream chunk by chunk until it reports something other than Z_OK */
  for(;;){
    status = deflate(&strm, Z_FINISH);
    if(status != Z_OK) break;
    got = ESTIOBUFSIZ - strm.avail_out;
    if(used + got > cap){
      cap = cap * 2 + got;
      CB_REALLOC(out, cap);
    }
    memcpy(out + used, chunk, got);
    used += got;
    strm.next_out = chunk;
    strm.avail_out = ESTIOBUFSIZ;
  }
  if(status != Z_STREAM_END){
    free(out);
    deflateEnd(&strm);
    return NULL;
  }
  /* collect the final chunk and leave room for the terminating zero byte */
  got = ESTIOBUFSIZ - strm.avail_out;
  if(used + got + 1 > cap){
    cap = cap * 2 + got;
    CB_REALLOC(out, cap);
  }
  memcpy(out + used, chunk, got);
  used += got;
  out[used] = '\0';
  if(mode == -1) used++;
  *sp = used;
  deflateEnd(&strm);
  return out;
#else
  if(size < 0) size = strlen(ptr);
  *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3816
3817
/* Decompress a serial object compressed with ZLIB.
   `ptr' specifies the pointer to the compressed region.
   `size' specifies the size of the region.
   `sp' specifies the pointer to a variable which receives the size of the result.  If it is
   `NULL', it is not used.
   `mode' selects the window bits passed to `inflateInit2': -15 for -1, 15 + 16 for 1, and 15
   for any other value.
   The return value is a newly allocated, zero-terminated region, or `NULL' on failure.
   When ZLIB support is disabled, the input is merely duplicated. */
char *est_inflate(const char *ptr, int size, int *sp, int mode){
#if ESTUSEZLIB
  z_stream strm;
  unsigned char chunk[ESTIOBUFSIZ];
  char *out;
  int wbits, status, cap, used, got;
  strm.zalloc = Z_NULL;
  strm.zfree = Z_NULL;
  strm.opaque = Z_NULL;
  if(mode == -1){
    wbits = -15;
  } else if(mode == 1){
    wbits = 15 + 16;
  } else {
    wbits = 15;
  }
  if(inflateInit2(&strm, wbits) != Z_OK) return NULL;
  cap = size * 2 + 16;
  if(cap < ESTIOBUFSIZ) cap = ESTIOBUFSIZ;
  CB_MALLOC(out, cap);
  used = 0;
  strm.next_in = (unsigned char *)ptr;
  strm.avail_in = size;
  strm.next_out = chunk;
  strm.avail_out = ESTIOBUFSIZ;
  /* drain the stream chunk by chunk until it reports something other than Z_OK */
  for(;;){
    status = inflate(&strm, Z_NO_FLUSH);
    if(status != Z_OK) break;
    got = ESTIOBUFSIZ - strm.avail_out;
    if(used + got >= cap){
      cap = cap * 2 + got;
      CB_REALLOC(out, cap);
    }
    memcpy(out + used, chunk, got);
    used += got;
    strm.next_out = chunk;
    strm.avail_out = ESTIOBUFSIZ;
  }
  if(status != Z_STREAM_END){
    free(out);
    inflateEnd(&strm);
    return NULL;
  }
  /* collect the final chunk; `>=' keeps one byte free for the terminator */
  got = ESTIOBUFSIZ - strm.avail_out;
  if(used + got >= cap){
    cap = cap * 2 + got;
    CB_REALLOC(out, cap);
  }
  memcpy(out + used, chunk, got);
  used += got;
  out[used] = '\0';
  if(sp) *sp = used;
  inflateEnd(&strm);
  return out;
#else
  if(sp) *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3879
3880
/* Compress a serial object with LZO.
   `ptr' specifies the pointer to the region to compress.
   `size' specifies the size of the region.  If it is negative, `strlen(ptr)' is used.
   `sp' specifies the pointer to a variable which receives the size of the result.
   The return value is a newly allocated, zero-terminated region, or `NULL' on failure.
   When LZO support is disabled, the input is merely duplicated. */
char *est_lzoencode(const char *ptr, int size, int *sp){
#if ESTUSELZO
  char work[LZO1X_1_MEM_COMPRESS];
  lzo_bytep out;
  lzo_uint osiz;
  int status;
  if(size < 0) size = strlen(ptr);
  /* output buffer sized with slack beyond the input size */
  CB_MALLOC(out, size + size / 16 + 80);
  status = lzo1x_1_compress((lzo_bytep)ptr, size, out, &osiz, work);
  if(status != LZO_E_OK){
    free(out);
    return NULL;
  }
  out[osiz] = '\0';
  *sp = osiz;
  return (char *)out;
#else
  if(size < 0) size = strlen(ptr);
  *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3902
3903
/* Decompress a serial object compressed with LZO.
   `ptr' specifies the pointer to the compressed region.
   `size' specifies the size of the region.
   `sp' specifies the pointer to a variable which receives the size of the result.  If it is
   `NULL', it is not used.
   The return value is a newly allocated, zero-terminated region, or `NULL' on failure.
   When LZO support is disabled, the input is merely duplicated. */
char *est_lzodecode(const char *ptr, int size, int *sp){
#if ESTUSELZO
  lzo_bytep out;
  lzo_uint osiz;
  int ratio, status;
  /* the decompressed size is unknown, so retry with a doubled ratio on output overrun */
  for(ratio = 4; ; ratio *= 2){
    osiz = (size + 256) * ratio + 3;
    CB_MALLOC(out, osiz + 1);
    status = lzo1x_decompress_safe((lzo_bytep)(ptr), size, out, &osiz, NULL);
    if(status == LZO_E_OK) break;
    free(out);
    if(status != LZO_E_OUTPUT_OVERRUN) return NULL;
  }
  out[osiz] = '\0';
  if(sp) *sp = osiz;
  return (char *)out;
#else
  if(sp) *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3933
3934
/* Compress a serial object with BZIP2.
   `ptr' specifies the pointer to the region to compress.
   `size' specifies the size of the region.  If it is negative, `strlen(ptr)' is used.
   `sp' specifies the pointer to a variable which receives the size of the result.
   The return value is a newly allocated, zero-terminated region, or `NULL' on failure.
   When BZIP2 support is disabled, the input is merely duplicated. */
char *est_bzencode(const char *ptr, int size, int *sp){
#if ESTUSEBZIP
  bz_stream strm;
  char chunk[ESTIOBUFSIZ];
  char *out;
  int status, cap, used, got;
  if(size < 0) size = strlen(ptr);
  strm.bzalloc = NULL;
  strm.bzfree = NULL;
  strm.opaque = NULL;
  if(BZ2_bzCompressInit(&strm, 9, 0, 30) != BZ_OK) return NULL;
  cap = size + 16;
  if(cap < ESTIOBUFSIZ) cap = ESTIOBUFSIZ;
  CB_MALLOC(out, cap);
  used = 0;
  strm.next_in = (char *)ptr;
  strm.avail_in = size;
  strm.next_out = chunk;
  strm.avail_out = ESTIOBUFSIZ;
  /* drain the stream chunk by chunk while the compressor asks to be called again */
  for(;;){
    status = BZ2_bzCompress(&strm, BZ_FINISH);
    if(status != BZ_FINISH_OK) break;
    got = ESTIOBUFSIZ - strm.avail_out;
    if(used + got > cap){
      cap = cap * 2 + got;
      CB_REALLOC(out, cap);
    }
    memcpy(out + used, chunk, got);
    used += got;
    strm.next_out = chunk;
    strm.avail_out = ESTIOBUFSIZ;
  }
  if(status != BZ_STREAM_END){
    free(out);
    BZ2_bzCompressEnd(&strm);
    return NULL;
  }
  /* collect the final chunk and leave room for the terminating zero byte */
  got = ESTIOBUFSIZ - strm.avail_out;
  if(used + got + 1 > cap){
    cap = cap * 2 + got;
    CB_REALLOC(out, cap);
  }
  memcpy(out + used, chunk, got);
  used += got;
  out[used] = '\0';
  *sp = used;
  BZ2_bzCompressEnd(&strm);
  return out;
#else
  if(size < 0) size = strlen(ptr);
  *sp = size;
  return cbmemdup(ptr, size);
#endif
}
3987
3988
/* Decompress a serial object compressed with BZIP2.
   `ptr' specifies the pointer to the compressed region.
   `size' specifies the size of the region.
   `sp' specifies the pointer to a variable which receives the size of the result.  If it is
   `NULL', it is not used.
   The return value is a newly allocated, zero-terminated region, or `NULL' on failure.
   When BZIP2 support is disabled, the input is merely duplicated. */
char *est_bzdecode(const char *ptr, int size, int *sp){
#if ESTUSEBZIP
  bz_stream strm;
  char chunk[ESTIOBUFSIZ];
  char *out;
  int status, cap, used, got;
  strm.bzalloc = NULL;
  strm.bzfree = NULL;
  strm.opaque = NULL;
  if(BZ2_bzDecompressInit(&strm, 0, 0) != BZ_OK) return NULL;
  cap = size * 2 + 16;
  if(cap < ESTIOBUFSIZ) cap = ESTIOBUFSIZ;
  CB_MALLOC(out, cap);
  used = 0;
  strm.next_in = (char *)ptr;
  strm.avail_in = size;
  strm.next_out = chunk;
  strm.avail_out = ESTIOBUFSIZ;
  /* drain the stream chunk by chunk until it reports something other than BZ_OK */
  for(;;){
    status = BZ2_bzDecompress(&strm);
    if(status != BZ_OK) break;
    got = ESTIOBUFSIZ - strm.avail_out;
    if(used + got >= cap){
      cap = cap * 2 + got;
      CB_REALLOC(out, cap);
    }
    memcpy(out + used, chunk, got);
    used += got;
    strm.next_out = chunk;
    strm.avail_out = ESTIOBUFSIZ;
  }
  if(status != BZ_STREAM_END){
    free(out);
    BZ2_bzDecompressEnd(&strm);
    return NULL;
  }
  /* collect the final chunk; `>=' keeps one byte free for the terminator */
  got = ESTIOBUFSIZ - strm.avail_out;
  if(used + got >= cap){
    cap = cap * 2 + got;
    CB_REALLOC(out, cap);
  }
  memcpy(out + used, chunk, got);
  used += got;
  out[used] = '\0';
  if(sp) *sp = used;
  BZ2_bzDecompressEnd(&strm);
  return out;
#else
  if(sp) *sp = size;
  return cbmemdup(ptr, size);
#endif
}
4039
4040
4041 /* Get the border string for draft data of documents. */
est_border_str(void)4042 const char *est_border_str(void){
4043 static int first = TRUE;
4044 static char border[ESTPATHBUFSIZ];
4045 int t, p;
4046 if(first){
4047 t = (int)(time(NULL) + est_random() * INT_MAX);
4048 p = (int)(getpid() + est_random() * INT_MAX);
4049 sprintf(border, "--------[%08X%08X]--------",
4050 dpouterhash((char *)&t, sizeof(int)), dpouterhash((char *)&p, sizeof(int)));
4051 first = FALSE;
4052 }
4053 return border;
4054 }
4055
4056
4057 /* Get the real random number. */
est_random(void)4058 double est_random(void){
4059 static int first = TRUE;
4060 int num;
4061 if(first && !est_random_ifp){
4062 if((est_random_ifp = fopen("/dev/urandom", "rb")) != NULL){
4063 atexit(est_random_fclose);
4064 } else {
4065 srand(getpid());
4066 }
4067 first = FALSE;
4068 }
4069 if(est_random_ifp){
4070 fread(&num, sizeof(int), 1, est_random_ifp);
4071 return (num & INT_MAX) / (double)INT_MAX;
4072 }
4073 return rand() / (double)RAND_MAX;
4074 }
4075
4076
/* Get the random number in normal distribution.
   Two values of `est_random' are combined in the Box-Muller form
   sqrt(-2 * log(1 - u)) * cos(2 * pi * v); the result is shifted by 6.0 and divided by 12.0
   and then clamped.
   The return value is a number between 0.0 and 1.0 inclusive. */
double est_random_nd(void){
  double val;
  val = (sqrt(-2 * log(1.0 - est_random())) * cos(3.1415926535 * 2 * est_random()) + 6.0) / 12.0;
  if(val > 1.0) return 1.0;
  if(val < 0.0) return 0.0;
  return val;
}
4085
4086
4087 /* Get an MD5 hash string of a key string. */
est_make_crypt(const char * key)4088 char *est_make_crypt(const char *key){
4089 md5_state_t ms;
4090 char digest[32], str[64], *wp;
4091 int i;
4092 assert(key);
4093 md5_init(&ms);
4094 md5_append(&ms, (md5_byte_t *)key, strlen(key));
4095 md5_finish(&ms, (md5_byte_t *)digest);
4096 wp = str;
4097 for(i = 0; i < 16; i++){
4098 wp += sprintf(wp, "%02x", ((unsigned char *)digest)[i]);
4099 }
4100 return cbmemdup(str, -1);
4101 }
4102
4103
/* Check whether a key matches an MD5 hash string.
   `key' specifies the plain key string.
   `hash' specifies the hash string to compare with, in the form made by `est_make_crypt'.
   The return value is true if the hash of the key equals `hash' exactly, else it is false. */
int est_match_crypt(const char *key, const char *hash){
  char *expect;
  int same;
  assert(key && hash);
  expect = est_make_crypt(key);
  same = (strcmp(expect, hash) == 0);
  free(expect);
  return same;
}
4114
4115
/* Create a regular expression object.
   `str' specifies the pattern, in POSIX extended syntax.  If it begins with "*I:", that
   prefix is removed and the rest is matched case-insensitively.
   The return value is a newly allocated copy of the compiled `regex_t', or `NULL' if the
   pattern cannot be compiled.  It should be released with `est_regex_delete'. */
void *est_regex_new(const char *str){
  regex_t compiled;
  int flags;
  assert(str);
  flags = REG_EXTENDED | REG_NOSUB;
  if(strncmp(str, "*I:", 3) == 0){
    flags |= REG_ICASE;
    str += 3;
  }
  if(regcomp(&compiled, str, flags) != 0) return NULL;
  return cbmemdup((char *)&compiled, sizeof(regex_t));
}
4129
4130
/* Delete a regular expression object.
   `regex' specifies the pointer to a heap-allocated `regex_t'.  Its compiled state is
   released with `regfree' and the region itself with `free'. */
void est_regex_delete(void *regex){
  regex_t *compiled;
  assert(regex);
  compiled = regex;
  regfree(compiled);
  free(compiled);
}
4137
4138
/* Check whether a regular expression matches a string.
   `regex' specifies the pointer to a compiled `regex_t'.
   `str' specifies the zero-terminated target string.
   The return value is 1 if `regexec' reports a match, else it is 0. */
int est_regex_match(const void *regex, const char *str){
  const regex_t *compiled;
  assert(regex && str);
  compiled = regex;
  if(regexec(compiled, str, 0, NULL, 0) != 0) return 0;
  return 1;
}
4144
4145
4146 /* Check whether a regular expression matches a string. */
est_regex_match_str(const char * rstr,const char * tstr)4147 int est_regex_match_str(const char *rstr, const char *tstr){
4148 void *regex;
4149 int rv;
4150 assert(rstr && tstr);
4151 if(!(regex = est_regex_new(rstr))) return FALSE;
4152 rv = est_regex_match(regex, tstr);
4153 est_regex_delete(regex);
4154 return rv;
4155 }
4156
4157
4158 /* Replace each substring matching a regular expression string. */
est_regex_replace(const char * str,const char * bef,const char * aft)4159 char *est_regex_replace(const char *str, const char *bef, const char *aft){
4160 regex_t regex;
4161 regmatch_t subs[256];
4162 CBDATUM *datum;
4163 const char *sp, *rp;
4164 int options, first, num;
4165 assert(str && bef && aft);
4166 options = REG_EXTENDED;
4167 if(bef[0] == '*' && bef[1] == 'I' && bef[2] == ':'){
4168 options |= REG_ICASE;
4169 bef += 3;
4170 }
4171 if(bef[0] == '\0' || regcomp(®ex, bef, options) != 0) return cbmemdup(str, -1);
4172 if(regexec(®ex, str, ESTREGSUBMAX, subs, 0) != 0){
4173 regfree(®ex);
4174 return cbmemdup(str, -1);
4175 }
4176 sp = str;
4177 CB_DATUMOPEN(datum);
4178 first = TRUE;
4179 while(sp[0] != '\0' && regexec(®ex, sp, 10, subs, first ? 0 : REG_NOTBOL) == 0){
4180 first = FALSE;
4181 if(subs[0].rm_so == -1) break;
4182 CB_DATUMCAT(datum, sp, subs[0].rm_so);
4183 for(rp = aft; *rp != '\0'; rp++){
4184 if(*rp == '\\'){
4185 if(rp[1] >= '0' && rp[1] <= '9'){
4186 num = rp[1] - '0';
4187 if(subs[num].rm_so != -1 && subs[num].rm_eo != -1)
4188 CB_DATUMCAT(datum, sp + subs[num].rm_so, subs[num].rm_eo - subs[num].rm_so);
4189 ++rp;
4190 } else if(rp[1] != '\0'){
4191 CB_DATUMCAT(datum, ++rp, 1);
4192 }
4193 } else if(*rp == '&'){
4194 CB_DATUMCAT(datum, sp + subs[0].rm_so, subs[0].rm_eo - subs[0].rm_so);
4195 } else {
4196 CB_DATUMCAT(datum, rp, 1);
4197 }
4198 }
4199 sp += subs[0].rm_eo;
4200 if(subs[0].rm_eo < 1) break;
4201 }
4202 CB_DATUMCAT(datum, sp, strlen(sp));
4203 regfree(®ex);
4204 return cbdatumtomalloc(datum, NULL);
4205 }
4206
4207
4208 /* Duplicate a document object. */
est_doc_dup(ESTDOC * doc)4209 ESTDOC *est_doc_dup(ESTDOC *doc){
4210 ESTDOC *ndoc;
4211 assert(doc);
4212 CB_MALLOC(ndoc, sizeof(ESTDOC));
4213 ndoc->id = doc->id;
4214 ndoc->attrs = doc->attrs ? cbmapdup(doc->attrs) : NULL;
4215 ndoc->dtexts = doc->dtexts ? cblistdup(doc->dtexts) : NULL;
4216 ndoc->kwords = doc->kwords ? cbmapdup(doc->kwords) : NULL;
4217 return ndoc;
4218 }
4219
4220
4221 /* Set the ID number of a document object. */
est_doc_set_id(ESTDOC * doc,int id)4222 void est_doc_set_id(ESTDOC *doc, int id){
4223 assert(doc);
4224 doc->id = id;
4225 }
4226
4227
4228 /* Get the hidden texts of a document object. */
est_doc_hidden_texts(ESTDOC * doc)4229 const char *est_doc_hidden_texts(ESTDOC *doc){
4230 const char *rv;
4231 assert(doc);
4232 rv = doc->attrs ? cbmapget(doc->attrs, "", 0, NULL) : NULL;
4233 return rv ? rv : "";
4234 }
4235
4236
4237 /* Reduce the texts to fit to the specified size. */
est_doc_slim(ESTDOC * doc,int len)4238 void est_doc_slim(ESTDOC *doc, int len){
4239 const char *vbuf;
4240 unsigned char *tbuf;
4241 int i, vsiz, tsiz;
4242 assert(doc && len >= 0);
4243 if(!doc->dtexts) return;
4244 if(doc->attrs && cbmapget(doc->attrs, "", 0, &vsiz)) len -= vsiz;
4245 for(i = 0; i < CB_LISTNUM(doc->dtexts); i++){
4246 vbuf = CB_LISTVAL2(doc->dtexts, i, vsiz);
4247 len -= vsiz;
4248 if(len < 0){
4249 tbuf = (unsigned char *)cbmemdup(vbuf, vsiz);
4250 tsiz = vsiz > -len ? vsiz + len : 0;
4251 if(tsiz > 0){
4252 while(tsiz < vsiz){
4253 if(tbuf[tsiz] <= ' ' || (tbuf[tsiz] & 0xf0) == 0xe0) break;
4254 tsiz++;
4255 }
4256 }
4257 while(CB_LISTNUM(doc->dtexts) > i){
4258 CB_LISTDROP(doc->dtexts);
4259 }
4260 CB_LISTPUSHBUF(doc->dtexts, (char *)tbuf, tsiz);
4261 break;
4262 }
4263 }
4264 }
4265
4266
4267 /* Check whether a docuemnt object is empty. */
est_doc_is_empty(ESTDOC * doc)4268 int est_doc_is_empty(ESTDOC *doc){
4269 assert(doc);
4270 if((!doc->dtexts || CB_LISTNUM(doc->dtexts) < 1) &&
4271 (!doc->attrs || !cbmapget(doc->attrs, "", 0, NULL))) return TRUE;
4272 return FALSE;
4273 }
4274
4275
4276 /* Duplicate a condition object. */
est_cond_dup(ESTCOND * cond)4277 ESTCOND *est_cond_dup(ESTCOND *cond){
4278 ESTCOND *ncond;
4279 assert(cond);
4280 CB_MALLOC(ncond, sizeof(ESTCOND));
4281 ncond->phrase = cond->phrase ? cbmemdup(cond->phrase, -1) : NULL;
4282 ncond->gstep = cond->gstep;
4283 ncond->tfidf = cond->tfidf;
4284 ncond->pmode = cond->pmode;
4285 ncond->cbxpn = cond->cbxpn;
4286 ncond->attrs = cond->attrs ? cblistdup(cond->attrs) : NULL;
4287 ncond->order = cond->order ? cbmemdup(cond->order, -1) : NULL;
4288 ncond->max = cond->max;
4289 ncond->skip = cond->skip;
4290 ncond->auxmin = cond->auxmin;
4291 ncond->auxwords = cond->auxwords ? cbmapdup(cond->auxwords) : NULL;
4292 ncond->scfb = cond->scfb;
4293 ncond->scores = cond->scores ?
4294 (int *)cbmemdup((char *)cond->scores, cond->snum * sizeof(int)) : NULL;
4295 ncond->snum = cond->snum;
4296 ncond->nscores = cond->nscores;
4297 ncond->nsnum = cond->nsnum;
4298 ncond->opts = cond->opts;
4299 ncond->ecllim = cond->ecllim;
4300 ncond->shadows = cond->shadows ? cbmapdup(cond->shadows) : NULL;
4301 ncond->distinct = cond->distinct ? cbmemdup(cond->distinct, -1) : NULL;
4302 ncond->mask = cond->mask;
4303 return ncond;
4304 }
4305
4306
4307 /* Get the phrase of a condition object. */
est_cond_phrase(ESTCOND * cond)4308 const char *est_cond_phrase(ESTCOND *cond){
4309 assert(cond);
4310 return cond->phrase;
4311 }
4312
4313
4314 /* Get a list object of attribute expressions of a condition object. */
est_cond_attrs(ESTCOND * cond)4315 const CBLIST *est_cond_attrs(ESTCOND *cond){
4316 assert(cond);
4317 return cond->attrs;
4318 }
4319
4320
4321 /* Get the order expression of a condition object. */
est_cond_order(ESTCOND * cond)4322 const char *est_cond_order(ESTCOND *cond){
4323 assert(cond);
4324 return cond->order;
4325 }
4326
4327
4328 /* Get the maximum number of retrieval of a condition object. */
est_cond_max(ESTCOND * cond)4329 int est_cond_max(ESTCOND *cond){
4330 assert(cond);
4331 return cond->max;
4332 }
4333
4334
4335 /* Get the number of skipped documents of a condition object. */
est_cond_skip(ESTCOND * cond)4336 int est_cond_skip(ESTCOND *cond){
4337 assert(cond);
4338 return cond->skip;
4339 }
4340
4341
4342 /* Get the options of a condition object. */
est_cond_options(ESTCOND * cond)4343 int est_cond_options(ESTCOND *cond){
4344 assert(cond);
4345 return cond->opts;
4346 }
4347
4348
4349 /* Get permission to adopt result of the auxiliary index. */
est_cond_auxiliary(ESTCOND * cond)4350 int est_cond_auxiliary(ESTCOND *cond){
4351 assert(cond);
4352 return cond->auxmin;
4353 }
4354
4355
4356 /* Get the attribute distinction filter. */
est_cond_distinct(ESTCOND * cond)4357 const char *est_cond_distinct(ESTCOND *cond){
4358 assert(cond);
4359 return cond->distinct;
4360 }
4361
4362
4363 /* Get the mask of targets of meta search. */
est_cond_mask(ESTCOND * cond)4364 int est_cond_mask(ESTCOND *cond){
4365 assert(cond);
4366 return cond->mask;
4367 }
4368
4369
4370 /* Get the score of a document corresponding to a condition object. */
est_cond_score(ESTCOND * cond,int index)4371 int est_cond_score(ESTCOND *cond, int index){
4372 assert(cond);
4373 if(!cond->scores || index < 0 || index >= cond->snum) return -1;
4374 return cond->scores[index];
4375 }
4376
4377
4378 /* Get the score array of corresponding documents of a condition object. */
est_cond_scores(ESTCOND * cond,int * nump)4379 const int *est_cond_scores(ESTCOND *cond, int *nump){
4380 assert(cond && nump);
4381 *nump = cond->snum;
4382 return cond->scores;
4383 }
4384
4385
4386 /* Set the narrowing scores of a condition object. */
est_cond_set_narrowing_scores(ESTCOND * cond,const int * scores,int num)4387 void est_cond_set_narrowing_scores(ESTCOND *cond, const int *scores, int num){
4388 assert(cond && scores && num >= 0);
4389 cond->nscores = scores;
4390 cond->nsnum = num;
4391 }
4392
4393
4394 /* Check whether a condition object has used the auxiliary index. */
est_cond_auxiliary_word(ESTCOND * cond,const char * word)4395 int est_cond_auxiliary_word(ESTCOND *cond, const char *word){
4396 assert(cond && word);
4397 if(!cond->auxwords) return FALSE;
4398 if(word[0] != '\0') return cbmapget(cond->auxwords, word, -1, NULL) != NULL;
4399 return cbmaprnum(cond->auxwords) > 0;
4400 }
4401
4402
4403 /* Get an array of ID numbers of eclipsed docuemnts of a document in a condition object. */
est_cond_shadows(ESTCOND * cond,int id,int * np)4404 const int *est_cond_shadows(ESTCOND *cond, int id, int *np){
4405 const char *vbuf;
4406 int vsiz;
4407 assert(cond && id > 0 && np);
4408 if(!cond->shadows || !(vbuf = cbmapget(cond->shadows, (char *)&id, sizeof(int), &vsiz))){
4409 *np = 0;
4410 return (int *)"";
4411 }
4412 *np = vsiz / sizeof(int);
4413 return (int *)vbuf;
4414 }
4415
4416
4417 /* Set the callback function for query expansion. */
est_cond_set_expander(ESTCOND * cond,void (* func)(const char *,CBLIST *))4418 void est_cond_set_expander(ESTCOND *cond, void (*func)(const char *, CBLIST *)){
4419 assert(cond && func);
4420 cond->cbxpn = func;
4421 }
4422
4423
4424 /* Set the error code of a database. */
est_db_set_ecode(ESTDB * db,int ecode)4425 void est_db_set_ecode(ESTDB *db, int ecode){
4426 assert(db);
4427 est_set_ecode(&(db->ecode), ecode, __LINE__);
4428 }
4429
4430
4431 /* Check whether an option is set. */
est_db_check_option(ESTDB * db,int option)4432 int est_db_check_option(ESTDB *db, int option){
4433 assert(db);
4434 switch(option){
4435 case ESTDBREADER:
4436 return !dpwritable(db->metadb);
4437 case ESTDBWRITER:
4438 return dpwritable(db->metadb);
4439 case ESTDBCREAT:
4440 return -1;
4441 case ESTDBTRUNC:
4442 return -1;
4443 case ESTDBNOLCK:
4444 return -1;
4445 case ESTDBLCKNB:
4446 return -1;
4447 case ESTDBPERFNG:
4448 return db->amode == ESTDFPERFNG;
4449 case ESTDBCHRCAT:
4450 return db->amode == ESTDFCHRCAT;
4451 case ESTDBSMALL:
4452 return -1;
4453 case ESTDBLARGE:
4454 return -1;
4455 case ESTDBHUGE:
4456 return -1;
4457 case ESTDBHUGE2:
4458 return -1;
4459 case ESTDBHUGE3:
4460 return -1;
4461 case ESTDBSCVOID:
4462 return db->smode == ESTDFSCVOID;
4463 case ESTDBSCINT:
4464 return db->smode == ESTDFSCINT;
4465 case ESTDBSCASIS:
4466 return db->smode == ESTDFSCASIS;
4467 default:
4468 break;
4469 }
4470 return -1;
4471 }
4472
4473
4474 /* Get the inode number of a database. */
est_db_inode(ESTDB * db)4475 int est_db_inode(ESTDB *db){
4476 assert(db);
4477 return db->inode;
4478 }
4479
4480
4481 /* Set the entity data of a document in a database. */
est_db_set_doc_entity(ESTDB * db,int id,const char * ptr,int size)4482 int est_db_set_doc_entity(ESTDB *db, int id, const char *ptr, int size){
4483 int err;
4484 assert(db && id > 0);
4485 if(!dpwritable(db->metadb)){
4486 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
4487 return FALSE;
4488 }
4489 err = FALSE;
4490 if(ptr){
4491 if(!crputlob(db->textdb, (char *)&id, sizeof(int), ptr, size, CR_DOVER)){
4492 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4493 err = TRUE;
4494 }
4495 } else {
4496 if(!croutlob(db->textdb, (char *)&id, sizeof(int))){
4497 est_set_ecode(&(db->ecode), dpecode == DP_ENOITEM ? ESTENOITEM : ESTEDB, __LINE__);
4498 err = TRUE;
4499 }
4500 }
4501 return err ? FALSE : TRUE;
4502 }
4503
4504
4505 /* Set the maximum number of expansion of wild cards. */
est_db_set_wildmax(ESTDB * db,int num)4506 void est_db_set_wildmax(ESTDB *db, int num){
4507 assert(db && num >= 0);
4508 db->wildmax = num;
4509 }
4510
4511
4512 /* Get the entity data of a document in a database. */
est_db_get_doc_entity(ESTDB * db,int id,int * sp)4513 char *est_db_get_doc_entity(ESTDB *db, int id, int *sp){
4514 char *ptr;
4515 assert(db && id > 0 && sp);
4516 if(!(ptr = crgetlob(db->textdb, (char *)&id, sizeof(int), 0, -1, sp))){
4517 est_set_ecode(&(db->ecode), dpecode == DP_ENOITEM ? ESTENOITEM : ESTEDB, __LINE__);
4518 return NULL;
4519 }
4520 return ptr;
4521 }
4522
4523
4524 /* Add a piece of meta data to a database. */
est_db_add_meta(ESTDB * db,const char * name,const char * value)4525 void est_db_add_meta(ESTDB *db, const char *name, const char *value){
4526 assert(db && name);
4527 if(!dpwritable(db->metadb)){
4528 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
4529 return;
4530 }
4531 if(!db->metacc) est_db_prepare_meta(db);
4532 if(value){
4533 cbmapput(db->metacc, name, -1, value, -1, TRUE);
4534 } else {
4535 cbmapout(db->metacc, name, -1);
4536 }
4537 }
4538
4539
4540 /* Get a list of names of meta data of a database. */
est_db_meta_names(ESTDB * db)4541 CBLIST *est_db_meta_names(ESTDB *db){
4542 assert(db);
4543 if(!db->metacc) est_db_prepare_meta(db);
4544 return cbmapkeys(db->metacc);
4545 }
4546
4547
4548 /* Get the value of a piece of meta data of a database. */
est_db_meta(ESTDB * db,const char * name)4549 char *est_db_meta(ESTDB *db, const char *name){
4550 const char *vbuf;
4551 int vsiz;
4552 assert(db && name);
4553 if(!db->metacc) est_db_prepare_meta(db);
4554 if(!(vbuf = cbmapget(db->metacc, name, -1, &vsiz))) return NULL;
4555 return cbmemdup(vbuf, vsiz);
4556 }
4557
4558
4559 /* Extract keywords of a document object. */
est_db_etch_doc(ESTDB * db,ESTDOC * doc,int max)4560 CBMAP *est_db_etch_doc(ESTDB *db, ESTDOC *doc, int max){
4561 ESTKEYSC *scores;
4562 CBMAP *keys, *umap;
4563 CBLIST *words;
4564 const char *text, *word, *vbuf;
4565 const unsigned char *uword;
4566 char numbuf[ESTNUMBUFSIZ];
4567 int i, wsiz, num, smax, snum, vsiz;
4568 assert(doc && max >= 0);
4569 if(!doc->dtexts) return cbmapopenex(1);
4570 keys = cbmapopenex(max * 2 + 1);
4571 CB_LISTOPEN(words);
4572 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
4573 if(i < 0){
4574 if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
4575 } else {
4576 text = CB_LISTVAL(doc->dtexts, i);
4577 }
4578 if(db){
4579 switch(db->amode){
4580 case ESTDFPERFNG:
4581 est_break_text_perfng(text, words, FALSE, FALSE);
4582 break;
4583 case ESTDFCHRCAT:
4584 est_break_text_chrcat(text, words, FALSE);
4585 break;
4586 default:
4587 est_break_text(text, words, FALSE, FALSE);
4588 break;
4589 }
4590 } else {
4591 est_break_text(text, words, FALSE, FALSE);
4592 }
4593 }
4594 umap = cbmapopenex(CB_LISTNUM(words) + 1);
4595 for(i = 0; i < CB_LISTNUM(words); i++){
4596 word = CB_LISTVAL2(words, i, wsiz);
4597 if(wsiz > ESTWORDMAXLEN) continue;
4598 num = (vbuf = cbmapget(umap, word, wsiz, NULL)) ? *(int *)vbuf + 1 : 1;
4599 cbmapput(umap, word, wsiz, (char *)&num, sizeof(int), TRUE);
4600 }
4601 CB_MALLOC(scores, cbmaprnum(umap) * sizeof(ESTKEYSC) + 1);
4602 snum = 0;
4603 cbmapiterinit(umap);
4604 while((uword = (unsigned char *)cbmapiternext(umap, &wsiz)) != NULL){
4605 scores[snum].word = (char *)uword;
4606 scores[snum].wsiz = wsiz;
4607 scores[snum].pt = (vbuf = cbmapiterval((char *)uword, NULL)) ? *(int *)vbuf : 0;
4608 if(uword[0] >= 0xe3){
4609 if(wsiz <= 3){
4610 scores[snum].pt /= 2;
4611 if((uword[0] == 0xe3 && (uword[1] == 0x80 || uword[1] == 0x81 ||
4612 (uword[1] == 0x82 && uword[2] <= 0x9f))) ||
4613 (uword[0] == 0xef && uword[1] >= 0xbc)) scores[snum].pt /= 2;
4614 } else {
4615 if((uword[0] == 0xe3 && (uword[1] == 0x80 || uword[1] == 0x81 ||
4616 (uword[1] == 0x82 && uword[2] <= 0x9f))) ||
4617 (uword[0] == 0xef && uword[1] >= 0xbc)) scores[snum].pt /= 2;
4618 if((uword[3] == 0xe3 && (uword[4] == 0x80 || uword[4] == 0x81 ||
4619 (uword[4] == 0x82 && uword[5] <= 0x9f))) ||
4620 (uword[3] == 0xef && uword[4] >= 0xbc)) scores[snum].pt /= 2;
4621 }
4622 } else if((uword[0] > '\0' && uword[0] <= '/') || (uword[0] >= ':' && uword[0] <= '@') ||
4623 (uword[0] >= '[' && uword[0] <= '`') || (uword[0] >= '{' && uword[0] <= '~')){
4624 scores[snum].pt /= 25;
4625 if(wsiz <= 1) scores[snum].pt /= 2;
4626 } else {
4627 switch(wsiz){
4628 case 1:
4629 scores[snum].pt /= 9;
4630 break;
4631 case 2:
4632 scores[snum].pt /= 5;
4633 break;
4634 case 3:
4635 scores[snum].pt /= 3;
4636 break;
4637 case 4:
4638 scores[snum].pt /= 2;
4639 break;
4640 case 5:
4641 scores[snum].pt /= 1.5;
4642 break;
4643 case 6:
4644 scores[snum].pt /= 1.25;
4645 break;
4646 }
4647 }
4648 snum++;
4649 }
4650 qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
4651 smax = max * (db ? ESTKEYSCALW : 1) + 1;
4652 snum = snum > smax ? smax : snum;
4653 if(db){
4654 for(i = 0; i < snum; i++){
4655 if((vbuf = cbmapget(db->keycc, scores[i].word, scores[i].wsiz, NULL)) != NULL){
4656 cbmapmove(db->keycc, scores[i].word, scores[i].wsiz, FALSE);
4657 vsiz = *(int *)vbuf;
4658 } else {
4659 if(db->dfdb){
4660 if((vsiz = dpgetwb(db->dfdb, scores[i].word, scores[i].wsiz,
4661 0, ESTNUMBUFSIZ - 1, numbuf)) > 0){
4662 numbuf[vsiz] = '\0';
4663 vsiz = atoi(numbuf);
4664 } else {
4665 vsiz = 0;
4666 }
4667 } else {
4668 vsiz = est_idx_vsiz(db->idxdb, scores[i].word, scores[i].wsiz);
4669 if(cbmapget(db->idxcc, scores[i].word, scores[i].wsiz, &num)) vsiz += num;
4670 }
4671 cbmapput(db->keycc, scores[i].word, scores[i].wsiz, (char *)&vsiz, sizeof(int), FALSE);
4672 }
4673 scores[i].pt *= 100000.0 / pow(vsiz + 64, 0.6);
4674 }
4675 if(db->kcmnum >= 0 && cbmaprnum(db->keycc) > db->kcmnum){
4676 num = db->kcmnum * 0.1 + 1;
4677 cbmapiterinit(db->keycc);
4678 for(i = 0; i < num && (word = cbmapiternext(db->keycc, &wsiz)) != NULL; i++){
4679 cbmapout(db->keycc, word, wsiz);
4680 }
4681 }
4682 qsort(scores, snum, sizeof(ESTKEYSC), est_keysc_compare);
4683 }
4684 for(i = 0; i < snum && i < max; i++){
4685 vsiz = sprintf(numbuf, "%d", scores[i].pt > 0 ? scores[i].pt : 1);
4686 cbmapput(keys, scores[i].word, scores[i].wsiz, numbuf, vsiz, FALSE);
4687 }
4688 free(scores);
4689 cbmapclose(umap);
4690 CB_LISTCLOSE(words);
4691 return keys;
4692 }
4693
4694
4695 /* Retrieve a map object of keywords. */
est_db_put_keywords(ESTDB * db,int id,CBMAP * kwords,double weight)4696 int est_db_put_keywords(ESTDB *db, int id, CBMAP *kwords, double weight){
4697 const char *kbuf;
4698 char *mbuf;
4699 int err, ksiz, pair[2], msiz;
4700 assert(db && id > 0 && kwords && weight >= 0.0);
4701 if(!dpwritable(db->metadb)){
4702 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
4703 return FALSE;
4704 }
4705 if(crvsiz(db->attrdb, (char *)&id, sizeof(int)) == -1){
4706 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4707 return FALSE;
4708 }
4709 err = FALSE;
4710 if(!est_db_out_keywords(db, id) && db->ecode != ESTENOITEM) err = TRUE;
4711 pair[0] = id;
4712 cbmapiterinit(kwords);
4713 while((kbuf = cbmapiternext(kwords, &ksiz)) != NULL){
4714 if(ksiz < 1 || (kbuf[0] >= '\0' && kbuf[0] <= ' ')) continue;
4715 pair[1] = (int)(atoi(cbmapiterval(kbuf, NULL)) * weight);
4716 cbmapputcat(db->auxcc, kbuf, ksiz, (char *)pair, sizeof(pair));
4717 db->icsiz += sizeof(pair);
4718 }
4719 mbuf = cbmapdump(kwords, &msiz);
4720 if(!est_crput(db->kwddb, db->zmode, id, mbuf, msiz, CR_DOVER)){
4721 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4722 db->fatal = TRUE;
4723 err = TRUE;
4724 }
4725 free(mbuf);
4726 return err ? FALSE : TRUE;
4727 }
4728
4729
4730 /* Remove keywords of a document. */
est_db_out_keywords(ESTDB * db,int id)4731 int est_db_out_keywords(ESTDB *db, int id){
4732 CBMAP *kwords;
4733 const char *word;
4734 char wbuf[ESTWORDMAXLEN+3], *tbuf;
4735 int wsiz;
4736 assert(db && id > 0);
4737 if(!dpwritable(db->metadb)){
4738 est_set_ecode(&(db->ecode), ESTEACCES, __LINE__);
4739 return FALSE;
4740 }
4741 if(id >= ESTPDOCIDMIN){
4742 est_set_ecode(&(db->ecode), ESTEINVAL, __LINE__);
4743 return FALSE;
4744 }
4745 if((kwords = est_db_get_keywords(db, id)) != NULL){
4746 cbmapiterinit(kwords);
4747 while((word = cbmapiternext(kwords, &wsiz)) != NULL){
4748 if(wsiz > ESTWORDMAXLEN){
4749 tbuf = cbsprintf(" %s", word);
4750 cbmapput(db->outcc, tbuf, wsiz + 1, "", 0, FALSE);
4751 free(tbuf);
4752 } else {
4753 sprintf(wbuf, " %s", word);
4754 cbmapput(db->outcc, wbuf, wsiz + 1, "", 0, FALSE);
4755 }
4756 }
4757 cbmapclose(kwords);
4758 }
4759 if(!est_crout(db->kwddb, id)){
4760 if(dpecode == DP_ENOITEM){
4761 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4762 } else {
4763 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4764 db->fatal = TRUE;
4765 }
4766 return FALSE;
4767 }
4768 cbmapout(db->veccc, (char *)&id, sizeof(int));
4769 return TRUE;
4770 }
4771
4772
4773 /* Retrieve a map object of keywords. */
est_db_get_keywords(ESTDB * db,int id)4774 CBMAP *est_db_get_keywords(ESTDB *db, int id){
4775 CBMAP *kwords;
4776 const char *cbuf;
4777 char *mbuf;
4778 int i, csiz, msiz, num;
4779 assert(db && id > 0);
4780 if((cbuf = cbmapget(db->veccc, (char *)&id, sizeof(int), &csiz)) != NULL){
4781 cbmapmove(db->veccc, (char *)&id, sizeof(int), FALSE);
4782 return cbmapload(cbuf, csiz);
4783 }
4784 if(!(mbuf = est_crget(db->kwddb, db->zmode, id, &msiz))){
4785 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4786 return NULL;
4787 }
4788 kwords = cbmapload(mbuf, msiz);
4789 if(db->vcmnum > 0) cbmapput(db->veccc, (char *)&id, sizeof(int), mbuf, msiz, TRUE);
4790 free(mbuf);
4791 if(cbmaprnum(db->veccc) > db->vcmnum){
4792 num = cbmaprnum(db->veccc) * 0.1 + 1;
4793 cbmapiterinit(db->veccc);
4794 for(i = 0; i < num && (cbuf = cbmapiternext(db->veccc, NULL)) != NULL; i++){
4795 cbmapout(db->veccc, cbuf, sizeof(int));
4796 }
4797 }
4798 return kwords;
4799 }
4800
4801
4802 /* Mesure the total size of each inner records of a stored document. */
est_db_measure_doc(ESTDB * db,int id,int parts)4803 int est_db_measure_doc(ESTDB *db, int id, int parts){
4804 int sum, num;
4805 assert(db && id > 0);
4806 sum = 0;
4807 if((parts & ESTMDATTR) && (num = crvsiz(db->attrdb, (char *)&id, sizeof(int))) > 0) sum += num;
4808 if((parts & ESTMDTEXT) && (num = crvsiz(db->textdb, (char *)&id, sizeof(int))) > 0) sum += num;
4809 if((parts & ESTMDKWD) && (num = crvsiz(db->kwddb, (char *)&id, sizeof(int))) > 0) sum += num;
4810 return sum;
4811 }
4812
4813
4814 /* Initialize the iterator of a database. */
est_db_iter_init(ESTDB * db,const char * prev)4815 int est_db_iter_init(ESTDB *db, const char *prev){
4816 char *vbuf;
4817 assert(db);
4818 if(prev){
4819 if(!vlcurjump(db->listdb, prev, -1, VL_JFORWARD)) return dpecode == DP_ENOITEM;
4820 if((vbuf = vlcurkey(db->listdb, NULL)) != NULL){
4821 if(strcmp(prev, vbuf) >= 0) vlcurnext(db->listdb);
4822 free(vbuf);
4823 }
4824 return TRUE;
4825 }
4826 return vlcurfirst(db->listdb) || dpecode == DP_ENOITEM;
4827 }
4828
4829
4830 /* Get the next ID of the iterator of a database. */
est_db_iter_next(ESTDB * db)4831 int est_db_iter_next(ESTDB *db){
4832 char *vbuf;
4833 int id;
4834 assert(db);
4835 if(!(vbuf = vlcurval(db->listdb, NULL))){
4836 if(dpecode == DP_ENOITEM){
4837 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4838 return 0;
4839 } else {
4840 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4841 db->fatal = TRUE;
4842 return -1;
4843 }
4844 }
4845 id = atoi(vbuf);
4846 free(vbuf);
4847 vlcurnext(db->listdb);
4848 return id;
4849 }
4850
4851
4852 /* Initialize the word iterator of a database. */
est_db_word_iter_init(ESTDB * db)4853 int est_db_word_iter_init(ESTDB *db){
4854 assert(db);
4855 if(!vlcurfirst(db->fwmdb) && dpecode != DP_ENOITEM){
4856 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4857 db->fatal = TRUE;
4858 return FALSE;
4859 }
4860 return TRUE;
4861 }
4862
4863
4864 /* Get the next word of the word iterator of a database. */
est_db_word_iter_next(ESTDB * db)4865 char *est_db_word_iter_next(ESTDB *db){
4866 char *word;
4867 assert(db);
4868 if(!(word = vlcurkey(db->fwmdb, NULL))){
4869 if(dpecode == DP_ENOITEM){
4870 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4871 } else {
4872 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4873 db->fatal = TRUE;
4874 }
4875 return NULL;
4876 }
4877 vlcurnext(db->fwmdb);
4878 return word;
4879 }
4880
4881
4882 /* Get the size of the record of a word. */
est_db_word_rec_size(ESTDB * db,const char * word)4883 int est_db_word_rec_size(ESTDB *db, const char *word){
4884 int num;
4885 assert(db && word);
4886 if(!cbmapget(db->idxcc, word, -1, &num)) num = 0;
4887 return est_idx_vsiz(db->idxdb, word, strlen(word)) + num;
4888 }
4889
4890
4891 /* Get the number of unique keywords in a database. */
est_db_keyword_num(ESTDB * db)4892 int est_db_keyword_num(ESTDB *db){
4893 int wnum;
4894 assert(db);
4895 wnum = vlrnum(db->xfmdb);
4896 return wnum > 0 ? wnum : 0;
4897 }
4898
4899
4900 /* Initialize the keyword iterator of a database. */
est_db_keyword_iter_init(ESTDB * db)4901 int est_db_keyword_iter_init(ESTDB *db){
4902 assert(db);
4903 if(!vlcurfirst(db->xfmdb) && dpecode != DP_ENOITEM){
4904 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4905 db->fatal = TRUE;
4906 return FALSE;
4907 }
4908 return TRUE;
4909 }
4910
4911
4912 /* Get the next keyword of the word iterator of a database. */
est_db_keyword_iter_next(ESTDB * db)4913 char *est_db_keyword_iter_next(ESTDB *db){
4914 char *word;
4915 assert(db);
4916 if(!(word = vlcurkey(db->xfmdb, NULL))){
4917 if(dpecode == DP_ENOITEM){
4918 est_set_ecode(&(db->ecode), ESTENOITEM, __LINE__);
4919 } else {
4920 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
4921 db->fatal = TRUE;
4922 }
4923 return NULL;
4924 }
4925 vlcurnext(db->xfmdb);
4926 return word;
4927 }
4928
4929
4930 /* Get the size of the record of a keyword. */
est_db_keyword_rec_size(ESTDB * db,const char * word)4931 int est_db_keyword_rec_size(ESTDB *db, const char *word){
4932 const char *kbuf;
4933 assert(db && word);
4934 return (kbuf = vlgetcache(db->xfmdb, word, -1, NULL)) != NULL ? atoi(kbuf) : 0;
4935 }
4936
4937
4938 /* Search documents corresponding a keyword for a database. */
est_db_keyword_search(ESTDB * db,const char * word,int * nump)4939 int *est_db_keyword_search(ESTDB *db, const char *word, int *nump){
4940 int i, *res, rnum;
4941 assert(db && word && nump);
4942 if(!(res = (int *)vlget(db->auxdb, word, -1, &rnum))){
4943 *nump = 0;
4944 CB_MALLOC(res, 1);
4945 return res;
4946 }
4947 rnum /= sizeof(int) * 2;
4948 for(i = 0; i < rnum; i++){
4949 res[i] = res[i*2];
4950 }
4951 *nump = rnum;
4952 return res;
4953 }
4954
4955
4956 /* Get the number of records in the cache memory of a database. */
est_db_cache_num(ESTDB * db)4957 int est_db_cache_num(ESTDB *db){
4958 assert(db);
4959 return cbmaprnum(db->idxcc);
4960 }
4961
4962
4963 /* Get the size of used cache region. */
est_db_used_cache_size(ESTDB * db)4964 int est_db_used_cache_size(ESTDB *db){
4965 assert(db);
4966 return (db->icsiz + (cbmaprnum(db->idxcc) + cbmaprnum(db->auxcc)) *
4967 (sizeof(CBMAPDATUM) + ESTWORDAVGLEN)) * ESTMEMIRATIO;
4968 }
4969
4970
4971 /* Set the special cache for narrowing and sorting with document attributes. */
est_db_set_special_cache(ESTDB * db,const char * name,int num)4972 void est_db_set_special_cache(ESTDB *db, const char *name, int num){
4973 assert(db && name && num >= 0);
4974 if(db->spacc){
4975 free(db->scname);
4976 cbmapclose(db->spacc);
4977 }
4978 db->spacc = cbmapopenex(num + 1);
4979 db->scmnum = num;
4980 db->scname = cbmemdup(name, -1);
4981 }
4982
4983
4984 /* Set the callback function for database events. */
est_db_set_informer(ESTDB * db,void (* func)(const char *,void *),void * opaque)4985 void est_db_set_informer(ESTDB *db, void (*func)(const char *, void *), void *opaque){
4986 assert(db && func);
4987 db->infocb = func;
4988 db->infoop = opaque;
4989 est_db_inform(db, "status");
4990 }
4991
4992
4993 /* Fill the cache for keys for TF-IDF. */
est_db_fill_key_cache(ESTDB * db)4994 void est_db_fill_key_cache(ESTDB *db){
4995 const char *kbuf;
4996 char *msg;
4997 int i, ksiz, vsiz;
4998 assert(db);
4999 vlcurfirst(db->fwmdb);
5000 for(i = 0; (kbuf = vlcurkeycache(db->fwmdb, &ksiz)) != NULL; i++){
5001 vsiz = est_idx_vsiz(db->idxdb, kbuf, ksiz);
5002 cbmapput(db->keycc, kbuf, ksiz, (char *)&vsiz, sizeof(int), TRUE);
5003 vlcurnext(db->fwmdb);
5004 if(i % ESTCCCBFREQ == 0){
5005 msg = cbsprintf("filling the key cache for TF-IDF (%d)", i + 1);
5006 est_db_inform(db, msg);
5007 free(msg);
5008 }
5009 }
5010 db->kcmnum = -1;
5011 }
5012
5013
5014 /* Set the database of document frequency. */
est_db_set_dfdb(ESTDB * db,DEPOT * dfdb)5015 void est_db_set_dfdb(ESTDB *db, DEPOT *dfdb){
5016 assert(db);
5017 db->dfdb = dfdb;
5018 }
5019
5020
5021 /* Clear the result cache. */
est_db_refresh_rescc(ESTDB * db)5022 void est_db_refresh_rescc(ESTDB *db){
5023 ESTSCORE sc;
5024 const char *word;
5025 int size;
5026 assert(db);
5027 sc.id = -1;
5028 sc.score = 0;
5029 sc.value = NULL;
5030 cbmapiterinit(db->rescc);
5031 while((word = cbmapiternext(db->rescc, &size)) != NULL){
5032 cbmapput(db->rescc, word, size, (char *)&sc, sizeof(ESTSCORE), TRUE);
5033 }
5034 }
5035
5036
5037 /* Charge the result cache. */
est_db_charge_rescc(ESTDB * db,int max)5038 void est_db_charge_rescc(ESTDB *db, int max){
5039 CBLIST *words;
5040 const char *word, *vbuf;
5041 int i, num, size, vsiz;
5042 assert(db);
5043 if(max < 0) max = INT_MAX;
5044 CB_LISTOPEN(words);
5045 cbmapiterinit(db->rescc);
5046 while((word = cbmapiternext(db->rescc, &size)) != NULL){
5047 CB_MAPITERVAL(vbuf, word, vsiz);
5048 if(vsiz == sizeof(ESTSCORE) && ((ESTSCORE *)vbuf)->id == -1) CB_LISTPUSH(words, word, size);
5049 }
5050 num = CB_LISTNUM(words);
5051 for(i = 0; i < max && i < num; i++){
5052 word = cblistval(words, num - i - 1, &size);
5053 free(est_search_union(db, word, 1, NULL, &size, NULL, TRUE, -1, NULL));
5054 }
5055 CB_LISTCLOSE(words);
5056 }
5057
5058
5059 /* Get a list of words in the result cache. */
est_db_list_rescc(ESTDB * db)5060 CBLIST *est_db_list_rescc(ESTDB *db){
5061 CBLIST *words;
5062 const char *word;
5063 int size;
5064 assert(db);
5065 CB_LISTOPEN(words);
5066 cbmapiterinit(db->rescc);
5067 while((word = cbmapiternext(db->rescc, &size)) != NULL){
5068 cblistunshift(words, word, size);
5069 }
5070 return words;
5071 }
5072
5073
5074 /* Get the number of pseudo documents in a database. */
est_db_pseudo_doc_num(ESTDB * db)5075 int est_db_pseudo_doc_num(ESTDB *db){
5076 assert(db);
5077 return cblistnum(db->pdocs);
5078 }
5079
5080
5081 /* Get a list of expressions of attribute indexes of a database. */
est_db_attr_index_exprs(ESTDB * db)5082 CBLIST *est_db_attr_index_exprs(ESTDB *db){
5083 ESTATTRIDX *attridx;
5084 CBLIST *list;
5085 const char *kbuf;
5086 char *expr;
5087 assert(db);
5088 list = cblistopen();
5089 cbmapiterinit(db->aidxs);
5090 while((kbuf = cbmapiternext(db->aidxs, NULL)) != NULL){
5091 attridx = (ESTATTRIDX *)cbmapiterval(kbuf, NULL);
5092 switch(attridx->type){
5093 case ESTIDXATTRSTR:
5094 expr = cbsprintf("%s=str", kbuf);
5095 break;
5096 case ESTIDXATTRNUM:
5097 expr = cbsprintf("%s=num", kbuf);
5098 break;
5099 default:
5100 expr = cbsprintf("%s=seq", kbuf);
5101 break;
5102 }
5103 CB_LISTPUSHBUF(list, expr, strlen(expr));
5104 }
5105 return list;
5106 }
5107
5108
5109 /* Interrupt long time processing. */
est_db_interrupt(ESTDB * db)5110 void est_db_interrupt(ESTDB *db){
5111 assert(db);
5112 db->intflag = TRUE;
5113 }
5114
5115
5116 /* Repair a broken database directory. */
est_db_repair(const char * name,int options,int * ecp)5117 int est_db_repair(const char *name, int options, int *ecp){
5118 ESTDB *db;
5119 DEPOT *depot, *metadb;
5120 CURIA *curia, *attrdb, *textdb, *kwddb;
5121 VILLA *villa, *listdb;
5122 CBLIST *list;
5123 CBMAP *aidxs, *attrs;
5124 ESTATTRIDX attridx, *attridxp;
5125 void *aidxdb;
5126 const char *elem, *abuf;
5127 char path[ESTPATHBUFSIZ], *kbuf, vbuf[ESTNUMBUFSIZ], *dec, *mbuf;
5128 int i, err, idmax, flags, zmode, dnum, dseq, ksiz, vsiz, type, id, msiz, esiz, asiz;
5129 assert(name && ecp);
5130 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
5131 if(est_inode(path) == -1){
5132 est_set_ecode(ecp, ESTEIO, __LINE__);
5133 return FALSE;
5134 }
5135 if(!(options & ESTRPSTRICT) && (depot= dpopen(path, DP_OWRITER, -1)) != NULL){
5136 dpclose(depot);
5137 } else {
5138 dprepair(path);
5139 }
5140 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME);
5141 if(est_inode(path) == -1){
5142 est_set_ecode(ecp, ESTEIO, __LINE__);
5143 return FALSE;
5144 }
5145 if((list = cbdirlist(path)) != NULL){
5146 for(i = 1; i < CB_LISTNUM(list); i++){
5147 elem = CB_LISTVAL(list, i);
5148 if(elem[0] < '0' || elem[0] > '9') continue;
5149 sprintf(path, "%s%c%s%c%s", name, ESTPATHCHR, ESTIDXDBNAME, ESTPATHCHR, elem);
5150 if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5151 vlclose(villa);
5152 } else {
5153 vlrepair(path, VL_CMPLEX);
5154 }
5155 }
5156 CB_LISTCLOSE(list);
5157 }
5158 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTFWMDBNAME);
5159 if(est_inode(path) == -1){
5160 est_set_ecode(ecp, ESTEIO, __LINE__);
5161 return FALSE;
5162 }
5163 if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5164 vlclose(villa);
5165 } else {
5166 vlrepair(path, VL_CMPLEX);
5167 }
5168 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTAUXDBNAME);
5169 if(est_inode(path) == -1){
5170 est_set_ecode(ecp, ESTEIO, __LINE__);
5171 return FALSE;
5172 }
5173 if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5174 vlclose(villa);
5175 } else {
5176 vlrepair(path, VL_CMPLEX);
5177 }
5178 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTXFMDBNAME);
5179 if(est_inode(path) == -1){
5180 est_set_ecode(ecp, ESTEIO, __LINE__);
5181 return FALSE;
5182 }
5183 if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5184 vlclose(villa);
5185 } else {
5186 vlrepair(path, VL_CMPLEX);
5187 }
5188 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
5189 if(est_inode(path) == -1){
5190 est_set_ecode(ecp, ESTEIO, __LINE__);
5191 return FALSE;
5192 }
5193 if(!(options & ESTRPSTRICT) && (curia = cropen(path, CR_OWRITER, -1, -1)) != NULL){
5194 crclose(curia);
5195 } else {
5196 crrepair(path);
5197 }
5198 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
5199 if(est_inode(path) == -1){
5200 est_set_ecode(ecp, ESTEIO, __LINE__);
5201 return FALSE;
5202 }
5203 if(!(options & ESTRPSTRICT) && (curia = cropen(path, CR_OWRITER, -1, -1)) != NULL){
5204 crclose(curia);
5205 } else {
5206 crrepair(path);
5207 }
5208 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTKWDDBNAME);
5209 if(est_inode(path) == -1){
5210 est_set_ecode(ecp, ESTEIO, __LINE__);
5211 return FALSE;
5212 }
5213 if(!(options & ESTRPSTRICT) && (curia = cropen(path, CR_OWRITER, -1, -1)) != NULL){
5214 crclose(curia);
5215 } else {
5216 crrepair(path);
5217 }
5218 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
5219 if(est_inode(path) == -1){
5220 est_set_ecode(ecp, ESTEIO, __LINE__);
5221 return FALSE;
5222 }
5223 if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5224 vlclose(villa);
5225 } else {
5226 vlrepair(path, VL_CMPLEX);
5227 }
5228 if((list = cbdirlist(name)) != NULL){
5229 for(i = 0; i < CB_LISTNUM(list); i++){
5230 elem = CB_LISTVAL(list, i);
5231 if(cbstrfwmatch(elem, ESTAISEQPREF)){
5232 sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5233 if(!(options & ESTRPSTRICT) && (depot = dpopen(path, DP_OWRITER, -1)) != NULL){
5234 dpclose(depot);
5235 } else {
5236 dprepair(path);
5237 }
5238 } else if(cbstrfwmatch(elem, ESTAISTRPREF)){
5239 sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5240 if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5241 vlclose(villa);
5242 } else {
5243 vlrepair(path, VL_CMPLEX);
5244 }
5245 } else if(cbstrfwmatch(elem, ESTAINUMPREF)){
5246 sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5247 if(!(options & ESTRPSTRICT) && (villa = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5248 vlclose(villa);
5249 } else {
5250 vlrepair(path, VL_CMPLEX);
5251 }
5252 }
5253 }
5254 CB_LISTCLOSE(list);
5255 }
5256 if((options & ESTRPSHODDY) && (db = est_db_open(name, ESTDBWRITER, ecp)) != NULL){
5257 if(!est_db_close(db, ecp)) return FALSE;
5258 return TRUE;
5259 }
5260 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTMETADBNAME);
5261 metadb = dpopen(path, DP_OWRITER, -1);
5262 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTATTRDBNAME);
5263 attrdb = cropen(path, CR_OWRITER, -1, -1);
5264 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTTEXTDBNAME);
5265 textdb = cropen(path, CR_OWRITER, -1, -1);
5266 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTKWDDBNAME);
5267 kwddb = cropen(path, CR_OWRITER, -1, -1);
5268 sprintf(path, "%s%c%s", name, ESTPATHCHR, ESTLISTDBNAME);
5269 listdb = vlopen(path, VL_OWRITER, VL_CMPLEX);
5270 if(!attrdb || !textdb || !kwddb || !listdb){
5271 if(listdb) vlclose(listdb);
5272 if(kwddb) crclose(kwddb);
5273 if(textdb) crclose(textdb);
5274 if(attrdb) crclose(attrdb);
5275 if(metadb) dpclose(metadb);
5276 est_set_ecode(ecp, ESTEDB, __LINE__);
5277 return FALSE;
5278 }
5279 aidxs = cbmapopenex(ESTMINIBNUM);
5280 if((list = cbdirlist(name)) != NULL){
5281 for(i = 0; i < CB_LISTNUM(list); i++){
5282 elem = CB_LISTVAL(list, i);
5283 dec = NULL;
5284 type = -1;
5285 if(cbstrfwmatch(elem, ESTAISEQPREF)){
5286 dec = est_hex_decode(elem + strlen(ESTAISEQPREF));
5287 type = ESTIDXATTRSEQ;
5288 } else if(cbstrfwmatch(elem, ESTAISTRPREF)){
5289 dec = est_hex_decode(elem + strlen(ESTAISTRPREF));
5290 type = ESTIDXATTRSTR;
5291 } else if(cbstrfwmatch(elem, ESTAINUMPREF)){
5292 dec = est_hex_decode(elem + strlen(ESTAINUMPREF));
5293 type = ESTIDXATTRNUM;
5294 }
5295 if(dec){
5296 sprintf(path, "%s%c%s", name, ESTPATHCHR, elem);
5297 switch(type){
5298 case ESTIDXATTRSTR:
5299 if((aidxdb = vlopen(path, VL_OWRITER, VL_CMPLEX)) != NULL){
5300 vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
5301 vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
5302 attridx.db = aidxdb;
5303 attridx.type = type;
5304 cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
5305 }
5306 break;
5307 case ESTIDXATTRNUM:
5308 if((aidxdb = vlopen(path, VL_OWRITER, est_aidx_numcmp)) != NULL){
5309 vlsettuning(aidxdb, ESTAIDXLRM, ESTAIDXNIM, ESTAIDXLCN, ESTAIDXNCN);
5310 vlsetfbpsiz(aidxdb, ESTAIDXVLFBP);
5311 attridx.db = aidxdb;
5312 attridx.type = type;
5313 cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
5314 }
5315 break;
5316 default:
5317 if((aidxdb = dpopen(path, DP_OWRITER, crbnum(attrdb) / ESTAIBDIAM)) != NULL){
5318 dpsetfbpsiz(aidxdb, ESTAIDXDPFBP);
5319 attridx.db = aidxdb;
5320 attridx.type = type;
5321 cbmapput(aidxs, dec, -1, (char *)&attridx, sizeof(ESTATTRIDX), FALSE);
5322 }
5323 break;
5324 }
5325 free(dec);
5326 }
5327 }
5328 CB_LISTCLOSE(list);
5329 }
5330 err = FALSE;
5331 idmax = 0;
5332 if((vsiz = dpgetwb(metadb, ESTKEYDNUM, -1, 0, ESTNUMBUFSIZ - 1, vbuf)) > 0){
5333 vbuf[vsiz] = '\0';
5334 idmax = atoi(vbuf);
5335 }
5336 flags = dpgetflags(metadb);
5337 zmode = 0;
5338 if(flags & ESTDFZLIB){
5339 zmode = ESTDFZLIB;
5340 } else if(flags & ESTDFLZO){
5341 zmode = ESTDFLZO;
5342 } else if(flags & ESTDFBZIP){
5343 zmode = ESTDFBZIP;
5344 }
5345 dnum = 0;
5346 dseq = 0;
5347 CB_LISTOPEN(list);
5348 if(!criterinit(attrdb)) err = TRUE;
5349 while((kbuf = criternext(attrdb, &ksiz)) != NULL){
5350 if(ksiz == sizeof(int) && (id = *(int *)kbuf) > 0 && id <= idmax &&
5351 crvsiz(attrdb, kbuf, ksiz) > 0 && crvsiz(textdb, kbuf, ksiz) > 0){
5352 dnum++;
5353 if(dseq < id) dseq = id;
5354 if(options & ESTRPSTRICT){
5355 if((mbuf = est_crget(attrdb, zmode, id, &msiz)) != NULL){
5356 attrs = cbmapload(mbuf, msiz);
5357 if((elem = cbmapget(attrs, ESTDATTRURI, -1, &esiz)) != NULL){
5358 vsiz = sprintf(vbuf, "%d", id);
5359 vlput(listdb, elem, esiz, vbuf, vsiz, VL_DKEEP);
5360 }
5361 if(cbmaprnum(aidxs) > 0){
5362 cbmapiterinit(aidxs);
5363 while((abuf = cbmapiternext(aidxs, &asiz)) != NULL){
5364 if(!(elem = cbmapget(attrs, abuf, asiz, &esiz))) continue;
5365 attridxp = (ESTATTRIDX *)cbmapiterval(abuf, NULL);
5366 switch(attridxp->type){
5367 case ESTIDXATTRSTR:
5368 case ESTIDXATTRNUM:
5369 est_aidx_attr_put(attridxp->db, id, elem, esiz);
5370 break;
5371 default:
5372 est_aidx_seq_put(attridxp->db, id, elem, esiz);
5373 break;
5374 }
5375 }
5376 }
5377 cbmapclose(attrs);
5378 free(mbuf);
5379 }
5380 }
5381 } else {
5382 CB_LISTPUSH(list, kbuf, ksiz);
5383 }
5384 free(kbuf);
5385 }
5386 if(dpecode != DP_ENOITEM) err = TRUE;
5387 for(i = 0; i < CB_LISTNUM(list); i++){
5388 elem = CB_LISTVAL2(list, i, esiz);
5389 crout(attrdb, elem, esiz);
5390 crout(textdb, elem, esiz);
5391 crout(kwddb, elem, esiz);
5392 }
5393 CB_LISTCLOSE(list);
5394 sprintf(vbuf, "%d", dseq);
5395 if(!dpput(metadb, ESTKEYDSEQ, -1, vbuf, -1, DP_DOVER)) err = TRUE;
5396 sprintf(vbuf, "%d", dnum);
5397 if(!dpput(metadb, ESTKEYDNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
5398 cbmapiterinit(aidxs);
5399 while((elem = cbmapiternext(aidxs, NULL)) != NULL){
5400 attridxp = (ESTATTRIDX *)cbmapiterval(elem, NULL);
5401 switch(attridxp->type){
5402 case ESTIDXATTRSTR:
5403 case ESTIDXATTRNUM:
5404 if(!vlclose(attridxp->db)) err = TRUE;
5405 break;
5406 default:
5407 if(!dpclose(attridxp->db)) err = TRUE;
5408 break;
5409 }
5410 }
5411 cbmapclose(aidxs);
5412 if(!vlclose(listdb)) err = TRUE;
5413 if(!crclose(kwddb)) err = TRUE;
5414 if(!crclose(textdb)) err = TRUE;
5415 if(!crclose(attrdb)) err = TRUE;
5416 if(!dpclose(metadb)) err = TRUE;
5417 if(err){
5418 est_set_ecode(ecp, ESTEDB, __LINE__);
5419 return FALSE;
5420 }
5421 return err ? FALSE : TRUE;
5422 }
5423
5424
5425 /* Extract words for snippet from hints of search. */
est_hints_to_words(CBMAP * hints)5426 CBLIST *est_hints_to_words(CBMAP *hints){
5427 CBLIST *words;
5428 const char *kbuf;
5429 int ksiz;
5430 assert(hints);
5431 CB_LISTOPEN(words);
5432 cbmapiterinit(hints);
5433 while((kbuf = cbmapiternext(hints, &ksiz)) != NULL){
5434 if(ksiz < 1 || atoi(cbmapget(hints, kbuf, ksiz, NULL)) < 0) continue;
5435 CB_LISTPUSH(words, kbuf, ksiz);
5436 }
5437 return words;
5438 }
5439
5440
5441 /* Add a record into a result map for logical operation. */
est_resmap_add(CBMAP * map,const char * key,int score,int method)5442 void est_resmap_add(CBMAP *map, const char *key, int score, int method){
5443 int elem[2], *ep, size;
5444 assert(map && key);
5445 size = strlen(key);
5446 if((ep = (int *)cbmapget(map, key, size, NULL)) != NULL){
5447 elem[0] = ep[0] + 1;
5448 switch(method){
5449 case ESTRMLOSUM:
5450 elem[1] = ep[1] + score;
5451 break;
5452 case ESTRMLOMAX:
5453 elem[1] = score > ep[1] ? score : ep[1];
5454 break;
5455 case ESTRMLOMIN:
5456 elem[1] = score < ep[1] ? score : ep[1];
5457 break;
5458 case ESTRMLOAVG:
5459 elem[1] = (ep[1] * (ep[0] - 1) + score) / ep[0];
5460 break;
5461 default:
5462 elem[1] = score;
5463 break;
5464 }
5465 } else {
5466 elem[0] = 1;
5467 elem[1] = score;
5468 }
5469 cbmapput(map, key, size, (char *)&elem, sizeof(int) * 2, TRUE);
5470 }
5471
5472
5473
5474 /* Compare two result elements by score.
5475 `ap' specifies the pointer to one element.
5476 `bp' specifies the pointer to the other element.
5477 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_resmapelem_compare(const void * ap,const void * bp)5478 static int est_resmapelem_compare(const void *ap, const void *bp){
5479 assert(ap && bp);
5480 return ((ESTRESMAPELEM *)bp)->score - ((ESTRESMAPELEM *)ap)->score;
5481 }
5482
5483
5484 /* Dump a result list of a result map for logical operation. */
est_resmap_dump(CBMAP * map,int min,int * nump)5485 ESTRESMAPELEM *est_resmap_dump(CBMAP *map, int min, int *nump){
5486 ESTRESMAPELEM *elems;
5487 const char *key, *vbuf;
5488 int num, vsiz;
5489 assert(map && min >= 0 && nump);
5490 CB_MALLOC(elems, cbmaprnum(map) * sizeof(ESTRESMAPELEM) + 1);
5491 num = 0;
5492 cbmapiterinit(map);
5493 while((key = cbmapiternext(map, NULL)) != NULL){
5494 CB_MAPITERVAL(vbuf, key, vsiz);
5495 if(((int *)vbuf)[0] < min) continue;
5496 elems[num].key = key;
5497 elems[num].score = ((int *)vbuf)[1];
5498 num++;
5499 }
5500 qsort(elems, num, sizeof(ESTRESMAPELEM), est_resmapelem_compare);
5501 *nump = num;
5502 return elems;
5503 }
5504
5505
5506 /* Reset the environment of the process. */
est_proc_env_reset(void)5507 void est_proc_env_reset(void){
5508 char *value, *pbuf;
5509 cbstdiobin();
5510 putenv("LANG=C");
5511 putenv("LANGUAGE=C");
5512 putenv("LC_CTYPE=C");
5513 putenv("LC_COLLATE=C");
5514 putenv("LC_TIME=C");
5515 putenv("LC_NUMERIC=C");
5516 putenv("LC_MONETARY=C");
5517 putenv("LC_ALL=C");
5518 putenv("EST_VERSION=" _EST_VERSION);
5519 if((value = getenv("PATH")) != NULL){
5520 if(ESTPATHCHR == '\\'){
5521 pbuf = cbsprintf("PATH=%s;C:\\hyperestraier;D:\\hyperestraier;E:\\hyperestraier", value);
5522 } else {
5523 pbuf = cbsprintf("PATH=%s:/bin:/sbin:/usr/bin:/usr/sbin:"
5524 "/usr/local/bin:/usr/local/sbin", value);
5525 }
5526 putenv(pbuf);
5527 cbglobalgc(pbuf, free);
5528 }
5529 }
5530
5531
5532 /* Make a directory. */
est_mkdir(const char * path)5533 int est_mkdir(const char *path){
5534 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
5535 assert(path);
5536 return mkdir(path) == 0 ? TRUE : FALSE;
5537 #else
5538 assert(path);
5539 return mkdir(path, ESTDIRMODE) == 0 ? TRUE : FALSE;
5540 #endif
5541 }
5542
5543
5544 /* Remove a directory and its contents recursively. */
est_rmdir_rec(const char * path)5545 int est_rmdir_rec(const char *path){
5546 CBLIST *files;
5547 const char *file;
5548 char pbuf[ESTPATHBUFSIZ];
5549 int i;
5550 assert(path);
5551 if((files = cbdirlist(path)) != NULL){
5552 for(i = 0; i < CB_LISTNUM(files); i++){
5553 file = CB_LISTVAL(files, i);
5554 if(!strcmp(file, ESTCDIRSTR) || !strcmp(file, ESTPDIRSTR)) continue;
5555 sprintf(pbuf, "%s%c%s", path, ESTPATHCHR, file);
5556 if(unlink(pbuf) == -1) est_rmdir_rec(pbuf);
5557 }
5558 CB_LISTCLOSE(files);
5559 }
5560 return rmdir(path) == 0;
5561 }
5562
5563
5564 /* Get the canonicalized absolute pathname of a file. */
est_realpath(const char * path)5565 char *est_realpath(const char *path){
5566 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
5567 char pbuf[ESTPATHBUFSIZ*2], *p;
5568 assert(path);
5569 if(GetFullPathName(path, ESTPATHBUFSIZ, pbuf, &p) == 0){
5570 if((((path[0] >= 'A' && path[0] <= 'Z') || (path[0] >= 'a' && path[0] <= 'z')) &&
5571 path[1] == ':' && path[2] == ESTPATHCHR) || path[0] == ESTPATHCHR ||
5572 GetFullPathName(ESTCDIRSTR, ESTPATHBUFSIZ, pbuf, &p) == 0){
5573 sprintf(pbuf, "%s", path);
5574 } else {
5575 sprintf(pbuf + strlen(pbuf), "%c%s", ESTPATHCHR, path);
5576 }
5577 }
5578 return cbmemdup(pbuf, -1);
5579 #else
5580 char pbuf[ESTPATHBUFSIZ*2];
5581 assert(path);
5582 if(!realpath(path, pbuf)){
5583 if(path[0] == ESTPATHCHR || !realpath(ESTCDIRSTR, pbuf)){
5584 sprintf(pbuf, "%s", path);
5585 } else {
5586 sprintf(pbuf + strlen(pbuf), "%c%s", ESTPATHCHR, path);
5587 }
5588 }
5589 return cbmemdup(pbuf, -1);
5590 #endif
5591 }
5592
5593
/* Get the inode number of a file.
   `path' specifies the path of the file.
   The return value is a non-negative number identifying the file, or -1 if the file cannot
   be examined with `stat'.  Where the full path name API is used instead of inodes, the
   number is a hash of the full path name limited to 15 bits. */
int est_inode(const char *path){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  char full[ESTPATHBUFSIZ*2], *cp;
  int hash;
  struct stat st;
  assert(path);
  if(stat(path, &st) == -1) return -1;
  if(GetFullPathName(path, ESTPATHBUFSIZ*2, full, &cp) == 0) return -1;
  hash = 11003;
  cp = full;
  while(*cp != '\0'){
    hash = hash * 31 + *(unsigned char *)cp;
    cp++;
  }
  return (hash * 911) & 0x7FFF;
#else
  struct stat st;
  assert(path);
  if(stat(path, &st) == -1){
    return -1;
  }
  /* mask the inode number so that the result fits in a non-negative int */
  return st.st_ino & INT_MAX;
#endif
}
5617
5618
/* Change modification time of a file.
   `path' specifies the path of the file.
   `mtime' specifies the time set as both the access time and the modification time.  If it
   is negative, `utime' is called with a null time argument instead.
   The return value is true if success, else it is false. */
int est_utime(const char *path, time_t mtime){
  struct utimbuf stamp;
  assert(path);
  if(mtime >= 0){
    stamp.actime = mtime;
    stamp.modtime = mtime;
    return utime(path, &stamp) == 0;
  }
  return utime(path, NULL) == 0;
}
5628
5629
5630
/* Get the time of day in milliseconds.
   The return value is the current time in milliseconds as a floating point number.  Where
   `gettimeofday' is used, it counts from the epoch and 0.0 is returned if the call fails. */
double est_gettimeofday(void){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  SYSTEMTIME st;
  struct tm ts;
  GetLocalTime(&st);
  memset(&ts, 0, sizeof(ts));
  ts.tm_year = st.wYear - 1900;
  ts.tm_mon = st.wMonth - 1;
  ts.tm_mday = st.wDay;
  ts.tm_hour = st.wHour;
  ts.tm_min = st.wMinute;
  ts.tm_sec = st.wSecond;
  return (double)mktime(&ts) * 1000 + (double)st.wMilliseconds;
#else
  struct timeval tv;
  /* the time zone argument is obsolete and its result was never used; POSIX leaves the
     behavior unspecified unless a null pointer is passed */
  if(gettimeofday(&tv, NULL) == -1) return 0.0;
  return (double)tv.tv_sec * 1000 + (double)tv.tv_usec / 1000;
#endif
}
5652
5653
/* Suspend execution for microsecond intervals.
   `usec' specifies the interval in microseconds.  Where `Sleep' is used, the interval is
   truncated to whole milliseconds. */
void est_usleep(unsigned long usec){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  unsigned long msec;
  msec = usec / 1000;
  Sleep(msec);
#else
  usleep(usec);
#endif
}
5662
5663
5664 /* Set a signal handler. */
est_signal(int signum,void (* sighandler)(int))5665 void est_signal(int signum, void (*sighandler)(int)){
5666 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
5667 static int first = TRUE;
5668 int i;
5669 assert(signum >= 0 && sighandler);
5670 if(first){
5671 for(i = 1; i < ESTSIGNUM; i++){
5672 est_signal_handlers[i] = NULL;
5673 }
5674 SetConsoleCtrlHandler((PHANDLER_ROUTINE)est_signal_dispatch, TRUE);
5675 first = FALSE;
5676 }
5677 if(signum >= ESTSIGNUM) return;
5678 if(sighandler == SIG_IGN){
5679 signal(signum, SIG_IGN);
5680 } else if(sighandler == SIG_DFL){
5681 signal(signum, SIG_DFL);
5682 } else {
5683 signal(signum, (void (*)(int))est_signal_dispatch);
5684 est_signal_handlers[signum] = sighandler;
5685 }
5686 #else
5687 static int first = TRUE;
5688 struct sigaction act;
5689 int i;
5690 assert(signum >= 0 && sighandler);
5691 if(first){
5692 for(i = 1; i < ESTSIGNUM; i++){
5693 est_signal_handlers[i] = NULL;
5694 }
5695 first = FALSE;
5696 }
5697 if(signum >= ESTSIGNUM) return;
5698 memset(&act, 0, sizeof(act));
5699 if(sighandler == SIG_IGN){
5700 act.sa_handler = SIG_IGN;
5701 } else if(sighandler == SIG_DFL){
5702 act.sa_handler = SIG_DFL;
5703 } else {
5704 act.sa_handler = (void (*)(int))est_signal_dispatch;
5705 est_signal_handlers[signum] = sighandler;
5706 }
5707 sigemptyset(&act.sa_mask);
5708 act.sa_flags = 0;
5709 sigaction(signum, &act, NULL);
5710 #endif
5711 }
5712
5713
/* Send a signal to a process.
   `pid' specifies the PID of a target process.
   `sig' specifies a signal code.
   The return value is true if success, else it is false.  Where `kill' is not used, the
   function always returns false. */
int est_kill(int pid, int sig){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
  return FALSE;
#else
  int rv;
  rv = kill(pid, sig);
  return rv == 0;
#endif
}
5722
5723
/* Get the load ratio of the physical memory.
   The return value is the memory load reported by `GlobalMemoryStatus' divided by 100 where
   that API is used, and 0.0 everywhere else. */
double est_memory_usage(void){
#if defined(_SYS_MSVC_) || defined(_SYS_MINGW_) || defined(_SYS_CYGWIN_)
  MEMORYSTATUS status;
  status.dwLength = sizeof(MEMORYSTATUS);
  GlobalMemoryStatus(&status);
  return status.dwMemoryLoad / 100.0;
#else
  /* no measurement is implemented for this platform */
  return 0.0;
#endif
}
5735
5736
/* Get the media type of an extension. */
est_ext_type(const char * ext)5738 const char *est_ext_type(const char *ext){
5739 static const char *list[] = {
5740 ".txt", "text/plain", ".txt.en", "text/plain",
5741 ".txt.ja", "text/plain", ".asc", "text/plain",
5742 ".in", "text/plain", ".c", "text/plain",
5743 ".h", "text/plain", ".cc", "text/plain",
5744 ".java", "text/plain", ".sh", "text/plain",
5745 ".pl", "text/plain", ".py", "text/plain",
5746 ".rb", "text/plain", ".idl", "text/plain",
5747 ".csv", "text/plain", ".log", "text/plain",
5748 ".conf", "text/plain", ".rc", "text/plain",
5749 ".ini", "text/plain", ".html", "text/html",
5750 ".htm", "text/html", ".xhtml", "text/html",
5751 ".xht", "text/html", ".css", "text/css",
5752 ".js", "text/javascript", ".tsv", "text/tab-separated-values",
5753 ".eml", "message/rfc822", ".mime", "message/rfc822",
5754 ".mht", "message/rfc822", ".mhtml", "message/rfc822",
5755 ".sgml", "application/sgml", ".sgm", "application/sgml",
5756 ".xml", "application/xml", ".xsl", "application/xml",
5757 ".xslt", "application/xslt+xml", ".xhtml", "application/xhtml+xml",
5758 ".xht", "application/xhtml+xml", ".rdf", "application/rdf+xml",
5759 ".rss", "application/rss+xml", ".dtd", "application/xml-dtd",
5760 ".rtf", "application/rtf", ".pdf", "application/pdf",
5761 ".ps", "application/postscript", ".eps", "application/postscript",
5762 ".doc", "application/msword", ".xls", "application/vnd.ms-excel",
5763 ".ppt", "application/vnd.ms-powerpoint", ".xdw", "application/vnd.fujixerox.docuworks",
5764 ".swf", "application/x-shockwave-flash", ".zip", "application/zip",
5765 ".tar", "application/x-tar", ".gz", "application/x-gzip",
5766 ".bz2", "application/octet-stream", ".z", "application/octet-stream",
5767 ".lha", "application/octet-stream", ".lzh", "application/octet-stream",
5768 ".cab", "application/octet-stream", ".rar", "application/octet-stream",
5769 ".sit", "application/octet-stream", ".bin", "application/octet-stream",
5770 ".o", "application/octet-stream", ".so", "application/octet-stream",
5771 ".exe", "application/octet-stream", ".dll", "application/octet-stream",
5772 ".class", "application/octet-stream", ".png", "image/png",
5773 ".gif", "image/gif", ".jpg", "image/jpeg",
5774 ".jpeg", "image/jpeg", ".tif", "image/tiff",
5775 ".tiff", "image/tiff", ".bmp", "image/bmp",
5776 ".au", "audio/basic", ".snd", "audio/basic",
5777 ".mid", "audio/midi", ".midi", "audio/midi",
5778 ".mp2", "audio/mpeg", ".mp3", "audio/mpeg",
5779 ".wav", "audio/x-wav", ".mpg", "video/mpeg",
5780 ".mpeg", "video/mpeg", ".qt", "video/quicktime",
5781 ".mov", "video/quicktime", ".avi", "video/x-msvideo",
5782 NULL
5783 };
5784 int i;
5785 assert(ext);
5786 for(i = 0; list[i]; i++){
5787 if(!cbstricmp(ext, list[i])) return list[i+1];
5788 }
5789 return "application/octet-stream";
5790 }
5791
5792
5793 /* Set a seed vector from a map object. */
est_vector_set_seed(CBMAP * svmap,int * svec,int vnum)5794 void est_vector_set_seed(CBMAP *svmap, int *svec, int vnum){
5795 const char *kbuf;
5796 int nnum, ksiz;
5797 assert(svmap && svec && vnum > 0);
5798 cbmapiterinit(svmap);
5799 nnum = 0;
5800 while(nnum < vnum){
5801 if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
5802 if(ksiz < 1) continue;
5803 svec[nnum++] = atoi(cbmapiterval(kbuf, NULL));
5804 } else {
5805 svec[nnum++] = 0;
5806 }
5807 }
5808 }
5809
5810
5811 /* Set a target vector from a map object. */
est_vector_set_target(CBMAP * svmap,CBMAP * tvmap,int * tvec,int vnum)5812 void est_vector_set_target(CBMAP *svmap, CBMAP *tvmap, int *tvec, int vnum){
5813 const char *kbuf, *vbuf;
5814 int i, ksiz;
5815 assert(svmap && tvmap && tvec && vnum > 0);
5816 cbmapiterinit(svmap);
5817 for(i = 0; i < vnum; i++){
5818 if((kbuf = cbmapiternext(svmap, &ksiz)) != NULL){
5819 vbuf = cbmapget(tvmap, kbuf, ksiz, NULL);
5820 tvec[i] = vbuf ? atoi(vbuf) : 0;
5821 } else {
5822 tvec[i] = 0;
5823 }
5824 }
5825 }
5826
5827
5828 /* Get the cosine of the angle of two vectors. */
est_vector_cosine(const int * avec,const int * bvec,int vnum)5829 double est_vector_cosine(const int *avec, const int *bvec, int vnum){
5830 int i;
5831 double iprod, aabs, babs;
5832 assert(avec && bvec && vnum >= 0);
5833 iprod = 0.0;
5834 for(i = 0; i < vnum; i++){
5835 iprod += (double)avec[i] * (double)bvec[i];
5836 }
5837 aabs = 0.0;
5838 for(i = 0; i < vnum; i++){
5839 aabs += (double)avec[i] * (double)avec[i];
5840 }
5841 aabs = sqrt(aabs);
5842 babs = 0.0;
5843 for(i = 0; i < vnum; i++){
5844 babs += (double)bvec[i] * (double)bvec[i];
5845 }
5846 babs = sqrt(babs);
5847 if(iprod <= 0.0 || aabs < 1.0 || babs < 1.0) return 0.0;
5848 return iprod / (aabs * babs);
5849 }
5850
5851
5852
5853 /*************************************************************************************************
5854 * private objects
5855 *************************************************************************************************/
5856
5857
5858 /* Set the error code.
5859 `ecp' specifies the pointer to a variable to be assigned.
5860 `value' specifies the error code to be assgined.
5861 `line' specifies the number of the line where the error happened. */
est_set_ecode(int * ecp,int value,int line)5862 static void est_set_ecode(int *ecp, int value, int line){
5863 char buf[ESTPATHBUFSIZ];
5864 assert(ecp && line > 0);
5865 *ecp = value;
5866 if(dpdbgfd >= 0){
5867 fflush(stdout);
5868 fflush(stderr);
5869 sprintf(buf, "* est_set_ecode: %d: [%d] %s\n", line, value, est_err_msg(value));
5870 write(dpdbgfd, buf, strlen(buf));
5871 }
5872 }
5873
5874
/* Encode a string into hexadecimal.
   `str' specifies a string.
   The return value is the result hexadecimal string, in which each byte of the input is
   expressed as two upper case digits.  The region of the return value is allocated with
   CB_MALLOC. */
static char *est_hex_encode(const char *str){
  static const char digits[] = "0123456789ABCDEF";
  const unsigned char *rp;
  char *buf, *wp;
  assert(str);
  CB_MALLOC(buf, strlen(str) * 2 + 1);
  wp = buf;
  for(rp = (const unsigned char *)str; *rp != '\0'; rp++){
    /* high nibble first, then low nibble */
    *(wp++) = digits[*rp >> 4];
    *(wp++) = digits[*rp & 0x0f];
  }
  *wp = '\0';
  return buf;
}
5890
5891
/* Decode a hexadecimal string into original one.
   `str' specifies a hexadecimal string.  Only the digits '0' to '9' and the upper case
   letters 'A' to 'F' are decoded meaningfully; no validation is performed.  For a string of
   odd length, the terminating null character is read as the last digit.
   The return value is the original string.  The region of the return value is allocated
   with CB_MALLOC. */
static char *est_hex_decode(const char *str){
  char *buf, *wp;
  int pos, len, hi, lo;
  assert(str);
  len = strlen(str);
  CB_MALLOC(buf, len + 1);
  wp = buf;
  pos = 0;
  while(pos < len){
    hi = str[pos];
    lo = str[pos+1];
    /* anything not below 'A' is taken as a letter digit, everything else as a decimal digit */
    hi = hi >= 'A' ? hi - 'A' + 10 : hi - '0';
    lo = lo >= 'A' ? lo - 'A' + 10 : lo - '0';
    *wp = hi * 16 + lo;
    wp++;
    pos += 2;
  }
  *wp = '\0';
  return buf;
}
5909
5910
5911 /* Count the number of missing characters when converting.
5912 `ptr' specifies the pointer to a region.
5913 `size' specifies the size of the region.
5914 `icode' specifies the name of encoding of the input string.
5915 `ocode' specifies the name of encoding of the output string.
5916 The return value is the number of missing characters. */
est_enc_miss(const char * ptr,int size,const char * icode,const char * ocode)5917 static int est_enc_miss(const char *ptr, int size, const char *icode, const char *ocode){
5918 iconv_t ic;
5919 char obuf[ESTICCHECKSIZ], *wp, *rp;
5920 size_t isiz, osiz;
5921 int miss;
5922 assert(ptr && size >= 0 && icode && ocode);
5923 isiz = size;
5924 if((ic = iconv_open(ocode, icode)) == (iconv_t)-1) return ESTICMISSMAX;
5925 miss = 0;
5926 rp = (char *)ptr;
5927 while(isiz > 0){
5928 osiz = ESTICCHECKSIZ;
5929 wp = obuf;
5930 if(iconv(ic, (void *)&rp, &isiz, &wp, &osiz) == -1){
5931 if(errno == EILSEQ || errno == EINVAL){
5932 rp++;
5933 isiz--;
5934 miss++;
5935 if(miss >= ESTICMISSMAX) break;
5936 } else {
5937 break;
5938 }
5939 }
5940 }
5941 if(iconv_close(ic) == -1) return ESTICMISSMAX;
5942 return miss;
5943 }
5944
5945
/* Normalize a text.
   `utext' specifies a text whose encoding is UTF-16BE.  It is rewritten in place.
   `size' specifies the size of the text in bytes.  A trailing odd byte is ignored.
   `sp' specifies the pointer to a variable to which the size of the result is assigned.
   Control characters and several kinds of space become U+0020, some typographic marks and
   the listed fullwidth ASCII variants become their ASCII counterparts, and halfwidth katakana
   become the corresponding characters of the U+30xx block.  A halfwidth kana followed by a
   halfwidth voiced (U+FF9E) or semi-voiced (U+FF9F) sound mark is merged into one character.
   Every step writes one code unit and consumes at least one, so the result is never longer
   than the input. */
static void est_normalize_text(unsigned char *utext, int size, int *sp){
  int i, wi, b1, b2;
  assert(utext && size >= 0 && sp);
  /* `i' is the read offset and `wi' is the write offset; `wi' never passes `i' */
  wi = 0;
  for(i = 0; i < size - 1; i += 2){
    /* `b1' is the high byte and `b2' is the low byte of the code unit */
    b1 = utext[i];
    b2 = utext[i+1];
    if(b1 == 0x0){
      if(b2 <= 0x8 || (b2 >= 0x0e && b2 <= 0x1f)){
        /* control characters (0x09 to 0x0d are kept as they are) */
        utext[wi++] = 0x0;
        utext[wi++] = 0x20;
      } else if(b2 == 0xa0){
        /* no-break space */
        utext[wi++] = 0x0;
        utext[wi++] = 0x20;
      } else {
        /* (otherwise) */
        utext[wi++] = b1;
        utext[wi++] = b2;
      }
    } else if(b1 == 0x20){
      if(b2 == 0x2){
        /* en space */
        utext[wi++] = 0x0;
        utext[wi++] = 0x20;
      } else if(b2 == 0x3){
        /* em space */
        utext[wi++] = 0x0;
        utext[wi++] = 0x20;
      } else if(b2 == 0x9){
        /* thin space */
        utext[wi++] = 0x0;
        utext[wi++] = 0x20;
      } else if(b2 == 0x10){
        /* hyphen */
        utext[wi++] = 0x0;
        utext[wi++] = 0x2d;
      } else if(b2 == 0x15){
        /* horizontal bar */
        utext[wi++] = 0x0;
        utext[wi++] = 0x2d;
      } else if(b2 == 0x19){
        /* right single quotation mark, used as apostrophe */
        utext[wi++] = 0x0;
        utext[wi++] = 0x27;
      } else if(b2 == 0x33){
        /* double prime, used as double quotes */
        utext[wi++] = 0x0;
        utext[wi++] = 0x22;
      } else {
        /* (otherwise) */
        utext[wi++] = b1;
        utext[wi++] = b2;
      }
    } else if(b1 == 0x22){
      if(b2 == 0x12){
        /* minus sign */
        utext[wi++] = 0x0;
        utext[wi++] = 0x2d;
      } else {
        /* (otherwise) */
        utext[wi++] = b1;
        utext[wi++] = b2;
      }
    } else if(b1 == 0x30){
      if(b2 == 0x0){
        /* fullwidth (ideographic) space */
        utext[wi++] = 0x0;
        utext[wi++] = 0x20;
      } else {
        /* (otherwise) */
        utext[wi++] = b1;
        utext[wi++] = b2;
      }
    } else if(b1 == 0xff){
      /* halfwidth and fullwidth forms; the single-value tests below must precede the range
         tests because some of them lie inside those ranges */
      if(b2 == 0x01){
        /* fullwidth exclamation */
        utext[wi++] = 0x0;
        utext[wi++] = 0x21;
      } else if(b2 == 0x03){
        /* fullwidth number sign */
        utext[wi++] = 0x0;
        utext[wi++] = 0x23;
      } else if(b2 == 0x04){
        /* fullwidth dollar */
        utext[wi++] = 0x0;
        utext[wi++] = 0x24;
      } else if(b2 == 0x05){
        /* fullwidth percent */
        utext[wi++] = 0x0;
        utext[wi++] = 0x25;
      } else if(b2 == 0x06){
        /* fullwidth ampersand */
        utext[wi++] = 0x0;
        utext[wi++] = 0x26;
      } else if(b2 == 0x0a){
        /* fullwidth asterisk */
        utext[wi++] = 0x0;
        utext[wi++] = 0x2a;
      } else if(b2 == 0x0b){
        /* fullwidth plus */
        utext[wi++] = 0x0;
        utext[wi++] = 0x2b;
      } else if(b2 == 0x0c){
        /* fullwidth comma */
        utext[wi++] = 0x0;
        utext[wi++] = 0x2c;
      } else if(b2 == 0x0e){
        /* fullwidth period */
        utext[wi++] = 0x0;
        utext[wi++] = 0x2e;
      } else if(b2 == 0x0f){
        /* fullwidth slash */
        utext[wi++] = 0x0;
        utext[wi++] = 0x2f;
      } else if(b2 == 0x1a){
        /* fullwidth colon */
        utext[wi++] = 0x0;
        utext[wi++] = 0x3a;
      } else if(b2 == 0x1b){
        /* fullwidth semicolon */
        utext[wi++] = 0x0;
        utext[wi++] = 0x3b;
      } else if(b2 == 0x1d){
        /* fullwidth equal */
        utext[wi++] = 0x0;
        utext[wi++] = 0x3d;
      } else if(b2 == 0x1f){
        /* fullwidth question */
        utext[wi++] = 0x0;
        utext[wi++] = 0x3f;
      } else if(b2 == 0x20){
        /* fullwidth atmark */
        utext[wi++] = 0x0;
        utext[wi++] = 0x40;
      } else if(b2 == 0x3c){
        /* fullwidth backslash */
        utext[wi++] = 0x0;
        utext[wi++] = 0x5c;
      } else if(b2 == 0x3e){
        /* fullwidth circumflex */
        utext[wi++] = 0x0;
        utext[wi++] = 0x5e;
      } else if(b2 == 0x3f){
        /* fullwidth underscore */
        utext[wi++] = 0x0;
        utext[wi++] = 0x5f;
      } else if(b2 == 0x5c){
        /* fullwidth vertical line */
        utext[wi++] = 0x0;
        utext[wi++] = 0x7c;
      } else if(b2 >= 0x21 && b2 <= 0x3a){
        /* fullwidth capital alphabets */
        utext[wi++] = 0x0;
        utext[wi++] = b2 - 0x21 + 0x41;
      } else if(b2 >= 0x41 && b2 <= 0x5a){
        /* fullwidth small alphabets */
        utext[wi++] = 0x0;
        utext[wi++] = b2 - 0x41 + 0x61;
      } else if(b2 >= 0x10 && b2 <= 0x19){
        /* fullwidth numbers */
        utext[wi++] = 0x0;
        utext[wi++] = b2 - 0x10 + 0x30;
      } else if(b2 == 0x61){
        /* halfwidth full stop */
        utext[wi++] = 0x30;
        utext[wi++] = 0x2;
      } else if(b2 == 0x62){
        /* halfwidth left corner */
        utext[wi++] = 0x30;
        utext[wi++] = 0xc;
      } else if(b2 == 0x63){
        /* halfwidth right corner */
        utext[wi++] = 0x30;
        utext[wi++] = 0xd;
      } else if(b2 == 0x64){
        /* halfwidth comma */
        utext[wi++] = 0x30;
        utext[wi++] = 0x1;
      } else if(b2 == 0x65){
        /* halfwidth middle dot */
        utext[wi++] = 0x30;
        utext[wi++] = 0xfb;
      } else if(b2 == 0x66){
        /* halfwidth wo */
        utext[wi++] = 0x30;
        utext[wi++] = 0xf2;
      } else if(b2 >= 0x67 && b2 <= 0x6b){
        /* halfwidth small a-o */
        utext[wi++] = 0x30;
        utext[wi++] = (b2 - 0x67) * 2 + 0xa1;
      } else if(b2 >= 0x6c && b2 <= 0x6e){
        /* halfwidth small ya-yo */
        utext[wi++] = 0x30;
        utext[wi++] = (b2 - 0x6c) * 2 + 0xe3;
      } else if(b2 == 0x6f){
        /* halfwidth small tu */
        utext[wi++] = 0x30;
        utext[wi++] = 0xc3;
      } else if(b2 == 0x70){
        /* halfwidth prolonged mark */
        utext[wi++] = 0x30;
        utext[wi++] = 0xfc;
      } else if(b2 >= 0x71 && b2 <= 0x75){
        /* halfwidth a-o */
        utext[wi++] = 0x30;
        utext[wi++] = (b2 - 0x71) * 2 + 0xa2;
        /* only "u" takes a voiced mark, giving U+30F4; the mark itself is consumed */
        if(i + 2 < size - 1 && b2 == 0x73 && utext[i+2] == 0xff && utext[i+3] == 0x9e){
          utext[wi-1] = 0xf4;
          i += 2;
        }
      } else if(b2 >= 0x76 && b2 <= 0x7a){
        /* halfwidth ka-ko */
        utext[wi++] = 0x30;
        utext[wi++] = (b2 - 0x76) * 2 + 0xab;
        /* a following voiced mark selects the next code point and is consumed */
        if(i + 2 < size - 1 && utext[i+2] == 0xff && utext[i+3] == 0x9e){
          utext[wi-1] += 1;
          i += 2;
        }
      } else if(b2 >= 0x7b && b2 <= 0x7f){
        /* halfwidth sa-so */
        utext[wi++] = 0x30;
        utext[wi++] = (b2 - 0x7b) * 2 + 0xb5;
        if(i + 2 < size - 1 && utext[i+2] == 0xff && utext[i+3] == 0x9e){
          utext[wi-1] += 1;
          i += 2;
        }
      } else if(b2 >= 0x80 && b2 <= 0x84){
        /* halfwidth ta-to; one extra step from "tu" on, skipping the small tu at U+30C3 */
        utext[wi++] = 0x30;
        utext[wi++] = (b2 - 0x80) * 2 + 0xbf + (b2 >= 0x82 ? 1 : 0);
        if(i + 2 < size - 1 && utext[i+2] == 0xff && utext[i+3] == 0x9e){
          utext[wi-1] += 1;
          i += 2;
        }
      } else if(b2 >= 0x85 && b2 <= 0x89){
        /* halfwidth na-no */
        utext[wi++] = 0x30;
        utext[wi++] = b2 - 0x85 + 0xca;
      } else if(b2 >= 0x8a && b2 <= 0x8e){
        /* halfwidth ha-ho; plain, voiced and semi-voiced forms are adjacent code points */
        utext[wi++] = 0x30;
        utext[wi++] = (b2 - 0x8a) * 3 + 0xcf;
        if(i + 2 < size - 1){
          if(utext[i+2] == 0xff && utext[i+3] == 0x9e){
            utext[wi-1] += 1;
            i += 2;
          } else if(utext[i+2] == 0xff && utext[i+3] == 0x9f){
            utext[wi-1] += 2;
            i += 2;
          }
        }
      } else if(b2 >= 0x8f && b2 <= 0x93){
        /* halfwidth ma-mo */
        utext[wi++] = 0x30;
        utext[wi++] = b2 - 0x8f + 0xde;
      } else if(b2 >= 0x94 && b2 <= 0x96){
        /* halfwidth ya-yo */
        utext[wi++] = 0x30;
        utext[wi++] = (b2 - 0x94) * 2 + 0xe4;
      } else if(b2 >= 0x97 && b2 <= 0x9b){
        /* halfwidth ra-ro */
        utext[wi++] = 0x30;
        utext[wi++] = b2 - 0x97 + 0xe9;
      } else if(b2 == 0x9c){
        /* halfwidth wa */
        utext[wi++] = 0x30;
        utext[wi++] = 0xef;
      } else if(b2 == 0x9d){
        /* halfwidth n */
        utext[wi++] = 0x30;
        utext[wi++] = 0xf3;
      } else {
        /* (otherwise) */
        utext[wi++] = b1;
        utext[wi++] = b2;
      }
    } else {
      /* (otherwise) */
      utext[wi++] = b1;
      utext[wi++] = b2;
    }
  }
  *sp = wi;
}
6236
6237
/* Canonicalize a text for search keys.
   `utext' specifies a text whose encoding is UTF-16BE.  It is rewritten in place and its size
   does not change.
   `size' specifies the size of the text in bytes.  A trailing odd byte is left untouched.
   `funcspc' specifies whether to allow functional space characters.  If it is false, every
   character below U+0020 is replaced by a space.
   ASCII, Latin-1, Latin Extended-A, Greek and Cyrillic capital letters are lowered, accented
   Latin letters are folded to plain ASCII letters, and U+FFF0 to U+FFFF become a space. */
static void est_canonicalize_text(unsigned char *utext, int size, int funcspc){
  int i;
  assert(utext && size >= 0);
  /* Both bytes of a code unit are accessed, so stop while a whole unit is still inside the
     region.  Testing only `i < size' would touch the byte after the region when `size' is
     odd. */
  for(i = 0; i + 1 < size; i += 2){
    if(utext[i] == 0x0){
      if(utext[i+1] < ' '){
        /* functional spaces */
        if(!funcspc) utext[i+1] = ' ';
      } else if(utext[i+1] >= 'A' && utext[i+1] <= 'Z'){
        /* ascii */
        utext[i+1] += 'a' - 'A';
      } else if(utext[i+1] >= 0xc0){
        /* latin-1 supplement: lower the capital first, then strip the accent */
        if((utext[i+1] >= 0xc0 && utext[i+1] <= 0xd6) ||
           (utext[i+1] >= 0xd8 && utext[i+1] <= 0xde)) utext[i+1] += 0x20;
        if(utext[i+1] >= 0xe0 && utext[i+1] <= 0xe5){
          utext[i+1] = 'a';
        } else if(utext[i+1] == 0xe7){
          utext[i+1] = 'c';
        } else if(utext[i+1] >= 0xe8 && utext[i+1] <= 0xeb){
          utext[i+1] = 'e';
        } else if(utext[i+1] >= 0xec && utext[i+1] <= 0xef){
          utext[i+1] = 'i';
        } else if(utext[i+1] == 0xf1){
          utext[i+1] = 'n';
        } else if((utext[i+1] >= 0xf2 && utext[i+1] <= 0xf6) || utext[i+1] == 0xf8){
          utext[i+1] = 'o';
        } else if(utext[i+1] >= 0xf9 && utext[i+1] <= 0xfc){
          utext[i+1] = 'u';
        } else if(utext[i+1] == 0xfd || utext[i+1] == 0xff){
          utext[i+1] = 'y';
        }
      }
    } else if(utext[i] == 0x1){
      /* latin extended-a: capitals and smalls alternate, and the parity of the capital
         differs from range to range, so move each capital onto the following small */
      if((utext[i+1] <= 0x36 && utext[i+1] % 2 == 0) ||
         (utext[i+1] >= 0x39 && utext[i+1] <= 0x47 && utext[i+1] % 2 == 1) ||
         (utext[i+1] >= 0x4a && utext[i+1] <= 0x76 && utext[i+1] % 2 == 0) ||
         (utext[i+1] >= 0x79 && utext[i+1] <= 0x7d && utext[i+1] % 2 == 1))
        utext[i+1] += 0x1;
      /* fold each accented letter to its plain ASCII base letter */
      if(utext[i+1] <= 0x05){
        utext[i] = 0x0;
        utext[i+1] = 'a';
      } else if(utext[i+1] >= 0x06 && utext[i+1] <= 0x0d){
        utext[i] = 0x0;
        utext[i+1] = 'c';
      } else if(utext[i+1] >= 0x0e && utext[i+1] <= 0x11){
        utext[i] = 0x0;
        utext[i+1] = 'd';
      } else if(utext[i+1] >= 0x12 && utext[i+1] <= 0x1b){
        utext[i] = 0x0;
        utext[i+1] = 'e';
      } else if(utext[i+1] >= 0x1c && utext[i+1] <= 0x23){
        utext[i] = 0x0;
        utext[i+1] = 'g';
      } else if(utext[i+1] >= 0x24 && utext[i+1] <= 0x27){
        utext[i] = 0x0;
        utext[i+1] = 'h';
      } else if(utext[i+1] >= 0x28 && utext[i+1] <= 0x31){
        utext[i] = 0x0;
        utext[i+1] = 'i';
      } else if(utext[i+1] >= 0x34 && utext[i+1] <= 0x35){
        utext[i] = 0x0;
        utext[i+1] = 'j';
      } else if(utext[i+1] >= 0x36 && utext[i+1] <= 0x38){
        utext[i] = 0x0;
        utext[i+1] = 'k';
      } else if(utext[i+1] >= 0x39 && utext[i+1] <= 0x42){
        utext[i] = 0x0;
        utext[i+1] = 'l';
      } else if(utext[i+1] >= 0x43 && utext[i+1] <= 0x4b){
        utext[i] = 0x0;
        utext[i+1] = 'n';
      } else if(utext[i+1] >= 0x4c && utext[i+1] <= 0x51){
        utext[i] = 0x0;
        utext[i+1] = 'o';
      } else if(utext[i+1] >= 0x54 && utext[i+1] <= 0x59){
        utext[i] = 0x0;
        utext[i+1] = 'r';
      } else if((utext[i+1] >= 0x5a && utext[i+1] <= 0x61) || utext[i+1] == 0x7f){
        utext[i] = 0x0;
        utext[i+1] = 's';
      } else if(utext[i+1] >= 0x62 && utext[i+1] <= 0x67){
        utext[i] = 0x0;
        utext[i+1] = 't';
      } else if(utext[i+1] >= 0x68 && utext[i+1] <= 0x73){
        utext[i] = 0x0;
        utext[i+1] = 'u';
      } else if(utext[i+1] >= 0x74 && utext[i+1] <= 0x75){
        utext[i] = 0x0;
        utext[i+1] = 'w';
      } else if(utext[i+1] >= 0x76 && utext[i+1] <= 0x78){
        utext[i] = 0x0;
        utext[i+1] = 'y';
      } else if(utext[i+1] >= 0x79 && utext[i+1] <= 0x7e){
        utext[i] = 0x0;
        utext[i+1] = 'z';
      }
    } else if(utext[i] == 0x3){
      /* greek */
      if(utext[i+1] >= 0x91 && utext[i+1] <= 0xa9) utext[i+1] += 0x20;
    } else if(utext[i] == 0x4){
      /* cyrillic */
      if(utext[i+1] >= 0x10 && utext[i+1] <= 0x2f){
        utext[i+1] += 0x20;
      } else if(utext[i+1] <= 0x0f){
        utext[i+1] += 0x50;
      }
    } else if(utext[i] == 0xff){
      /* special */
      if(utext[i+1] >= 0xf0){
        utext[i] = 0x0;
        utext[i+1] = ' ';
      }
    }
  }
}
6358
6359
6360 /* Categorize a character.
6361 `c' specifies the UCS number of a character.
6362 The return value is the category of the character. */
est_char_category(int c)6363 static int est_char_category(int c){
6364 /* ascii space */
6365 if(c <= 0x0020) return ESTSPACECHR;
6366 /* ascii alnum */
6367 if((c >= 0x0030 && c <= 0x0039) || (c >= 0x0041 && c <= 0x005a) ||
6368 (c >= 0x0061 && c <= 0x007a)) return ESTWESTALPH;
6369 /* latin */
6370 if((c >= 0x00c0 && c <= 0x00ff && c != 0x00d7 && c != 0x00f7) || (c >= 0x0100 && c <= 0x017f))
6371 return ESTWESTALPH;
6372 /* arabic and syrian */
6373 if(c >= 0x0600 && c <= 0x08ff) return ESTEASTALPH;
6374 /* south and south east asia */
6375 if((c >= 0x0900 && c <= 0x109f) || (c >= 0x1700 && c <= 0x1cff)) return ESTEASTALPH;
6376 /* cjk and surrogates */
6377 if((c >= 0x1100 && c <= 0x11ff) || (c >= 0x2e80 && c <= 0xdfff) ||
6378 (c >= 0xf900 && c <= 0xfaff) || (c >= 0xff00 && c <= 0xffef)) return ESTEASTALPH;
6379 /* asian presentation forms */
6380 if((c >= 0xfb50 && c <= 0xfdff) || (c >= 0xfe30 && c <= 0xfe4f) ||
6381 (c >= 0xfe70 && c <= 0xfeff)) return ESTEASTALPH;
6382 /* others */
6383 return ESTDELIMCHR;
6384 }
6385
6386
6387 /* Categorize a character for perfect N-gram analyzer.
6388 `c' specifies the UCS number of a character.
6389 The return value is the category of the character. */
est_char_category_perfng(int c)6390 static int est_char_category_perfng(int c){
6391 if(c <= 0x0020) return ESTSPACECHR;
6392 return ESTEASTALPH;
6393 }
6394
6395
6396 /* Categorize a character for character category analyzer.
6397 `c' specifies the UCS number of a character.
6398 The return value is the category of the character. */
est_char_category_chrcat(int c)6399 static int est_char_category_chrcat(int c){
6400 /* ascii space */
6401 if(c <= 0x0020) return ESTSPACECHR;
6402 /* ascii alnum */
6403 if((c >= 0x0030 && c <= 0x0039) || (c >= 0x0041 && c <= 0x005a) ||
6404 (c >= 0x0061 && c <= 0x007a)) return ESTWESTALPH;
6405 /* latin */
6406 if((c >= 0x00c0 && c <= 0x00ff && c != 0x00d7 && c != 0x00f7) || (c >= 0x0100 && c <= 0x017f))
6407 return ESTWESTALPH;
6408 /* arabic and syrian */
6409 if(c >= 0x0600 && c <= 0x08ff) return ESTEASTALPH;
6410 /* south and south east asia */
6411 if((c >= 0x0900 && c <= 0x109f) || (c >= 0x1700 && c <= 0x1cff)) return ESTEASTALPH;
6412 /* hiragana */
6413 if(c >= 0x3040 && c <= 0x309f) return ESTHIRAGANA;
6414 /* katakana */
6415 if(c >= 0x30a0 && c <= 0x30ff) return ESTKATAKANA;
6416 /* hangul */
6417 if((c >= 0x1100 && c <= 0x11ff) || (c >= 0x3130 && c <= 0x318f) ||
6418 (c >= 0xac00 && c <= 0xd7af)) return ESTHANGUL;
6419 /* kanji */
6420 if(c >= 0x4e00 && c <= 0x9faf) return ESTKANJI;
6421 /* other cjk and surrogates */
6422 if((c >= 0x2e80 && c <= 0xdfff) || (c >= 0xf900 && c <= 0xfaff) ||
6423 (c >= 0xff00 && c <= 0xffef)) return ESTEASTALPH;
6424 /* asian presentation forms */
6425 if((c >= 0xfb50 && c <= 0xfdff) || (c >= 0xfe30 && c <= 0xfe4f) ||
6426 (c >= 0xfe70 && c <= 0xfeff)) return ESTEASTALPH;
6427 /* others */
6428 return ESTDELIMCHR;
6429 }
6430
6431
/* Make a snippet of an arbitrary string.
   `str' specifies the string from which the snippet is made.
   `len' specifies the length of the string.
   `words' specifies a list object of words to be highlighted.  Empty words and the word
   equal to ESTOPUVSET are ignored.
   `wwidth' specifies whole width of the result.
   `hwidth' specifies width of strings picked up from the beginning of the text.
   `awidth' specifies width of strings picked up around each highlighted word.  If it is not
   more than 0, only the beginning of the text is picked up.
   The return value is a snippet string of the string.  Each picked up fragment is followed
   by a line feed.  Widths are counted as 2 for a character categorized as ESTEASTALPH by
   est_char_category and as 1 for any other character. */
static char *est_make_snippet(const char *str, int len, const CBLIST *words,
                              int wwidth, int hwidth, int awidth){
  CBDATUM *res;
  CBMAP *counts;
  CBLIST *rwords;
  const char *word, *cval;
  const unsigned char *rword;
  unsigned char *rtext, *ctext;
  int i, j, k, bi, size, wsiz, rwsiz, mywidth, awsiz, csiz;
  assert(str && len >= 0 && words && wwidth >= 0 && hwidth >= 0 && awidth >= 0);
  CB_DATUMOPEN(res);
  CB_LISTOPEN(rwords);
  /* build the list of canonicalized words in the internal two-byte form */
  for(i = 0; i < CB_LISTNUM(words); i++){
    word = CB_LISTVAL2(words, i, wsiz);
    if(wsiz < 1 || !strcmp(word, ESTOPUVSET)) continue;
    rtext = (unsigned char *)est_uconv_in(word, wsiz, &size);
    est_canonicalize_text(rtext, size, TRUE);
    /* the buffer is not released here; presumably the list takes it over -- TODO confirm */
    CB_LISTPUSHBUF(rwords, (char *)rtext, size);
  }
  /* `rtext' keeps the raw text and `ctext' is a canonicalized copy of the same size, so an
     offset into one is valid for the other */
  rtext = (unsigned char *)est_uconv_in(str, len, &size);
  ctext = (unsigned char *)cbmemdup((char *)rtext, size);
  est_canonicalize_text(ctext, size, FALSE);
  /* the head fragment: three times wider when there is no word to highlight, but never wider
     than the whole width */
  mywidth = hwidth;
  if(CB_LISTNUM(rwords) < 1) mywidth *= 3;
  if(mywidth > wwidth) mywidth = wwidth;
  for(i = 0; i < size && mywidth > 0; i += 2){
    mywidth -= est_char_category(rtext[i] * 0x100 + rtext[i+1]) == ESTEASTALPH ? 2 : 1;
  }
  /* allowance so that a word starting inside the head may extend beyond its end */
  awsiz = size - i;
  if(awsiz > ESTWORDMAXLEN) awsiz = ESTWORDMAXLEN;
  est_snippet_add_text(rtext, ctext, i, awsiz, res, rwords);
  wwidth -= hwidth;
  /* `bi' is the lower bound of the next fragment */
  bi = i + 2;
  CB_DATUMCAT(res, "\n", 1);
  if(awidth > 0){
    /* `counts' records one "*" per fragment already picked up for each word */
    counts = cbmapopenex(ESTMINIBNUM);
    for(i = bi; i < size && wwidth >= 0; i += 2){
      for(j = 0; j < CB_LISTNUM(rwords); j++){
        rword = (unsigned char *)CB_LISTVAL2(rwords, j, rwsiz);
        /* take the word if it matches here and has not been picked up too often: twice
           while much width remains, else once */
        if(est_str_fwmatch_wide(ctext + i, size - i, rword, rwsiz) > 0 &&
           (!(cval = cbmapget(counts, (char *)rword, rwsiz, &csiz)) ||
            csiz < (wwidth > awidth * 1.2 ? 2 : 1))){
          cbmapputcat(counts, (char *)rword, rwsiz, "*", 1);
          /* once every word has been picked up, start counting afresh */
          if(cbmaprnum(counts) >= CB_LISTNUM(rwords)){
            cbmapclose(counts);
            counts = cbmapopenex(ESTMINIBNUM);
          }
          /* walk backward from the match for about half of `awidth', not below `bi' */
          mywidth = awidth / 2 + 1;
          for(k = i - 2; k >= bi && mywidth >= 0; k -= 2){
            mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
          }
          bi = k;
          /* walk forward from the end of the match for about half of `awidth' */
          mywidth = awidth / 2 + 1;
          for(k = i + rwsiz + 2; k < size && mywidth >= 0; k += 2){
            mywidth -= est_char_category(rtext[k] * 0x100 + rtext[k+1]) == ESTEASTALPH ? 2 : 1;
          }
          if(k > size) k = size;
          est_snippet_add_text(rtext + bi, ctext + bi, k - bi, 0, res, rwords);
          wwidth -= awidth + rwsiz / 2;
          /* continue scanning after the fragment; the loop increment moves `i' onto `bi' */
          bi = k + 2;
          i = bi - 2;
          CB_DATUMCAT(res, "\n", 1);
          break;
        }
      }
    }
    cbmapclose(counts);
  }
  free(ctext);
  free(rtext);
  CB_LISTCLOSE(rwords);
  return cbdatumtomalloc(res, NULL);
}
6511
6512
/* Check whether a string is composed of CJK characters only.
   `str' specifies a string of UTF-8.
   The return value is whether the string is composed of CJK characters only.  Precisely, it
   is false as soon as a byte which is ASCII, a stray trailing byte, or the leading byte of a
   two-byte sequence is found where a character should begin.  So, every character of U+0800
   or above is accepted, not only CJK ones.  Trailing bytes are not validated.  The check
   stops without failing at a truncated sequence and at the bytes 0xFE and 0xFF.  The return
   value for an empty string is true. */
static int est_check_cjk_only(const char *str){
  const unsigned char *rp, *ep;
  int seqlen;
  assert(str);
  rp = (const unsigned char *)str;
  ep = rp + strlen(str);
  while(rp < ep){
    /* Classify by the leading byte.  Each bound is the first value of the next class:
       0xE0 opens three-byte sequences, 0xF0 four-byte ones, and 0xF8 and 0xFC the obsolete
       five-byte and six-byte ones. */
    if(*rp < 0xe0){
      return FALSE;
    } else if(*rp < 0xf0){
      seqlen = 3;
    } else if(*rp < 0xf8){
      seqlen = 4;
    } else if(*rp < 0xfc){
      seqlen = 5;
    } else if(*rp < 0xfe){
      seqlen = 6;
    } else {
      break;
    }
    /* compare the remaining length rather than forming a pointer before the string */
    if(ep - rp < seqlen) break;
    rp += seqlen;
  }
  return TRUE;
}
6544
6545
/* Convert a simplified phrase into complete form.
   `sphrase' specifies a simplified phrase.  A space or "&" stands for " AND ", "|" stands
   for " OR " and "!" stands for " ANDNOT ".  Text between double quotes is copied as it is.
   A token beginning or ending with "*" is prefixed with a wildcard operator.
   The return value is the complete form of the phrase.  Its region is allocated by
   cbdatumtomalloc. */
static char *est_phrase_from_simple(const char *sphrase){
  CBDATUM *datum;
  const char *oper, *rp, *pv;
  unsigned char *utext;
  char *rtext;
  int size, quote, lw;
  assert(sphrase);
  CB_DATUMOPEN(datum);
  /* normalize and canonicalize in the internal two-byte form, convert back, and squeeze
     spaces */
  utext = (unsigned char *)est_uconv_in(sphrase, strlen(sphrase), &size);
  est_normalize_text(utext, size, &size);
  est_canonicalize_text(utext, size, FALSE);
  rtext = est_uconv_out((char *)utext, size, NULL);
  cbstrsqzspc(rtext);
  /* `quote' is whether the cursor is between double quotes, `oper' is the operator waiting
     to be written before the next term, and `lw' is whether the previous character
     belonged to a word */
  quote = FALSE;
  oper = NULL;
  lw = FALSE;
  for(rp = rtext; *rp != '\0'; rp++){
    if(*rp == '"'){
      /* a quote is not copied; it flushes the pending operator and toggles the mode */
      if(oper){
        CB_DATUMCAT(datum, oper, strlen(oper));
        oper = NULL;
      }
      quote = !quote;
      continue;
    }
    if(quote){
      CB_DATUMCAT(datum, rp, 1);
      continue;
    }
    switch(*rp){
    case ' ':
      /* a space means AND only if no explicit operator is pending */
      if(!oper) oper = " AND ";
      lw = FALSE;
      break;
    case '&':
      oper = " AND ";
      lw = FALSE;
      break;
    case '|':
      oper = " OR ";
      lw = FALSE;
      break;
    case '!':
      oper = " ANDNOT ";
      lw = FALSE;
      break;
    default:
      if(oper){
        CB_DATUMCAT(datum, oper, strlen(oper));
        oper = NULL;
      }
      if(!lw){
        /* at the first character of a token: find the end of the token and write a wildcard
           operator if the token of two or more characters begins or ends with "*" */
        pv = rp;
        while(*pv != '\0' && *pv != ' '){
          pv++;
        }
        if(pv > rp + 1 && pv[-1] == '*'){
          if(rp[0] == '*'){
            CB_DATUMCAT(datum, ESTOPWCRX " ", strlen(ESTOPWCRX) + 1);
          } else {
            CB_DATUMCAT(datum, ESTOPWCBW " ", strlen(ESTOPWCBW) + 1);
          }
        } else if(pv > rp + 1 && rp[0] == '*'){
          /* the token can not end with "*" here because the branch above has handled that
             case, so the first arm of the following test is never taken */
          if(pv[-1] == '*'){
            CB_DATUMCAT(datum, ESTOPWCRX " ", strlen(ESTOPWCRX) + 1);
          } else {
            CB_DATUMCAT(datum, ESTOPWCEW " ", strlen(ESTOPWCEW) + 1);
          }
        }
      }
      /* drop "*" at the beginning and at the end of a token, but keep it in the middle */
      if(*rp != '*' || (lw && rp[1] != '\0' && rp[1] != ' ')) CB_DATUMCAT(datum, rp, 1);
      lw = TRUE;
    }
  }
  free(rtext);
  free(utext);
  return cbdatumtomalloc(datum, NULL);
}
6627
6628
6629 /* Convert a rough phrase into complete form.
6630 `rphrase' specifies a simplified phrase.
6631 The return value is the complete form of the phrase. */
est_phrase_from_rough(const char * rphrase)6632 static char *est_phrase_from_rough(const char *rphrase){
6633 CBDATUM *datum;
6634 const char *oper, *rp;
6635 unsigned char *utext;
6636 char *rtext;
6637 int size, quote, lw;
6638 assert(rphrase);
6639 CB_DATUMOPEN(datum);
6640 utext = (unsigned char *)est_uconv_in(rphrase, strlen(rphrase), &size);
6641 est_normalize_text(utext, size, &size);
6642 est_canonicalize_text(utext, size, FALSE);
6643 rtext = est_uconv_out((char *)utext, size, NULL);
6644 cbstrsqzspc(rtext);
6645 quote = FALSE;
6646 oper = NULL;
6647 lw = FALSE;
6648 for(rp = rtext; *rp != '\0'; rp++){
6649 if(*rp == '"'){
6650 if(oper){
6651 CB_DATUMCAT(datum, oper, strlen(oper));
6652 oper = NULL;
6653 }
6654 quote = !quote;
6655 continue;
6656 }
6657 if(quote){
6658 CB_DATUMCAT(datum, rp, 1);
6659 continue;
6660 }
6661 switch(*rp){
6662 case ' ':
6663 if(!oper) oper = " AND ";
6664 lw = FALSE;
6665 break;
6666 case '&':
6667 oper = " AND ";
6668 lw = FALSE;
6669 break;
6670 case '|':
6671 oper = " OR ";
6672 lw = FALSE;
6673 break;
6674 case '-':
6675 if(lw){
6676 CB_DATUMCAT(datum, rp, 1);
6677 } else {
6678 oper = " ANDNOT ";
6679 }
6680 break;
6681 default:
6682 if(oper){
6683 CB_DATUMCAT(datum, oper, strlen(oper));
6684 oper = NULL;
6685 }
6686 CB_DATUMCAT(datum, rp, 1);
6687 lw = TRUE;
6688 }
6689 }
6690 free(rtext);
6691 free(utext);
6692 return cbdatumtomalloc(datum, NULL);
6693 }
6694
6695
6696 /* Convert a union phrase into complete form.
6697 `uphrase' specifies a simplified phrase.
6698 The return value is the complete form of the phrase. */
est_phrase_from_union(const char * uphrase)6699 static char *est_phrase_from_union(const char *uphrase){
6700 CBDATUM *datum;
6701 CBLIST *terms;
6702 const char *term;
6703 unsigned char *utext;
6704 char *rtext;
6705 int i, size;
6706 assert(uphrase);
6707 CB_DATUMOPEN(datum);
6708 utext = (unsigned char *)est_uconv_in(uphrase, strlen(uphrase), &size);
6709 est_normalize_text(utext, size, &size);
6710 est_canonicalize_text(utext, size, FALSE);
6711 rtext = est_uconv_out((char *)utext, size, NULL);
6712 cbstrsqzspc(rtext);
6713 terms = cbsplit(rtext, -1, " ");
6714 for(i = 0; i < CB_LISTNUM(terms); i++){
6715 term = CB_LISTVAL2(terms, i, size);
6716 if(size < 1) continue;
6717 if(CB_DATUMSIZE(datum) > 0) CB_DATUMCAT(datum, " OR ", 4);
6718 CB_DATUMCAT(datum, term, size);
6719 }
6720 CB_LISTCLOSE(terms);
6721 free(rtext);
6722 free(utext);
6723 return cbdatumtomalloc(datum, NULL);
6724 }
6725
6726
6727 /* Convert a intersection phrase into complete form.
6728 `iphrase' specifies a simplified phrase.
6729 The return value is the complete form of the phrase. */
est_phrase_from_isect(const char * iphrase)6730 static char *est_phrase_from_isect(const char *iphrase){
6731 CBDATUM *datum;
6732 CBLIST *terms;
6733 const char *term;
6734 unsigned char *utext;
6735 char *rtext;
6736 int i, size;
6737 assert(iphrase);
6738 CB_DATUMOPEN(datum);
6739 utext = (unsigned char *)est_uconv_in(iphrase, strlen(iphrase), &size);
6740 est_normalize_text(utext, size, &size);
6741 est_canonicalize_text(utext, size, FALSE);
6742 rtext = est_uconv_out((char *)utext, size, NULL);
6743 cbstrsqzspc(rtext);
6744 terms = cbsplit(rtext, -1, " ");
6745 for(i = 0; i < CB_LISTNUM(terms); i++){
6746 term = CB_LISTVAL2(terms, i, size);
6747 if(size < 1) continue;
6748 if(CB_DATUMSIZE(datum) > 0) CB_DATUMCAT(datum, " AND ", 5);
6749 CB_DATUMCAT(datum, term, size);
6750 }
6751 CB_LISTCLOSE(terms);
6752 free(rtext);
6753 free(utext);
6754 return cbdatumtomalloc(datum, NULL);
6755 }
6756
6757
6758 /* Add a string to a snippet.
6759 `rtext' specifies a raw text.
6760 `ctext' specifies a canonicalized text.
6761 `size' specifies the size of the raw text and the canonicalized text.
6762 `awsiz' specifies the size of allowance for matching words.
6763 `res' specifies a datum object for the result.
6764 `rwords' specifies a list object of raw words. */
est_snippet_add_text(const unsigned char * rtext,const unsigned char * ctext,int size,int awsiz,CBDATUM * res,const CBLIST * rwords)6765 static void est_snippet_add_text(const unsigned char *rtext, const unsigned char *ctext,
6766 int size, int awsiz, CBDATUM *res, const CBLIST *rwords){
6767 const unsigned char *rword;
6768 char *orig;
6769 int i, j, bi, rwsiz, step, osiz;
6770 bi = 0;
6771 for(i = 0; i < size; i += 2){
6772 for(j = 0; j < CB_LISTNUM(rwords); j++){
6773 rword = (unsigned char *)CB_LISTVAL2(rwords, j, rwsiz);
6774 if((step = est_str_fwmatch_wide(ctext + i, size + awsiz - i, rword, rwsiz)) > 0){
6775 if(i - bi > 0){
6776 orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
6777 CB_DATUMCAT(res, orig, osiz);
6778 CB_DATUMCAT(res, "\n", 1);
6779 free(orig);
6780 }
6781 orig = est_uconv_out((char *)rtext + i, step, &osiz);
6782 CB_DATUMCAT(res, orig, osiz);
6783 free(orig);
6784 CB_DATUMCAT(res, "\t", 1);
6785 orig = est_uconv_out((char *)rword, rwsiz, &osiz);
6786 CB_DATUMCAT(res, orig, osiz);
6787 free(orig);
6788 CB_DATUMCAT(res, "\n", 1);
6789 bi = i + step;
6790 i = bi - 2;
6791 break;
6792 }
6793 }
6794 }
6795 if(i - bi > 0){
6796 orig = est_uconv_out((char *)rtext + bi, i - bi, &osiz);
6797 CB_DATUMCAT(res, orig, osiz);
6798 CB_DATUMCAT(res, "\n", 1);
6799 free(orig);
6800 }
6801 }
6802
6803
/* Check whether a string begins with a key.
   `str' specifies a target string whose encoding is UTF-16BE.
   `size' specifies the size of the target string in bytes.
   `key' specifies a key string whose encoding is UTF-16BE.
   `ksiz' specifies the size of the key string in bytes.
   Two-byte units whose high byte is 0x00 and whose low byte is 0x20 or less are skipped in
   both the target and the key, except that a target beginning with such a unit never matches.
   Only complete two-byte units are examined; a trailing odd byte of either region is ignored.
   The return value is the number of bytes of the corresponding region of the target string,
   or 0 if the target string does not begin with the key. */
static int est_str_fwmatch_wide(const unsigned char *str, int size,
                                const unsigned char *key, int ksiz){
  int si, ki;
  assert(str && size >= 0 && key && ksiz >= 0);
  if(size < 2 || ksiz < 2 || (str[0] == 0x0 && str[1] <= 0x20)) return 0;
  si = 0;
  ki = 0;
  /* `ki + 1' and `si + 1' keep the second byte of each unit inside its region */
  while(ki + 1 < ksiz){
    if(si + 1 >= size) return 0;
    if(str[si] == 0x0 && str[si+1] <= 0x20){
      si += 2;
      continue;
    }
    if(key[ki] == 0x0 && key[ki+1] <= 0x20){
      ki += 2;
      continue;
    }
    if(str[si] != key[ki] || str[si+1] != key[ki+1]) return 0;
    si += 2;
    ki += 2;
  }
  return si;
}
6835
6836
/* Find the first occurrence of a substring ignoring space characters.
   `haystack' specifies a target string.
   `needle' specifies a substring.
   Bytes greater than 0x00 and not greater than ' ' are skipped in both strings while
   comparing.  A needle made only of such bytes matches at the head of the target.
   The return value is the pointer to the first occurrence, or `NULL' if there is none. */
static char *est_strstr_sparse(const char *haystack, const char *needle){
  const char *start, *hcur, *ncur;
  assert(haystack && needle);
  /* drop the leading spaces of the needle so that its first byte can anchor the search */
  for(; *needle > '\0' && *needle <= ' '; needle++){
    ;
  }
  if(*needle == '\0') return (char *)haystack;
  for(start = strchr(haystack, *needle); start != NULL; start = strchr(start + 1, *needle)){
    hcur = start;
    ncur = needle;
    for(;;){
      while(*hcur > '\0' && *hcur <= ' '){
        hcur++;
      }
      while(*ncur > '\0' && *ncur <= ' '){
        ncur++;
      }
      /* the whole needle was consumed: this candidate is a match */
      if(*ncur == '\0') return (char *)start;
      if(*hcur == '\0' || *hcur != *ncur) break;
      hcur++;
      ncur++;
    }
  }
  return NULL;
}
6867
6868
6869 /* Get the last ID number in an index record.
6870 `vbuf' specifies the pointer to the value of a record.
6871 `vsiz' specifies the size of the value.
6872 `smode' specifies a mode of score type.
6873 The return value is the last ID number in a record. */
est_idx_rec_last_id(const char * vbuf,int vsiz,int smode)6874 static int est_idx_rec_last_id(const char *vbuf, int vsiz, int smode){
6875 const char *rp, *ep, *sp;
6876 int cid, vnum, vstep;
6877 assert(vbuf && vsiz >= 0);
6878 cid = 0;
6879 rp = vbuf;
6880 ep = vbuf + vsiz;
6881 while(rp < ep){
6882 EST_READ_VNUMBUF(rp, vnum, vstep);
6883 cid += vnum + 1;
6884 rp += vstep;
6885 sp = rp;
6886 switch(smode){
6887 case ESTDFSCVOID:
6888 break;
6889 default:
6890 rp++;
6891 break;
6892 case ESTDFSCINT:
6893 case ESTDFSCASIS:
6894 rp += sizeof(int);
6895 break;
6896 }
6897 while(*rp != 0x00){
6898 rp += 2;
6899 }
6900 rp++;
6901 }
6902 return cid;
6903 }
6904
6905
6906 /* Encode a raw index record into a gap form.
6907 `datum' specifies a datum to store the result.
6908 `vbuf' specifies the pointer to the value of a raw index record.
6909 `vsiz' specifies the size of the value of the record.
6910 `lid' specifies the last ID number in the existing record.
6911 `smode' specifies a mode of score type. */
est_encode_idx_rec(CBDATUM * datum,const char * vbuf,int vsiz,int lid,int smode)6912 static void est_encode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int lid, int smode){
6913 const char *rp, *ep, *sp;
6914 char nbuf[ESTNUMBUFSIZ];
6915 int cid, vstep;
6916 assert(datum && vbuf && vsiz >= 0);
6917 rp = vbuf;
6918 ep = vbuf + vsiz;
6919 while(rp < ep){
6920 EST_READ_VNUMBUF(rp, cid, vstep);
6921 rp += vstep;
6922 sp = rp;
6923 switch(smode){
6924 case ESTDFSCVOID:
6925 break;
6926 default:
6927 rp++;
6928 break;
6929 case ESTDFSCINT:
6930 case ESTDFSCASIS:
6931 rp += sizeof(int);
6932 break;
6933 }
6934 while(*rp != 0x00){
6935 rp += 2;
6936 }
6937 rp++;
6938 EST_SET_VNUMBUF(vstep, nbuf, cid - lid - 1);
6939 CB_DATUMCAT(datum, nbuf, vstep);
6940 CB_DATUMCAT(datum, sp, rp - sp);
6941 lid = cid;
6942 }
6943 }
6944
6945
6946 /* Decode a gap index record into a raw form.
6947 `datum' specifies a datum to store the result.
6948 `vbuf' specifies the pointer to the value of a gap index record.
6949 `vsiz' specifies the size of the value of the record.
6950 `smode' specifies a mode of score type. */
est_decode_idx_rec(CBDATUM * datum,const char * vbuf,int vsiz,int smode)6951 static void est_decode_idx_rec(CBDATUM *datum, const char *vbuf, int vsiz, int smode){
6952 const char *rp, *ep, *sp;
6953 char nbuf[ESTNUMBUFSIZ];
6954 int cid, vnum, vstep;
6955 assert(datum && vbuf && vsiz >= 0);
6956 rp = vbuf;
6957 ep = vbuf + vsiz;
6958 cid = 0;
6959 while(rp < ep){
6960 EST_READ_VNUMBUF(rp, vnum, vstep);
6961 cid += vnum + 1;
6962 rp += vstep;
6963 sp = rp;
6964 switch(smode){
6965 case ESTDFSCVOID:
6966 break;
6967 default:
6968 rp++;
6969 break;
6970 case ESTDFSCINT:
6971 case ESTDFSCASIS:
6972 rp += sizeof(int);
6973 break;
6974 }
6975 while(*rp != 0x00){
6976 rp += 2;
6977 }
6978 rp++;
6979 EST_SET_VNUMBUF(vstep, nbuf, cid);
6980 CB_DATUMCAT(datum, nbuf, vstep);
6981 CB_DATUMCAT(datum, sp, rp - sp);
6982 }
6983 }
6984
6985
6986 /* Open the inverted index.
6987 `name' specifies the name of a directory.
6988 `omode' specifies an open mode of Villa.
6989 `dnum' specifies the number of database files.
6990 The return value is a database object of the database. */
est_idx_open(const char * name,int omode,int dnum)6991 static ESTIDX *est_idx_open(const char *name, int omode, int dnum){
6992 ESTIDX *idx;
6993 CBLIST *files;
6994 const char *file;
6995 char path[ESTPATHBUFSIZ];
6996 int i, crdnum;
6997 assert(name && dnum > 0);
6998 if(dnum > ESTIDXDMAX) dnum = ESTIDXDMAX;
6999 CB_MALLOC(idx, sizeof(ESTIDX));
7000 if((omode & VL_OCREAT) && !est_mkdir(name) && errno != EEXIST) return NULL;
7001 if((omode & VL_OTRUNC) && (files = cbdirlist(name)) != NULL){
7002 for(i = 0; i < CB_LISTNUM(files); i++){
7003 file = CB_LISTVAL(files, i);
7004 if(!strcmp(file, ESTCDIRSTR) || !strcmp(file, ESTPDIRSTR)) continue;
7005 sprintf(path, "%s%c%s", name, ESTPATHCHR, file);
7006 if(unlink(path) == -1) est_rmdir_rec(path);
7007 }
7008 CB_LISTCLOSE(files);
7009 }
7010 for(i = 0; i < dnum; i++){
7011 sprintf(path, "%s%c%04d", name, ESTPATHCHR, i + 1);
7012 crdnum = vlcrdnum;
7013 vlcrdnum = ESTVLCRDNUM;
7014 if(!(idx->dbs[i] = vlopen(path, omode, VL_CMPLEX))){
7015 while(--i >= 0){
7016 vlclose(idx->dbs[i]);
7017 }
7018 vlcrdnum = crdnum;
7019 return NULL;
7020 }
7021 vlcrdnum = crdnum;
7022 }
7023 idx->name = cbmemdup(name, -1);
7024 idx->omode = omode;
7025 idx->dnum = dnum;
7026 idx->cdb = idx->dbs[dnum-1];
7027 return idx;
7028 }
7029
7030
7031 /* Close the inverted index.
7032 `idx' specifies an object of the inverted index.
7033 The return value is true if success, else it is false. */
est_idx_close(ESTIDX * idx)7034 static int est_idx_close(ESTIDX *idx){
7035 int i, err;
7036 assert(idx);
7037 err = FALSE;
7038 for(i = 0; i < idx->dnum; i++){
7039 if(!vlclose(idx->dbs[i])) err = TRUE;
7040 }
7041 free(idx->name);
7042 free(idx);
7043 return err ? FALSE : TRUE;
7044 }
7045
7046
7047 /* Set the tuning parameters of the inverted index.
7048 `idx' specifies an object of the inverted index.
7049 `lrecmax' specifies the max number of records in a leaf node of B+ tree.
7050 `nidxmax' specifies the max number of indexes in a non-leaf node of B+ tree.
7051 `lcnum' specifies the max number of caching leaf nodes.
7052 `ncnum' specifies the max number of caching non-leaf nodes.
7053 `fbpsiz' specifies the size of the free block pool.
7054 Other parameters are same with `vlsettuning' of Villa. */
est_idx_set_tuning(ESTIDX * idx,int lrecmax,int nidxmax,int lcnum,int ncnum,int fbpsiz)7055 static void est_idx_set_tuning(ESTIDX *idx, int lrecmax, int nidxmax, int lcnum, int ncnum,
7056 int fbpsiz){
7057 int i;
7058 assert(idx);
7059 for(i = 0; i < idx->dnum; i++){
7060 vlsettuning(idx->dbs[i], lrecmax, nidxmax, lcnum, ncnum);
7061 if(fbpsiz > 0) vlsetfbpsiz(idx->dbs[i], fbpsiz);
7062 }
7063 }
7064
7065
7066 /* Increment the inverted index.
7067 `idx' specifies an object of the inverted index. */
est_idx_increment(ESTIDX * idx)7068 static void est_idx_increment(ESTIDX *idx){
7069 char path[ESTPATHBUFSIZ];
7070 int i, min, size, crdnum;
7071 assert(idx);
7072 min = INT_MAX;
7073 for(i = 0; i < idx->dnum; i++){
7074 size = vlfsiz(idx->cdb);
7075 if(size < min) min = size;
7076 }
7077 if(idx->dnum >= ESTIDXDMAX || (idx->dnum >= ESTIDXDSTD && min < ESTIDXDBMAX)){
7078 est_idx_set_current(idx);
7079 return;
7080 }
7081 sprintf(path, "%s%c%04d", idx->name, ESTPATHCHR, idx->dnum + 1);
7082 crdnum = vlcrdnum;
7083 vlcrdnum = ESTVLCRDNUM;
7084 if((idx->dbs[idx->dnum] = vlopen(path, idx->omode | VL_OCREAT | VL_OTRUNC, VL_CMPLEX)) != NULL){
7085 idx->cdb = idx->dbs[idx->dnum];
7086 idx->dnum++;
7087 }
7088 vlcrdnum = crdnum;
7089 }
7090
7091
7092 /* Get the number of files of the inverted index.
7093 The return the number of files of the inverted index. */
est_idx_dnum(ESTIDX * idx)7094 static int est_idx_dnum(ESTIDX *idx){
7095 assert(idx);
7096 return idx->dnum;
7097 }
7098
7099
7100 /* Add a record to the inverted index.
7101 `idx' specifies an object of the inverted index.
7102 `word' specifies a word.
7103 `vbuf' specifies the pointer to the value of a record.
7104 `vsiz' specifies the size of the value.
7105 `smode' specifies a mode of score type.
7106 The return value is true if success, else it is false. */
est_idx_add(ESTIDX * idx,const char * word,int wsiz,const char * vbuf,int vsiz,int smode)7107 static int est_idx_add(ESTIDX *idx, const char *word, int wsiz,
7108 const char *vbuf, int vsiz, int smode){
7109 CBDATUM *datum;
7110 const char *obuf;
7111 int rv, lid, osiz;
7112 assert(idx && word && wsiz >= 0 && vbuf && vsiz >= 0);
7113 CB_DATUMOPEN(datum);
7114 lid = 0;
7115 if((obuf = vlgetcache(idx->cdb, word, wsiz, &osiz)) != NULL)
7116 lid = est_idx_rec_last_id(obuf, osiz, smode);
7117 est_encode_idx_rec(datum, vbuf, vsiz, lid, smode);
7118 rv = vlput(idx->cdb, word, wsiz, CB_DATUMPTR(datum), CB_DATUMSIZE(datum), VL_DCAT);
7119 CB_DATUMCLOSE(datum);
7120 return rv;
7121 }
7122
7123
7124 /* Store a record to a file of the inverted index.
7125 `idx' specifies an object of the inverted index.
7126 `inum' specifies the index of a file of the inverted index.
7127 `word' specifies a word.
7128 `vbuf' specifies the pointer to the value of a record.
7129 `vsiz' specifies the size of the value.
7130 The return value is true if success, else it is false. */
est_idx_put_one(ESTIDX * idx,int inum,const char * word,int wsiz,const char * vbuf,int vsiz)7131 static int est_idx_put_one(ESTIDX *idx, int inum, const char *word, int wsiz,
7132 const char *vbuf, int vsiz){
7133 assert(idx && inum >= 0 && word && wsiz >= 0 && vbuf && vsiz >= 0);
7134 return vsiz > 0 ? vlput(idx->dbs[inum], word, wsiz, vbuf, vsiz, VL_DOVER) :
7135 (vlout(idx->dbs[inum], word, wsiz) || dpecode == DP_ENOITEM);
7136 }
7137
7138
7139 /* Remove a record from the inverted index.
7140 `idx' specifies an object of the inverted index.
7141 `word' specifies a word.
7142 `wsiz' specifies the size of the word.
7143 The return value is true if success, else it is false. Even if no item correspongs, it is
7144 success. */
est_idx_out(ESTIDX * idx,const char * word,int wsiz)7145 static int est_idx_out(ESTIDX *idx, const char *word, int wsiz){
7146 int i, err;
7147 assert(idx && word && wsiz >= 0);
7148 err = FALSE;
7149 for(i = 0; i < idx->dnum; i++){
7150 if(!vlout(idx->dbs[i], word, wsiz) && dpecode != DP_ENOITEM) err = TRUE;
7151 }
7152 return err ? FALSE : TRUE;
7153 }
7154
7155
7156 /* Get a record from the inverted index.
7157 `idx' specifies an object of the inverted index.
7158 `word' specifies a word.
7159 `wsiz' specifies the size of the word.
7160 `sp' specifies the pointer to a variable to which the size of the region of the return value
7161 is assigned.
7162 `smode' specifies a mode of score type.
7163 The return value is the pointer to the region of the value of the corresponding record.
7164 if no item correspongs, empty region is returned. */
est_idx_scan(ESTIDX * idx,const char * word,int wsiz,int * sp,int smode)7165 static char *est_idx_scan(ESTIDX *idx, const char *word, int wsiz, int *sp, int smode){
7166 CBDATUM *datum;
7167 const char *vbuf;
7168 int i, vsiz;
7169 assert(idx && word && wsiz >= 0 && sp);
7170 CB_DATUMOPEN(datum);
7171 for(i = 0; i < idx->dnum; i++){
7172 if(!(vbuf = vlgetcache(idx->dbs[i], word, wsiz, &vsiz))) continue;
7173 est_decode_idx_rec(datum, vbuf, vsiz, smode);
7174 }
7175 return cbdatumtomalloc(datum, sp);
7176 }
7177
7178
7179 /* Get a record from a file of the inverted index.
7180 `idx' specifies an object of the inverted index.
7181 `inum' specifies the index of a file of the inverted index.
7182 `word' specifies a word.
7183 `wsiz' specifies the size of the word.
7184 `sp' specifies the pointer to a variable to which the size of the region of the return value
7185 is assigned.
7186 The return value is the pointer to the region of the value of the corresponding record.
7187 if no item correspongs, `NULL' is returned. Because the region of the return value is
7188 volatile, it sould be copied immediately. */
est_idx_get_one(ESTIDX * idx,int inum,const char * word,int wsiz,int * sp)7189 static const char *est_idx_get_one(ESTIDX *idx, int inum, const char *word, int wsiz, int *sp){
7190 assert(idx && inum >= 0 && word && wsiz >= 0 && sp);
7191 return vlgetcache(idx->dbs[inum], word, wsiz, sp);
7192 }
7193
7194
7195 /* Get the size of the value of a record in the inverted index.
7196 `idx' specifies an object of the inverted index.
7197 `word' specifies a word.
7198 `wsiz' specifies the size of the word.
7199 The return value is the size of the value of the corresponding record.
7200 if no item correspongs, 0 is returned. */
est_idx_vsiz(ESTIDX * idx,const char * word,int wsiz)7201 static int est_idx_vsiz(ESTIDX *idx, const char *word, int wsiz){
7202 int i, sum, vsiz;
7203 assert(idx && word && wsiz >= 0);
7204 sum = 0;
7205 for(i = 0; i < idx->dnum; i++){
7206 if((vsiz = vlvsiz(idx->dbs[i], word, wsiz)) < 1) continue;
7207 sum += vsiz;
7208 }
7209 return sum;
7210 }
7211
7212
7213 /* Get the number of division of the inverted index.
7214 `idx' specifies an object of the inverted index.
7215 The return value is the number of division of the inverted index. */
est_idx_num(ESTIDX * idx)7216 static int est_idx_num(ESTIDX *idx){
7217 assert(idx);
7218 return idx->dnum;
7219 }
7220
7221
7222 /* Get the size of the inverted index.
7223 `idx' specifies an object of the inverted index.
7224 The return value is the size of the inverted index. */
est_idx_size(ESTIDX * idx)7225 static double est_idx_size(ESTIDX *idx){
7226 int i;
7227 double size;
7228 assert(idx);
7229 size = 0;
7230 for(i = 0; i < idx->dnum; i++){
7231 size += vlfsiz(idx->dbs[i]);
7232 }
7233 return size;
7234 }
7235
7236
7237 /* Get the size of the current file of the inverted index.
7238 `idx' specifies an object of the inverted index.
7239 The return value is the size of the current file of the inverted index. */
est_idx_size_current(ESTIDX * idx)7240 static int est_idx_size_current(ESTIDX *idx){
7241 assert(idx);
7242 return vlfsiz(idx->cdb);
7243 }
7244
7245
7246 /* Synchronize updating contents of the inverted index on memory.
7247 `idx' specifies an object of the inverted index.
7248 The return value is true if success, else it is false. */
est_idx_memflush(ESTIDX * idx)7249 static int est_idx_memflush(ESTIDX *idx){
7250 int i;
7251 assert(idx);
7252 for(i = 0; i < idx->dnum; i++){
7253 if(!vlmemflush(idx->dbs[i])) return FALSE;
7254 }
7255 return TRUE;
7256 }
7257
7258
7259 /* Syncronize the inverted index.
7260 `idx' specifies an object of the inverted index.
7261 The return value is true if success, else it is false. */
est_idx_sync(ESTIDX * idx)7262 static int est_idx_sync(ESTIDX *idx){
7263 int i;
7264 assert(idx);
7265 for(i = 0; i < idx->dnum; i++){
7266 if(!vlsync(idx->dbs[i])) return FALSE;
7267 }
7268 return TRUE;
7269 }
7270
7271
7272 /* Optimize the inverted index.
7273 `idx' specifies an object of the inverted index.
7274 The return value is true if success, else it is false. */
est_idx_optimize(ESTIDX * idx)7275 static int est_idx_optimize(ESTIDX *idx){
7276 int i;
7277 assert(idx);
7278 for(i = 0; i < idx->dnum; i++){
7279 if(!vloptimize(idx->dbs[i])) return FALSE;
7280 }
7281 return TRUE;
7282 }
7283
7284
7285 /* Set the current database to the smallest one in the inverted index.
7286 `idx' specifies an object of the inverted index. */
est_idx_set_current(ESTIDX * idx)7287 static void est_idx_set_current(ESTIDX *idx){
7288 int i, size, min;
7289 assert(idx);
7290 min = vlfsiz(idx->cdb);
7291 for(i = 0; i < idx->dnum; i++){
7292 if((size = vlfsiz(idx->dbs[i])) < min){
7293 idx->cdb = idx->dbs[i];
7294 min = size;
7295 }
7296 }
7297 }
7298
7299
7300 /* Store a record related to the ID number of a document.
7301 `curia' specifies a database object.
7302 `zmode' specifies a compression mode.
7303 `id' specifies the ID number of a document.
7304 `vbuf' specifies the pointer to the value of a record.
7305 `vsiz' specifies the size of the value.
7306 The return value is true if success, else it is false. */
est_crput(CURIA * curia,int zmode,int id,const char * vbuf,int vsiz,int dmode)7307 static int est_crput(CURIA *curia, int zmode, int id, const char *vbuf, int vsiz, int dmode){
7308 char *zbuf;
7309 int zsiz;
7310 assert(curia && id > 0 && vbuf && vsiz >= 0);
7311 switch(zmode){
7312 case ESTDFZLIB:
7313 if(!(zbuf = est_deflate(vbuf, vsiz, &zsiz, -1))){
7314 dpecode = ESTEMISC;
7315 return FALSE;
7316 }
7317 if(!crput(curia, (char *)&id, sizeof(int), zbuf, zsiz, dmode)){
7318 free(zbuf);
7319 return FALSE;
7320 }
7321 free(zbuf);
7322 break;
7323 case ESTDFLZO:
7324 if(!(zbuf = est_lzoencode(vbuf, vsiz, &zsiz))){
7325 dpecode = ESTEMISC;
7326 return FALSE;
7327 }
7328 if(!crput(curia, (char *)&id, sizeof(int), zbuf, zsiz, dmode)){
7329 free(zbuf);
7330 return FALSE;
7331 }
7332 free(zbuf);
7333 break;
7334 case ESTDFBZIP:
7335 if(!(zbuf = est_bzencode(vbuf, vsiz, &zsiz))){
7336 dpecode = ESTEMISC;
7337 return FALSE;
7338 }
7339 if(!crput(curia, (char *)&id, sizeof(int), zbuf, zsiz, dmode)){
7340 free(zbuf);
7341 return FALSE;
7342 }
7343 free(zbuf);
7344 break;
7345 default:
7346 if(!crput(curia, (char *)&id, sizeof(int), vbuf, vsiz, dmode)) return FALSE;
7347 break;
7348 }
7349 return TRUE;
7350 }
7351
7352
7353 /* Remove a record related to the ID number of a document.
7354 `curia' specifies a database object.
7355 `id' specifies the ID number of a document.
7356 The return value is true if success, else it is false. */
est_crout(CURIA * curia,int id)7357 static int est_crout(CURIA *curia, int id){
7358 assert(curia && id > 0);
7359 return crout(curia, (char *)&id, sizeof(int));
7360 }
7361
7362
7363 /* Get a record related to the ID number of a document.
7364 `curia' specifies a database object.
7365 `zmode' specifies a compression mode.
7366 `id' specifies the ID number of a document.
7367 `sp' specifies the pointer to a variable to which the size of the region of the return value
7368 is assigned.
7369 The return value is the pointer to the region of the value of the corresponding record. */
est_crget(CURIA * curia,int zmode,int id,int * sp)7370 static char *est_crget(CURIA *curia, int zmode, int id, int *sp){
7371 char *zbuf, *vbuf;
7372 int zsiz;
7373 assert(curia && id > 0 && sp);
7374 switch(zmode){
7375 case ESTDFZLIB:
7376 if(!(zbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, &zsiz))) return NULL;
7377 if(!(vbuf = est_inflate(zbuf, zsiz, sp, -1))){
7378 free(zbuf);
7379 return NULL;
7380 }
7381 free(zbuf);
7382 break;
7383 case ESTDFLZO:
7384 if(!(zbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, &zsiz))) return NULL;
7385 if(!(vbuf = est_lzodecode(zbuf, zsiz, sp))){
7386 free(zbuf);
7387 return NULL;
7388 }
7389 free(zbuf);
7390 break;
7391 case ESTDFBZIP:
7392 if(!(zbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, &zsiz))) return NULL;
7393 if(!(vbuf = est_bzdecode(zbuf, zsiz, sp))){
7394 free(zbuf);
7395 return NULL;
7396 }
7397 free(zbuf);
7398 break;
7399 default:
7400 if(!(vbuf = crget(curia, (char *)&id, sizeof(int), 0, -1, sp))) return NULL;
7401 break;
7402 }
7403 return vbuf;
7404 }
7405
7406
7407 /* Add an attribute of a document to a sequencial attribute index.
7408 `db' specifies a handle of a sequencial attribute index.
7409 `id' specifies the ID number of a document.
7410 `vbuf' specifies the pointer to the attribute value.
7411 `vsiz' specifies the size of the attribute value.
7412 The return value is true if success, else it is false. */
est_aidx_seq_put(DEPOT * db,int id,const char * vbuf,int vsiz)7413 static int est_aidx_seq_put(DEPOT *db, int id, const char *vbuf, int vsiz){
7414 int err;
7415 assert(db && id >= 0 && vbuf && vsiz >= 0);
7416 err = FALSE;
7417 if(!dpput(db, (char *)&id, sizeof(int), vbuf, vsiz, DP_DKEEP)) err = TRUE;
7418 return err ? FALSE : TRUE;
7419 }
7420
7421
7422 /* Remove an attribute of a document from a sequencial attribute index.
7423 `db' specifies a handle of a sequencial attribute index.
7424 `id' specifies the ID number of a document.
7425 The return value is true if success, else it is false. */
est_aidx_seq_out(DEPOT * db,int id)7426 static int est_aidx_seq_out(DEPOT *db, int id){
7427 int err;
7428 assert(db && id >= 0);
7429 err = FALSE;
7430 if(!dpout(db, (char *)&id, sizeof(int))) err = TRUE;
7431 return err ? FALSE : TRUE;
7432 }
7433
7434
7435 /* Retrieve the value of an attribute of a document in a sequencial attribute index.
7436 `db' specifies a handle of a sequencial attribute index.
7437 `id' specifies the ID number of a document.
7438 The return value is the value of the attribute or `NULL' if no attribute. */
est_aidx_seq_get(DEPOT * db,int id,int * sp)7439 static char *est_aidx_seq_get(DEPOT *db, int id, int *sp){
7440 assert(db && id >= 0 && sp);
7441 return dpget(db, (char *)&id, sizeof(int), 0, -1, sp);
7442 }
7443
7444
7445 /* Narrow scores of search candidates with a sequencial attribute index.
7446 `db' specifies a handle of a sequencial attribute index.
7447 `pdocs' specifies a list of pseudo documents.
7448 `cop' specifies the pointer to the operator.
7449 `sign' specifies the sign of operation.
7450 `oval' specifies the operation value.
7451 `osiz' specifies the size of the operation value
7452 `sval' specifies the operation value of small cases.
7453 `ssiz' specifies the size of the operation value of small cases.
7454 `regex' specifies the regular expressions.
7455 `onum' specifies the numeric value.
7456 `scores' specifies an array of scores of search candidates.
7457 `snum' specifies the number of the array.
7458 `limit' specifies the limit number to check.
7459 `restp' specifies the pointer to a variable to which rest number to be checked is assigned.
7460 The return value is the new number of the array. */
est_aidx_seq_narrow(DEPOT * db,const CBLIST * pdocs,const char * cop,int sign,const char * oval,int osiz,const char * sval,int ssiz,const void * regex,int onum,ESTSCORE * scores,int snum,int limit,int * restp)7461 static int est_aidx_seq_narrow(DEPOT *db, const CBLIST *pdocs, const char *cop, int sign,
7462 const char *oval, int osiz, const char *sval, int ssiz,
7463 const void *regex, int onum, ESTSCORE *scores, int snum,
7464 int limit, int *restp){
7465 char vbuf[ESTAIKBUFSIZ];
7466 int i, nnum, vsiz;
7467 assert(db && cop && oval && osiz >= 0 && scores && snum >= 0 && limit >= 0 && restp);
7468 nnum = 0;
7469 for(i = 0; i < snum; i++){
7470 if(nnum >= limit){
7471 *restp = snum - i;
7472 break;
7473 }
7474 if(scores[i].id >= ESTPDOCIDMIN){
7475 scores[nnum].id = scores[i].id;
7476 scores[nnum].score = scores[i].score;
7477 nnum++;
7478 continue;
7479 }
7480 if((vsiz = dpgetwb(db, (char *)&(scores[i].id), sizeof(int), 0, ESTAIKBUFSIZ - 1, vbuf)) < 0)
7481 continue;
7482 vbuf[vsiz] = '\0';
7483 if(est_match_attr(vbuf, vsiz, cop, sign, oval, osiz, sval, ssiz, regex, onum)){
7484 scores[nnum].id = scores[i].id;
7485 scores[nnum].score = scores[i].score;
7486 nnum++;
7487 }
7488 }
7489 return nnum;
7490 }
7491
7492
/* Compare two records in numeric order.
   `aptr' specifies the pointer to the region of one key.
   `asiz' specifies the size of the region of one key.
   `bptr' specifies the pointer to the region of the other key.
   `bsiz' specifies the size of the region of the other key.
   The keys are compared by the values `cbstrmktime' gives for them.  If the values are equal,
   the result of `VL_CMPLEX' on the whole regions is returned.
   The return value is positive if the former is big, negative if the latter is big, 0 if both
   are equivalent. */
static int est_aidx_numcmp(const char *aptr, int asiz, const char *bptr, int bsiz){
  time_t anum, bnum;
  anum = cbstrmktime(aptr);
  bnum = cbstrmktime(bptr);
  /* compare explicitly: a difference squeezed into `int' could overflow or lose its sign */
  if(anum > bnum) return 1;
  if(anum < bnum) return -1;
  return VL_CMPLEX(aptr, asiz, bptr, bsiz);
}
7505
7506
7507 /* Add an attribute of a document to an attribute narrowing index.
7508 `db' specifies a handle of an attribute narrowing index.
7509 `id' specifies the ID number of a document.
7510 `vbuf' specifies the pointer to the attribute value.
7511 `vsiz' specifies the size of the attribute value.
7512 The return value is true if success, else it is false. */
est_aidx_attr_put(VILLA * db,int id,const char * vbuf,int vsiz)7513 static int est_aidx_attr_put(VILLA *db, int id, const char *vbuf, int vsiz){
7514 char *tbuf;
7515 int err, tsiz;
7516 assert(db && id >= 0 && vbuf && vsiz >= 0);
7517 err = FALSE;
7518 tsiz = vsiz + sizeof(int) + 1;
7519 CB_MALLOC(tbuf, tsiz);
7520 memcpy(tbuf, vbuf, vsiz + 1);
7521 memcpy(tbuf + vsiz + 1, &id, sizeof(int));
7522 if(!vlput(db, tbuf, tsiz, "", 0, VL_DKEEP)) err = TRUE;
7523 free(tbuf);
7524 return err ? FALSE : TRUE;
7525 }
7526
7527
7528 /* Remove an attribute of a document from an attribute narrowing index.
7529 `db' specifies a handle of an attribute narrowing index.
7530 `id' specifies the ID number of a document.
7531 `vbuf' specifies the pointer to the attribute value.
7532 `vsiz' specifies the size of the attribute value.
7533 The return value is true if success, else it is false. */
est_aidx_attr_out(VILLA * db,int id,const char * vbuf,int vsiz)7534 static int est_aidx_attr_out(VILLA *db, int id, const char *vbuf, int vsiz){
7535 char *tbuf;
7536 int err, tsiz;
7537 assert(db && id >= 0 && vbuf && vsiz >= 0);
7538 err = FALSE;
7539 tsiz = vsiz + sizeof(int) + 1;
7540 CB_MALLOC(tbuf, tsiz);
7541 memcpy(tbuf, vbuf, vsiz + 1);
7542 memcpy(tbuf + vsiz + 1, &id, sizeof(int));
7543 if(!vlout(db, tbuf, tsiz)) err = TRUE;
7544 free(tbuf);
7545 return err ? FALSE : TRUE;
7546 }
7547
7548
7549 /* Narrow scores of search candidates with an attribute narrowing index.
7550 `db' specifies a handle of an attribute narrowing index.
7551 `pdocs' specifies a list of pseudo documents.
7552 `cop' specifies the pointer to the operator.
7553 `sign' specifies the sign of operation.
7554 `oval' specifies the operation value.
7555 `osiz' specifies the size of the operation value
7556 `sval' specifies the operation value of small cases.
7557 `ssiz' specifies the size of the operation value of small cases.
7558 `regex' specifies the regular expressions.
7559 `onum' specifies the numeric value.
7560 `scores' specifies an array of scores of search candidates.
7561 `snum' specifies the number of the array.
7562 The return value is the new number of the array. */
static int est_aidx_attr_narrow(VILLA *db, const CBLIST *pdocs, const char *cop, int sign,
                                const char *oval, int osiz, const char *sval, int ssiz,
                                const void *regex, int onum, ESTSCORE *scores, int snum){
  CBDATUM *idbuf;
  CBLIST *tokens;
  const char *key, *token;
  char numbuf[ESTNUMBUFSIZ], *range, *sep;
  int i, j, ksiz, tsiz, len, esc, jmp, seek, back, stop, id, nnum, *ids, idnum;
  time_t lower, upper;
  assert(db && pdocs && cop && oval && osiz >= 0 && scores && snum >= 0);
  /* the trailing sizeof(int) bytes of every matching key are collected as a document ID */
  CB_DATUMOPEN(idbuf);
  if(cop == ESTOPSTROREQ && sign && !sval){
    /* OR-equality: look each token up directly */
    tokens = cbsplit(oval, osiz, " ,");
    cblistsort(tokens);
    for(i = 0; i < CB_LISTNUM(tokens); i++){
      token = CB_LISTVAL2(tokens, i, tsiz);
      if(tsiz < 1) continue;
      vlcurjump(db, token, tsiz, VL_JFORWARD);
      for(;;){
        key = vlcurkeycache(db, &ksiz);
        if(!key || strcmp(key, token) != 0) break;
        CB_DATUMCAT(idbuf, key + ksiz - sizeof(int), sizeof(int));
        vlcurnext(db);
      }
    }
    CB_LISTCLOSE(tokens);
  } else if(cop == ESTOPNUMBT && sign && !sval){
    /* numeric range: "lower upper" separated by a space or a tab; no upper means INT_MAX */
    CB_MEMDUP(range, oval, osiz);
    sep = strchr(range, ' ');
    if(!sep) sep = strchr(range, '\t');
    if(sep){
      *sep = '\0';
      sep++;
      while(*sep == ' ' || *sep == '\t'){
        sep++;
      }
      lower = cbstrmktime(range);
      upper = cbstrmktime(sep);
    } else {
      lower = cbstrmktime(range);
      upper = INT_MAX;
    }
    len = sprintf(numbuf, "%.0f", (double)lower);
    vlcurjump(db, numbuf, len, VL_JFORWARD);
    for(;;){
      key = vlcurkeycache(db, &ksiz);
      if(!key || cbstrmktime(key) > upper) break;
      CB_DATUMCAT(idbuf, key + ksiz - sizeof(int), sizeof(int));
      vlcurnext(db);
    }
    free(range);
  } else if(!sign || sval){
    /* negated or case-insensitive condition: scan, narrowing by the first byte if possible */
    esc = INT_MAX;
    jmp = INT_MAX;
    seek = FALSE;
    if(sign && (cop == ESTOPSTREQ || cop == ESTOPSTRBW) && osiz > 0){
      if((*sval > 0x0 && *sval < 0x7f) || *(unsigned char *)sval >= 0xc0){
        numbuf[0] = *sval;
        numbuf[1] = '\0';
        esc = *(unsigned char *)sval;
        if(*sval >= 'a' && *sval <= 'z'){
          /* start at the upper case letter and remember to skip ahead to the lower case one */
          numbuf[0] -= 'a' - 'A';
          jmp = *sval - 'a' + 'A';
        }
        seek = TRUE;
      }
    }
    if(seek){
      vlcurjump(db, numbuf, 1, VL_JFORWARD);
    } else {
      vlcurfirst(db);
    }
    while((key = vlcurkeycache(db, &ksiz)) != NULL){
      if(est_match_attr(key, ksiz - sizeof(int) - 1,
                        cop, sign, oval, osiz, sval, ssiz, regex, onum)){
        CB_DATUMCAT(idbuf, key + ksiz - sizeof(int), sizeof(int));
      }
      if(*(unsigned char *)key > jmp && *(unsigned char *)key < *(unsigned char *)sval){
        numbuf[0] = *sval;
        numbuf[1] = '\0';
        vlcurjump(db, numbuf, 1, VL_JFORWARD);
        jmp = INT_MAX;
      } else if(*(unsigned char *)key > esc){
        break;
      } else {
        vlcurnext(db);
      }
    }
  } else {
    /* plain condition: position the cursor by the operator, then walk in its direction */
    back = (cop == ESTOPNUMLT || cop == ESTOPNUMLE);
    stop = (cop == ESTOPSTREQ || cop == ESTOPSTRBW || cop == ESTOPNUMEQ);
    if(stop || cop == ESTOPNUMGT || cop == ESTOPNUMGE){
      vlcurjump(db, oval, osiz, VL_JFORWARD);
      if(cop == ESTOPNUMGT){
        while((key = vlcurkeycache(db, NULL)) != NULL && cbstrmktime(key) <= onum){
          vlcurnext(db);
        }
      }
    } else if(back){
      len = sprintf(numbuf, "%.0f", (double)cbstrmktime(oval) + 1);
      vlcurjump(db, numbuf, len, VL_JBACKWARD);
      if(cop == ESTOPNUMLT){
        while((key = vlcurkeycache(db, NULL)) != NULL && cbstrmktime(key) >= onum){
          vlcurprev(db);
        }
      }
    } else {
      vlcurfirst(db);
    }
    while((key = vlcurkeycache(db, &ksiz)) != NULL){
      if(est_match_attr(key, ksiz - sizeof(int) - 1,
                        cop, TRUE, oval, osiz, sval, ssiz, regex, onum)){
        CB_DATUMCAT(idbuf, key + ksiz - sizeof(int), sizeof(int));
      } else if(stop){
        break;
      }
      if(back){
        vlcurprev(db);
      } else {
        vlcurnext(db);
      }
    }
  }
  /* one ID per entry of `pdocs', counted up from ESTPDOCIDMIN, is always accepted */
  for(i = 0; i < CB_LISTNUM(pdocs); i++){
    id = ESTPDOCIDMIN + i;
    CB_DATUMCAT(idbuf, &id, sizeof(int));
  }
  ids = (int *)CB_DATUMPTR(idbuf);
  idnum = CB_DATUMSIZE(idbuf) / sizeof(int);
  /* sort both sides by ID and keep, in place, the scores whose ID was collected */
  qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
  qsort(ids, idnum, sizeof(int), est_int_compare);
  nnum = 0;
  j = 0;
  for(i = 0; i < snum; i++){
    while(j < idnum && ids[j] < scores[i].id){
      j++;
    }
    if(j < idnum && scores[i].id == ids[j]){
      scores[nnum].id = scores[i].id;
      scores[nnum].score = scores[i].score;
      nnum++;
    }
  }
  CB_DATUMCLOSE(idbuf);
  return nnum;
}
7702
7703
/* Compare two integers.
   `ap' specifies the pointer to one element.
   `bp' specifies the pointer to the other element.
   The return value is -1 if the former is smaller, 1 if it is bigger, 0 if both are equal. */
static int est_int_compare(const void *ap, const void *bp){
  int a, b;
  assert(ap && bp);
  a = *(const int *)ap;
  b = *(const int *)bp;
  /* compare instead of subtracting: `a - b' overflows for operands of opposite sign */
  return (a > b) - (a < b);
}
7712
7713
/* Compare elements of a record for effective compression.
   The first two bytes of each element are read as a big-endian unsigned number.
   `ap' specifies the pointer to one element.
   `bp' specifies the pointer to the other element.
   The return value is the difference of the two numbers: negative if the former is smaller,
   positive if it is bigger, 0 if both are equal. */
static int est_short_compare(const void *ap, const void *bp){
  const unsigned char *lhs, *rhs;
  int lnum, rnum;
  assert(ap && bp);
  lhs = (const unsigned char *)ap;
  rhs = (const unsigned char *)bp;
  lnum = (lhs[0] << 8) + lhs[1];
  rnum = (rhs[0] << 8) + rhs[1];
  return lnum - rnum;
}
7723
7724
7725 /* Clean up the inode map.
7726 `arg' specifies a dummy argument. */
est_inodes_delete(void * arg)7727 static void est_inodes_delete(void *arg){
7728 #if defined(NDEBUG)
7729 ESTDB *db;
7730 const char *kbuf;
7731 int ecode;
7732 assert(arg);
7733 if(cbmaprnum(est_inodes) > 0){
7734 cbmapiterinit(est_inodes);
7735 while((kbuf = cbmapiternext(est_inodes, NULL)) != NULL){
7736 db = *(ESTDB **)cbmapiterval(kbuf, NULL);
7737 est_db_set_informer(db, est_inodes_delete_informer, NULL);
7738 est_db_close(db, &ecode);
7739 }
7740 }
7741 cbmapclose(est_inodes);
7742 #else
7743 ESTDB *db;
7744 const char *kbuf;
7745 int ecode;
7746 assert(arg);
7747 if(cbmaprnum(est_inodes) > 0){
7748 cbmapiterinit(est_inodes);
7749 while((kbuf = cbmapiternext(est_inodes, NULL)) != NULL){
7750 db = *(ESTDB **)cbmapiterval(kbuf, NULL);
7751 fprintf(stderr, "\nWARNING: %s is not closed.\n\n", cbmemdup(est_db_name(db), -1));
7752 est_db_set_informer(db, est_inodes_delete_informer, NULL);
7753 est_db_close(db, &ecode);
7754 }
7755 }
7756 cbmapclose(est_inodes);
7757 #endif
7758 }
7759
7760
/* Inform a database event while cleaning up database handles.
   The message is written to the standard error output unless NDEBUG is defined.
   `msg' specifies the message of each event.
   `opaque' is ignored. */
static void est_inodes_delete_informer(const char *msg, void *opaque){
#ifndef NDEBUG
  fprintf(stderr, "estraier: %s\n", msg);
#endif
}
7769
7770
7771 /* Write meta data to the database.
7772 `db' specifies a database object.
7773 The return value is true if success, else it is false. */
est_db_write_meta(ESTDB * db)7774 static int est_db_write_meta(ESTDB *db){
7775 char vbuf[ESTNUMBUFSIZ], *sbuf;
7776 int err, ssiz;
7777 assert(db);
7778 err = FALSE;
7779 sprintf(vbuf, "%d", est_idx_num(db->idxdb));
7780 if(!dpput(db->metadb, ESTKEYIDXNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
7781 sprintf(vbuf, "%d", db->dseq);
7782 if(!dpput(db->metadb, ESTKEYDSEQ, -1, vbuf, -1, DP_DOVER)) err = TRUE;
7783 sprintf(vbuf, "%d", db->dnum);
7784 if(!dpput(db->metadb, ESTKEYDNUM, -1, vbuf, -1, DP_DOVER)) err = TRUE;
7785 if(db->metacc){
7786 sbuf = cbmapdump(db->metacc, &ssiz);
7787 if(!dpput(db->metadb, ESTKEYMETA, -1, sbuf, ssiz, DP_DOVER)) err = TRUE;
7788 free(sbuf);
7789 }
7790 if(err){
7791 est_set_ecode(&(db->ecode), ESTEDB, __LINE__);
7792 db->fatal = TRUE;
7793 }
7794 return err ? FALSE : TRUE;
7795 }
7796
7797
7798 /* Call the callback function of a database.
7799 `db' specifies a database object.
7800 `info' specifies an extra message. */
est_db_inform(ESTDB * db,const char * info)7801 static void est_db_inform(ESTDB *db, const char *info){
7802 char *msg;
7803 assert(db && info);
7804 if(!db->infocb) return;
7805 msg = cbsprintf("%s: name=%s dnum=%d wnum=%d fsiz=%.0f crnum=%d csiz=%d dknum=%d",
7806 info, db->name, db->dnum, vlrnum(db->fwmdb), (double)est_db_size(db),
7807 cbmaprnum(db->idxcc) + cbmaprnum(db->auxcc), est_db_used_cache_size(db),
7808 cbmaprnum(db->outcc));
7809 db->infocb(msg, db->infoop);
7810 free(msg);
7811 }
7812
7813
7814 /* Prepare cache for meta data.
7815 `db' specifies a database object. */
est_db_prepare_meta(ESTDB * db)7816 static void est_db_prepare_meta(ESTDB *db){
7817 char *sbuf;
7818 int ssiz;
7819 assert(db);
7820 if((sbuf = dpget(db->metadb, ESTKEYMETA, -1, 0, -1, &ssiz)) != NULL){
7821 db->metacc = cbmapload(sbuf, ssiz);
7822 free(sbuf);
7823 } else {
7824 db->metacc = cbmapopenex(ESTMINIBNUM);
7825 }
7826 }
7827
7828
7829 /* Score a document object matching the phrase of a search condition object definitely.
7830 `db' specifies a database object.
7831 `doc' specifies a document object.
7832 `cond' specifies a search condition object.
7833 `scp' specifies the pointer to a variable to which the score is assigned.
7834 The return value is true if the document matches the phrase of the condition object
7835 definitely, else it is false. */
est_db_score_doc(ESTDB * db,ESTDOC * doc,ESTCOND * cond,int * scp)7836 static int est_db_score_doc(ESTDB *db, ESTDOC *doc, ESTCOND *cond, int *scp){
7837 struct { char *word; int num; } wsets[ESTSCANWNUM], nsets[ESTSCANWNUM];
7838 CBLIST *terms, *words;
7839 const char *term, *text, *rp;
7840 unsigned char *rbuf;
7841 char *tmp;
7842 int i, j, k, sc, wsnum, nsnum, asiz, tsiz, add, rsiz, hit;
7843 double tune;
7844 assert(db && doc && cond && scp);
7845 *scp = 0;
7846 if(!cond->phrase || cbstrfwmatch(cond->phrase, ESTOPSIMILAR) ||
7847 cbstrfwmatch(cond->phrase, ESTOPID) || cbstrfwmatch(cond->phrase, ESTOPURI)) return FALSE;
7848 if(!doc->dtexts) CB_LISTOPEN(doc->dtexts);
7849 switch(cond->pmode){
7850 default:
7851 terms = est_phrase_terms(cond->phrase);
7852 break;
7853 case ESTPMSIMPLE:
7854 tmp = est_phrase_from_simple(cond->phrase);
7855 terms = est_phrase_terms(tmp);
7856 free(tmp);
7857 break;
7858 case ESTPMROUGH:
7859 tmp = est_phrase_from_rough(cond->phrase);
7860 terms = est_phrase_terms(tmp);
7861 free(tmp);
7862 break;
7863 case ESTPMUNION:
7864 tmp = est_phrase_from_union(cond->phrase);
7865 terms = est_phrase_terms(tmp);
7866 free(tmp);
7867 break;
7868 case ESTPMISECT:
7869 tmp = est_phrase_from_isect(cond->phrase);
7870 terms = est_phrase_terms(tmp);
7871 free(tmp);
7872 break;
7873 }
7874 wsnum = 0;
7875 nsnum = 0;
7876 add = TRUE;
7877 for(i = 0; i < CB_LISTNUM(terms); i++){
7878 term = CB_LISTVAL(terms, i);
7879 if(!strcmp(term, ESTOPISECT)){
7880 add = TRUE;
7881 } else if(!strcmp(term, ESTOPDIFF)){
7882 add = FALSE;
7883 } else if(strcmp(term, ESTOPUVSET)){
7884 if(term[0] == ' '){
7885 term++;
7886 if(term[0] == 'b'){
7887 term++;
7888 } else if(term[0] == 'e'){
7889 term++;
7890 }
7891 }
7892 words = cbsplit(term, -1, "\t");
7893 if(add){
7894 while(wsnum < ESTSCANWNUM && CB_LISTNUM(words) > 0){
7895 wsets[wsnum].word = cblistshift(words, NULL);
7896 wsets[wsnum].num = i;
7897 wsnum++;
7898 }
7899 } else {
7900 while(nsnum < ESTSCANWNUM && CB_LISTNUM(words) > 0){
7901 nsets[nsnum].word = cblistshift(words, NULL);
7902 nsets[nsnum].num = i;
7903 nsnum++;
7904 }
7905 }
7906 CB_LISTCLOSE(words);
7907 }
7908 }
7909 asiz = 0;
7910 sc = 0;
7911 if((rp = cbmapget(doc->attrs, "\t", 1, NULL)) != NULL) sc = -1 - atoi(rp);
7912 for(i = -1; i < CB_LISTNUM(doc->dtexts); i++){
7913 if(i < 0){
7914 if(!doc->attrs || !(text = cbmapget(doc->attrs, "", 0, NULL))) continue;
7915 asiz += strlen(text);
7916 } else {
7917 text = CB_LISTVAL2(doc->dtexts, i, tsiz);
7918 asiz += tsiz;
7919 }
7920 rbuf = (unsigned char *)est_uconv_in(text, strlen(text), &rsiz);
7921 est_canonicalize_text(rbuf, rsiz, FALSE);
7922 tmp = est_uconv_out((char *)rbuf, rsiz, &rsiz);
7923 for(j = 0; j < wsnum; j++){
7924 if(!wsets[j].word) continue;
7925 if((rp = est_strstr_sparse(tmp, wsets[j].word)) != NULL){
7926 if(sc >= 0){
7927 do {
7928 sc += 16;
7929 rp += strlen(wsets[j].word);
7930 } while((rp = est_strstr_sparse(rp, wsets[j].word)) != NULL);
7931 }
7932 for(k = 0; k < wsnum; k++){
7933 if(!wsets[k].word) continue;
7934 if(wsets[k].num == wsets[j].num){
7935 free(wsets[k].word);
7936 wsets[k].word = NULL;
7937 }
7938 }
7939 }
7940 }
7941 for(j = 0; j < nsnum; j++){
7942 if(!nsets[j].word) continue;
7943 if((rp = est_strstr_sparse(tmp, nsets[j].word)) != NULL){
7944 for(k = 0; k < nsnum; k++){
7945 if(!nsets[k].word) continue;
7946 if(nsets[k].num == nsets[j].num){
7947 free(nsets[k].word);
7948 nsets[k].word = NULL;
7949 }
7950 }
7951 }
7952 }
7953 free(tmp);
7954 free(rbuf);
7955 }
7956 hit = TRUE;
7957 for(i = 0; i < wsnum; i++){
7958 if(!wsets[i].word) continue;
7959 free(wsets[i].word);
7960 hit = FALSE;
7961 }
7962 for(i = 0; i < nsnum; i++){
7963 if(!nsets[i].word){
7964 hit = FALSE;
7965 continue;
7966 }
7967 free(nsets[i].word);
7968 }
7969 CB_LISTCLOSE(terms);
7970 if(sc < 0) sc = -1 - sc;
7971 tune = sqrt(asiz / 8.0 + 128) / 16.0;
7972 switch(db->smode){
7973 case ESTDFSCVOID:
7974 sc = 0;
7975 break;
7976 default:
7977 sc /= tune;
7978 if(sc >= 0x80) sc += (0x80 - sc) * 0.75;
7979 if(sc >= 0xc0) sc += (0xc0 - sc) * 0.75;
7980 sc = sc < 0xff ? sc : 0xff;
7981 break;
7982 case ESTDFSCINT:
7983 sc /= tune;
7984 break;
7985 case ESTDFSCASIS:
7986 break;
7987 }
7988 *scp = sc;
7989 return hit;
7990 }
7991
7992
7993 /* Get the ID of a document specified by URI from pseudo indexes.
7994 `db' specifies a database object.
7995 `uri' specifies the URI of a registered document.
7996 The return value is the ID of the document. On error, -1 is returned. */
est_pidx_uri_to_id(ESTDB * db,const char * uri)7997 static int est_pidx_uri_to_id(ESTDB *db, const char *uri){
7998 ESTDOC *doc;
7999 const char *vbuf;
8000 int i, vsiz;
8001 assert(db && uri);
8002 if(!db->puris){
8003 db->puris = cbmapopenex(CB_LISTNUM(db->pdocs) + 1);
8004 for(i = 0; i < CB_LISTNUM(db->pdocs); i++){
8005 if((doc = est_db_get_doc(db, ESTPDOCIDMIN + i, 0)) != NULL){
8006 if((vbuf = cbmapget(doc->attrs, ESTDATTRURI, -1, &vsiz)) != NULL)
8007 cbmapput(db->puris, vbuf, vsiz, (char *)&(doc->id), sizeof(int), FALSE);
8008 est_doc_delete(doc);
8009 }
8010 }
8011 }
8012 if((vbuf = cbmapget(db->puris, uri, -1, NULL)) != NULL) return *(int *)vbuf;
8013 return -1;
8014 }
8015
8016
8017 /* Create a list of terms for search.
8018 `phrase' specifies a search phrase.
8019 The return value is a list object of the terms of the phrase. */
est_phrase_terms(const char * phrase)8020 static CBLIST *est_phrase_terms(const char *phrase){
8021 CBLIST *terms, *elems;
8022 CBDATUM *datum;
8023 const char *elem;
8024 char *tbuf, *pbuf;
8025 int i, tsiz, psiz, lw;
8026 assert(phrase);
8027 CB_LISTOPEN(terms);
8028 tbuf = est_uconv_in(phrase, strlen(phrase), &tsiz);
8029 est_normalize_text((unsigned char *)tbuf, tsiz, &tsiz);
8030 pbuf = est_uconv_out(tbuf, tsiz, &psiz);
8031 elems = cbsplit(pbuf, psiz, "\a\b\t\n\v\f\r ");
8032 CB_DATUMOPEN(datum);
8033 lw = FALSE;
8034 for(i = 0; i < CB_LISTNUM(elems); i++){
8035 elem = CB_LISTVAL(elems, i);
8036 if(elem[0] == '\0') continue;
8037 if(!strcmp(elem, ESTOPUNION)){
8038 if(CB_DATUMSIZE(datum) < 1) continue;
8039 if(lw) CB_DATUMCAT(datum, "\t", 1);
8040 lw = FALSE;
8041 } else if(!strcmp(elem, ESTOPWCBW)){
8042 if(!lw) CB_DATUMCAT(datum, " b", 2);
8043 } else if(!strcmp(elem, ESTOPWCEW)){
8044 if(!lw) CB_DATUMCAT(datum, " e", 2);
8045 } else if(!strcmp(elem, ESTOPWCRX)){
8046 if(!lw) CB_DATUMCAT(datum, " r", 2);
8047 } else if(!strcmp(elem, ESTOPISECT) || !strcmp(elem, ESTOPDIFF)){
8048 if(CB_DATUMSIZE(datum) < 1) continue;
8049 CB_LISTPUSH(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
8050 CB_DATUMSETSIZE(datum, 0);
8051 CB_LISTPUSH(terms, elem, strlen(elem));
8052 lw = FALSE;
8053 } else {
8054 if(CB_DATUMSIZE(datum) > 0 && lw) CB_DATUMCAT(datum, " ", 1);
8055 CB_DATUMCAT(datum, elem, strlen(elem));
8056 lw = TRUE;
8057 }
8058 }
8059 if(CB_DATUMSIZE(datum) > 0) CB_LISTPUSH(terms, CB_DATUMPTR(datum), CB_DATUMSIZE(datum));
8060 CB_DATUMCLOSE(datum);
8061 CB_LISTCLOSE(elems);
8062 free(pbuf);
8063 free(tbuf);
8064 for(i = 0; i < CB_LISTNUM(terms); i++){
8065 elem = CB_LISTVAL(terms, i);
8066 if(!strcmp(elem, ESTOPUVSET) || !strcmp(elem, ESTOPISECT) ||
8067 !strcmp(elem, ESTOPDIFF)) continue;
8068 tbuf = est_uconv_in(elem, strlen(elem), &tsiz);
8069 est_canonicalize_text((unsigned char *)tbuf, tsiz, TRUE);
8070 pbuf = est_uconv_out(tbuf, tsiz, &psiz);
8071 cblistover(terms, i, pbuf, -1);
8072 free(pbuf);
8073 free(tbuf);
8074 }
8075 for(i = CB_LISTNUM(terms) - 1; i >= 0; i--){
8076 elem = CB_LISTVAL(terms, i);
8077 if(strcmp(elem, ESTOPISECT) && strcmp(elem, ESTOPDIFF)) break;
8078 CB_LISTDROP(terms);
8079 }
8080 return terms;
8081 }
8082
8083
8084 /* Compare two scores by each ID for ascending order.
8085 `ap' specifies the pointer to one score.
8086 `bp' specifies the pointer to the other score.
8087 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_id_asc(const void * ap,const void * bp)8088 static int est_score_compare_by_id_asc(const void *ap, const void *bp){
8089 assert(ap && bp);
8090 return ((ESTSCORE *)ap)->id - ((ESTSCORE *)bp)->id;
8091 }
8092
8093
8094 /* Compare two scores by each ID for descending order.
8095 `ap' specifies the pointer to one score.
8096 `bp' specifies the pointer to the other score.
8097 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_id_desc(const void * ap,const void * bp)8098 static int est_score_compare_by_id_desc(const void *ap, const void *bp){
8099 assert(ap && bp);
8100 return ((ESTSCORE *)bp)->id - ((ESTSCORE *)ap)->id;
8101 }
8102
8103
8104 /* Compare two scores by each score point for ascending order.
8105 `ap' specifies the pointer to one score.
8106 `bp' specifies the pointer to the other score.
8107 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_score_asc(const void * ap,const void * bp)8108 static int est_score_compare_by_score_asc(const void *ap, const void *bp){
8109 assert(ap && bp);
8110 return ((ESTSCORE *)ap)->score - ((ESTSCORE *)bp)->score;
8111 }
8112
8113
8114 /* Compare two scores by each score point for descending order.
8115 `ap' specifies the pointer to one score.
8116 `bp' specifies the pointer to the other score.
8117 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_score_desc(const void * ap,const void * bp)8118 static int est_score_compare_by_score_desc(const void *ap, const void *bp){
8119 assert(ap && bp);
8120 return ((ESTSCORE *)bp)->score - ((ESTSCORE *)ap)->score;
8121 }
8122
8123
8124 /* Compare two scores by attributes of strings for ascending order.
8125 `ap' specifies the pointer to one score.
8126 `bp' specifies the pointer to the other score.
8127 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_str_asc(const void * ap,const void * bp)8128 static int est_score_compare_by_str_asc(const void *ap, const void *bp){
8129 assert(ap && bp);
8130 return strcmp(((ESTSCORE *)ap)->value, ((ESTSCORE *)bp)->value);
8131 }
8132
8133
8134 /* Compare two scores by attributes of strings for descending order.
8135 `ap' specifies the pointer to one score.
8136 `bp' specifies the pointer to the other score.
8137 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_str_desc(const void * ap,const void * bp)8138 static int est_score_compare_by_str_desc(const void *ap, const void *bp){
8139 assert(ap && bp);
8140 return strcmp(((ESTSCORE *)bp)->value, ((ESTSCORE *)ap)->value);
8141 }
8142
8143
8144 /* Compare two scores by attributes of numbers for ascending order.
8145 `ap' specifies the pointer to one score.
8146 `bp' specifies the pointer to the other score.
8147 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_num_asc(const void * ap,const void * bp)8148 static int est_score_compare_by_num_asc(const void *ap, const void *bp){
8149 assert(ap && bp);
8150 return (time_t)((ESTSCORE *)ap)->value - (time_t)((ESTSCORE *)bp)->value;
8151 }
8152
8153
8154 /* Compare two scores by attributes of numbers for descending order.
8155 `ap' specifies the pointer to one score.
8156 `bp' specifies the pointer to the other score.
8157 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_score_compare_by_num_desc(const void * ap,const void * bp)8158 static int est_score_compare_by_num_desc(const void *ap, const void *bp){
8159 assert(ap && bp);
8160 return (time_t)((ESTSCORE *)bp)->value - (time_t)((ESTSCORE *)ap)->value;
8161 }
8162
8163
8164 /* Compare two meta scores by each ID for ascending order.
8165 `ap' specifies the pointer to one meta score
8166 `bp' specifies the pointer to the other meta score
8167 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_id_asc(const void * ap,const void * bp)8168 static int est_metascore_compare_by_id_asc(const void *ap, const void *bp){
8169 assert(ap && bp);
8170 return ((ESTMETASCORE *)ap)->id - ((ESTMETASCORE *)bp)->id;
8171 }
8172
8173
8174 /* Compare two meta scores by each ID for descending order.
8175 `ap' specifies the pointer to one meta score
8176 `bp' specifies the pointer to the other meta score
8177 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_id_desc(const void * ap,const void * bp)8178 static int est_metascore_compare_by_id_desc(const void *ap, const void *bp){
8179 assert(ap && bp);
8180 return ((ESTMETASCORE *)bp)->id - ((ESTMETASCORE *)ap)->id;
8181 }
8182
8183
8184 /* Compare two meta scores by each score point for ascending order.
8185 `ap' specifies the pointer to one meta score
8186 `bp' specifies the pointer to the other meta score
8187 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_score_asc(const void * ap,const void * bp)8188 static int est_metascore_compare_by_score_asc(const void *ap, const void *bp){
8189 assert(ap && bp);
8190 return ((ESTMETASCORE *)ap)->score - ((ESTMETASCORE *)bp)->score;
8191 }
8192
8193
8194 /* Compare two meta scores by each score point for descending order.
8195 `ap' specifies the pointer to one meta score
8196 `bp' specifies the pointer to the other meta score
8197 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_score_desc(const void * ap,const void * bp)8198 static int est_metascore_compare_by_score_desc(const void *ap, const void *bp){
8199 assert(ap && bp);
8200 return ((ESTMETASCORE *)bp)->score - ((ESTMETASCORE *)ap)->score;
8201 }
8202
8203
8204 /* Compare two meta scores by attributes of strings for ascending order.
8205 `ap' specifies the pointer to one meta score
8206 `bp' specifies the pointer to the other meta score
8207 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_str_asc(const void * ap,const void * bp)8208 static int est_metascore_compare_by_str_asc(const void *ap, const void *bp){
8209 assert(ap && bp);
8210 return strcmp(((ESTMETASCORE *)ap)->value, ((ESTMETASCORE *)bp)->value);
8211 }
8212
8213
8214 /* Compare two meta scores by attributes of strings for descending order.
8215 `ap' specifies the pointer to one meta score
8216 `bp' specifies the pointer to the other meta score
8217 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_str_desc(const void * ap,const void * bp)8218 static int est_metascore_compare_by_str_desc(const void *ap, const void *bp){
8219 assert(ap && bp);
8220 return strcmp(((ESTMETASCORE *)bp)->value, ((ESTMETASCORE *)ap)->value);
8221 }
8222
8223
8224 /* Compare two meta scores by attributes of numbers for ascending order.
8225 `ap' specifies the pointer to one meta score
8226 `bp' specifies the pointer to the other meta score
8227 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_num_asc(const void * ap,const void * bp)8228 static int est_metascore_compare_by_num_asc(const void *ap, const void *bp){
8229 assert(ap && bp);
8230 return (time_t)((ESTMETASCORE *)ap)->value - (time_t)((ESTMETASCORE *)bp)->value;
8231 }
8232
8233
8234 /* Compare two meta scores by attributes of numbers for descending order.
8235 `ap' specifies the pointer to one meta score
8236 `bp' specifies the pointer to the other meta score
8237 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_metascore_compare_by_num_desc(const void * ap,const void * bp)8238 static int est_metascore_compare_by_num_desc(const void *ap, const void *bp){
8239 assert(ap && bp);
8240 return (time_t)((ESTMETASCORE *)bp)->value - (time_t)((ESTMETASCORE *)ap)->value;
8241 }
8242
8243
8244 /* Get the universal set of documents in a database.
8245 `db' specifies a database object.
8246 `nump' specifies the pointer to which the number of elements in the result is assigned.
8247 `hints' specifies a list object. If it is `NULL', it is not used.
8248 `add' specifies whether the result to be treated in union or difference.
8249 The return value is an array of score structures of corresponding documents. */
est_search_uvset(ESTDB * db,int * nump,CBMAP * hints,int add)8250 static ESTSCORE *est_search_uvset(ESTDB *db, int *nump, CBMAP *hints, int add){
8251 ESTSCORE *scores;
8252 char *vbuf, numbuf[ESTNUMBUFSIZ];
8253 int snum, smax;
8254 assert(db && nump);
8255 smax = ESTALLOCUNIT;
8256 CB_MALLOC(scores, smax * sizeof(ESTSCORE));
8257 snum = 0;
8258 vlcurfirst(db->listdb);
8259 while((vbuf = vlcurval(db->listdb, NULL)) != NULL){
8260 if(snum >= smax){
8261 smax *= 2;
8262 CB_REALLOC(scores, smax * sizeof(ESTSCORE));
8263 }
8264 scores[snum].id = atoi(vbuf);
8265 scores[snum].score = 0;
8266 scores[snum].value = NULL;
8267 snum++;
8268 free(vbuf);
8269 vlcurnext(db->listdb);
8270 }
8271 *nump = snum;
8272 if(hints){
8273 sprintf(numbuf, "%d", snum * (add ? 1 : -1));
8274 cbmapput(hints, ESTOPUVSET, -1, numbuf, -1, TRUE);
8275 }
8276 return scores;
8277 }
8278
8279
8280 /* Expand a word to words which begins with it.
8281 `db' specifies a database object.
8282 `word' specifies a word.
8283 `list' specifies a list object to contain the results. */
est_expand_word_bw(ESTDB * db,const char * word,CBLIST * list)8284 static void est_expand_word_bw(ESTDB *db, const char *word, CBLIST *list){
8285 const char *kbuf;
8286 int num, ksiz;
8287 assert(db && word && list);
8288 num = 0;
8289 vlcurjump(db->fwmdb, word, -1, VL_JFORWARD);
8290 while((kbuf = vlcurkeycache(db->fwmdb, &ksiz)) != NULL){
8291 if(!cbstrfwmatch(kbuf, word)) break;
8292 CB_LISTPUSH(list, kbuf, ksiz);
8293 if(++num >= db->wildmax) break;
8294 vlcurnext(db->fwmdb);
8295 }
8296 }
8297
8298
8299 /* Expand a word to words which ends with it.
8300 `db' specifies a database object.
8301 `word' specifies a word.
8302 `list' specifies a list object to contain the results. */
est_expand_word_ew(ESTDB * db,const char * word,CBLIST * list)8303 static void est_expand_word_ew(ESTDB *db, const char *word, CBLIST *list){
8304 const char *kbuf;
8305 int num, wsiz, ksiz;
8306 assert(db && word && list);
8307 num = 0;
8308 wsiz = strlen(word);
8309 vlcurfirst(db->fwmdb);
8310 while((kbuf = vlcurkeycache(db->fwmdb, &ksiz)) != NULL){
8311 if(ksiz >= wsiz && !memcmp(kbuf + ksiz - wsiz, word, wsiz)){
8312 CB_LISTPUSH(list, kbuf, ksiz);
8313 if(++num >= db->wildmax) break;
8314 }
8315 vlcurnext(db->fwmdb);
8316 }
8317 }
8318
8319
8320 /* Expand regular expressios to words which matches them.
8321 `db' specifies a database object.
8322 `word' specifies regular expressions.
8323 `list' specifies a list object to contain the results. */
est_expand_word_rx(ESTDB * db,const char * word,CBLIST * list)8324 static void est_expand_word_rx(ESTDB *db, const char *word, CBLIST *list){
8325 void *regex;
8326 const char *kbuf;
8327 int num, ksiz;
8328 assert(db && word && list);
8329 if(!(regex = est_regex_new(word))) return;
8330 num = 0;
8331 vlcurfirst(db->fwmdb);
8332 while((kbuf = vlcurkeycache(db->fwmdb, &ksiz)) != NULL){
8333 if(est_regex_match(regex, kbuf)){
8334 CB_LISTPUSH(list, kbuf, ksiz);
8335 if(++num >= db->wildmax) break;
8336 }
8337 vlcurnext(db->fwmdb);
8338 }
8339 est_regex_delete(regex);
8340 }
8341
8342
8343 /* Expand a keyword to keywords which begins with it.
8344 `db' specifies a database object.
8345 `word' specifies a word.
8346 `list' specifies a list object to contain the results. */
est_expand_keyword_bw(ESTDB * db,const char * word,CBLIST * list)8347 static void est_expand_keyword_bw(ESTDB *db, const char *word, CBLIST *list){
8348 const char *kbuf;
8349 int num, ksiz;
8350 assert(db && word && list);
8351 num = 0;
8352 vlcurjump(db->xfmdb, word, -1, VL_JFORWARD);
8353 while((kbuf = vlcurkeycache(db->xfmdb, &ksiz)) != NULL){
8354 if(!cbstrfwmatch(kbuf, word)) break;
8355 CB_LISTPUSH(list, kbuf, ksiz);
8356 if(++num >= db->wildmax) break;
8357 vlcurnext(db->xfmdb);
8358 }
8359 }
8360
8361
8362 /* Expand a keyword to keywords which ends with it.
8363 `db' specifies a database object.
8364 `word' specifies a word.
8365 `list' specifies a list object to contain the results. */
est_expand_keyword_ew(ESTDB * db,const char * word,CBLIST * list)8366 static void est_expand_keyword_ew(ESTDB *db, const char *word, CBLIST *list){
8367 const char *kbuf;
8368 int num, wsiz, ksiz;
8369 assert(db && word && list);
8370 num = 0;
8371 wsiz = strlen(word);
8372 vlcurfirst(db->xfmdb);
8373 while((kbuf = vlcurkeycache(db->xfmdb, &ksiz)) != NULL){
8374 if(ksiz >= wsiz && !memcmp(kbuf + ksiz - wsiz, word, wsiz)){
8375 CB_LISTPUSH(list, kbuf, ksiz);
8376 if(++num >= db->wildmax) break;
8377 }
8378 vlcurnext(db->xfmdb);
8379 }
8380 }
8381
8382
8383 /* Expand regular expressios to keywords which matches them.
8384 `db' specifies a database object.
8385 `word' specifies regular expressions.
8386 `list' specifies a list object to contain the results. */
est_expand_keyword_rx(ESTDB * db,const char * word,CBLIST * list)8387 static void est_expand_keyword_rx(ESTDB *db, const char *word, CBLIST *list){
8388 void *regex;
8389 const char *kbuf;
8390 int num, ksiz;
8391 assert(db && word && list);
8392 if(!(regex = est_regex_new(word))) return;
8393 num = 0;
8394 vlcurfirst(db->xfmdb);
8395 while((kbuf = vlcurkeycache(db->xfmdb, &ksiz)) != NULL){
8396 if(est_regex_match(regex, kbuf)){
8397 CB_LISTPUSH(list, kbuf, ksiz);
8398 if(++num >= db->wildmax) break;
8399 }
8400 vlcurnext(db->xfmdb);
8401 }
8402 est_regex_delete(regex);
8403 }
8404
8405
8406 /* Get a corresponding set of documents in a database.
8407 `db' specifies a database object.
8408 `term' specifies a union term.
8409 `gstep' specifies number of steps of N-gram.
8410 `xpn' specifies the pointer to a function for query expansion. If it is `NULL', it is not
8411 used.
8412 `nump' specifies the pointer to which the number of elements in the result is assigned.
8413 `hints' specifies a list object. If it is `NULL', it is not used.
8414 `add' specifies whether the result to be treated in union or difference.
8415 `auxmin' specifies the minimum hits to adopt the auxiliary index. If it is not more than 0,
8416 the auxiliary index is not used.
8417 `auxwords' specifies a map object where keywords used with the auxiliary index are stored. If
8418 it is `NULL', it is not used.
8419 The return value is an array of score structures of corresponding documents. */
est_search_union(ESTDB * db,const char * term,int gstep,void (* xpn)(const char *,CBLIST *),int * nump,CBMAP * hints,int add,int auxmin,CBMAP * auxwords)8420 static ESTSCORE *est_search_union(ESTDB *db, const char *term, int gstep,
8421 void (*xpn)(const char *, CBLIST *),
8422 int *nump, CBMAP *hints, int add, int auxmin, CBMAP *auxwords){
8423 const ESTSCORE *cscores;
8424 ESTSCORE *scores, *tscores, *nscores;
8425 CBMAP *umap;
8426 CBLIST *words, *grams, *tgrams;
8427 const char *ckey, *word, *gram, *rp, *fnext, *snext, *cbuf;
8428 char *vbuf, *wbuf, numbuf[ESTNUMBUFSIZ];
8429 int i, j, k, snum, smax, cksiz, single, tsmax, tsnum, nsnum, vsiz, gcnum;
8430 int gsiz, csiz, wgstep, nnum, west, wild, mfsiz, mssiz, mfhash, mshash, tfhash, tshash;
8431 int id, vstep, score, hit, hnum;
8432 double avg, sd, dif;
8433 assert(db && term && gstep > 0 && nump);
8434 smax = ESTALLOCUNIT;
8435 CB_MALLOC(scores, smax * sizeof(ESTSCORE));
8436 snum = 0;
8437 words = cbsplit(term, -1, "\t");
8438 if(xpn){
8439 umap = cbmapopenex(ESTMINIBNUM);
8440 for(i = 0; i < CB_LISTNUM(words); i++){
8441 word = CB_LISTVAL(words, i);
8442 if(word[0] == '\0' || word[0] == ' ') continue;
8443 CB_LISTOPEN(grams);
8444 xpn(word, grams);
8445 for(j = 0; j < CB_LISTNUM(grams); j++){
8446 word = CB_LISTVAL(grams, j);
8447 cbmapput(umap, word, -1, "", 0, FALSE);
8448 }
8449 CB_LISTCLOSE(grams);
8450 }
8451 CB_LISTCLOSE(words);
8452 words = cbmapkeys(umap);
8453 cbmapclose(umap);
8454 }
8455 for(i = 0; i < CB_LISTNUM(words); i++){
8456 ckey = CB_LISTVAL2(words, i, cksiz);
8457 if(cksiz < 1) continue;
8458 word = ckey;
8459 wbuf = NULL;
8460 if((cscores = est_rescc_get(db, ckey, cksiz, &tsnum)) != NULL){
8461 if(word[0] == ' '){
8462 word++;
8463 if(word[0] != '\0') word++;
8464 }
8465 if(hints){
8466 sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
8467 cbmapput(hints, word, -1, numbuf, -1, TRUE);
8468 }
8469 for(j = 0; j < tsnum; j++){
8470 if(snum >= smax){
8471 smax *= 2;
8472 CB_REALLOC(scores, smax * sizeof(ESTSCORE));
8473 }
8474 scores[snum].id = cscores[j].id;
8475 scores[snum].score = cscores[j].score;
8476 snum++;
8477 }
8478 } else if(!strchr(word + 1, ' ') && auxmin > 0 &&
8479 (tscores = est_search_keywords(db, word, auxmin, &tsnum)) != NULL){
8480 if(word[0] == ' '){
8481 word++;
8482 if(word[0] != '\0') word++;
8483 }
8484 if(hints){
8485 sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
8486 cbmapput(hints, word, -1, numbuf, -1, TRUE);
8487 }
8488 if(auxwords) cbmapput(auxwords, word, -1, "", 0, FALSE);
8489 for(j = 0; j < tsnum; j++){
8490 if(snum >= smax){
8491 smax *= 2;
8492 CB_REALLOC(scores, smax * sizeof(ESTSCORE));
8493 }
8494 scores[snum].id = tscores[j].id;
8495 scores[snum].score = tscores[j].score;
8496 snum++;
8497 }
8498 free(tscores);
8499 } else {
8500 wild = '\0';
8501 if(word[0] == ' '){
8502 word++;
8503 if(word[0] == 'b'){
8504 wild = 'b';
8505 word++;
8506 } else if(word[0] == 'e'){
8507 wild = 'e';
8508 word++;
8509 } else if(word[0] == 'r'){
8510 wild = 'r';
8511 word++;
8512 }
8513 }
8514 west = ((unsigned char *)word)[0] <= 0xdf;
8515 if(!west || db->amode) wild = '\0';
8516 single = FALSE;
8517 CB_LISTOPEN(grams);
8518 switch(wild){
8519 case 'b':
8520 est_break_text(word, grams, TRUE, FALSE);
8521 CB_LISTPUSH(grams, word, strlen(word));
8522 while(CB_LISTNUM(grams) > 1){
8523 CB_LISTDROP(grams);
8524 }
8525 wbuf = cbmemdup(CB_LISTVAL(grams, 0), -1);
8526 word = wbuf;
8527 est_expand_word_bw(db, word, grams);
8528 single = TRUE;
8529 break;
8530 case 'e':
8531 est_break_text(word, grams, TRUE, FALSE);
8532 cblistunshift(grams, word, -1);
8533 while(CB_LISTNUM(grams) > 1){
8534 free(cblistshift(grams, NULL));
8535 }
8536 wbuf = cbmemdup(CB_LISTVAL(grams, 0), -1);
8537 word = wbuf;
8538 est_expand_word_ew(db, word, grams);
8539 single = TRUE;
8540 break;
8541 case 'r':
8542 est_break_text(word, grams, TRUE, FALSE);
8543 while(CB_LISTNUM(grams) > 0){
8544 free(cblistshift(grams, NULL));
8545 }
8546 est_expand_word_rx(db, word, grams);
8547 single = TRUE;
8548 break;
8549 default:
8550 switch(db->amode){
8551 case ESTDFPERFNG:
8552 est_break_text_perfng(word, grams, TRUE, FALSE);
8553 break;
8554 case ESTDFCHRCAT:
8555 est_break_text_chrcat(word, grams, TRUE);
8556 break;
8557 default:
8558 est_break_text(word, grams, TRUE, FALSE);
8559 break;
8560 }
8561 if(CB_LISTNUM(grams) < 1){
8562 est_expand_word_bw(db, word, grams);
8563 single = TRUE;
8564 }
8565 break;
8566 }
8567 tsmax = ESTALLOCUNIT;
8568 CB_MALLOC(tscores, tsmax * sizeof(ESTSCORE));
8569 tsnum = 0;
8570 gcnum = 0;
8571 wgstep = !single && (CB_LISTNUM(grams) > 2 || gstep > 2) ? gstep : 1;
8572 if(west && gstep <= 2) wgstep = 1;
8573 for(j = 0; j < CB_LISTNUM(grams); j += wgstep){
8574 gcnum++;
8575 gram = CB_LISTVAL2(grams, j, gsiz);
8576 fnext = cblistval(grams, j + 1, &mfsiz);
8577 snext = cblistval(grams, j + 2, &mssiz);
8578 mfhash = fnext ? dpinnerhash(fnext, mfsiz) % ESTJHASHNUM + 1: 0xff;
8579 mshash = snext ? dpouterhash(snext, mssiz) % ESTJHASHNUM + 1: 0xff;
8580 vbuf = est_idx_scan(db->idxdb, gram, gsiz, &vsiz, db->smode);
8581 if((cbuf = cbmapget(db->idxcc, gram, gsiz, &csiz)) != NULL){
8582 CB_REALLOC(vbuf, vsiz + csiz + 1);
8583 memcpy(vbuf + vsiz, cbuf, csiz);
8584 vsiz += csiz;
8585 }
8586 rp = vbuf;
8587 while(rp < vbuf + vsiz){
8588 EST_READ_VNUMBUF(rp, id, vstep);
8589 rp += vstep;
8590 switch(db->smode){
8591 case ESTDFSCVOID:
8592 score = 0;
8593 break;
8594 default:
8595 score = *(unsigned char *)rp;
8596 rp++;
8597 break;
8598 case ESTDFSCINT:
8599 case ESTDFSCASIS:
8600 memcpy(&score, rp, sizeof(int));
8601 rp += sizeof(int);
8602 break;
8603 }
8604 hit = mfhash == 0xff && mshash == 0xff;
8605 while(rp < vbuf + vsiz && *(unsigned char *)rp != 0x00){
8606 tfhash = *(unsigned char *)rp;
8607 rp++;
8608 tshash = *(unsigned char *)rp;
8609 rp++;
8610 if((mfhash == 0xff || mfhash == tfhash) && (mshash == 0xff || mshash == tshash))
8611 hit = TRUE;
8612 }
8613 rp++;
8614 if(hit || single){
8615 if(tsnum >= tsmax){
8616 tsmax *= 2;
8617 CB_REALLOC(tscores, tsmax * sizeof(ESTSCORE));
8618 }
8619 tscores[tsnum].id = id;
8620 switch(db->smode){
8621 case ESTDFSCVOID:
8622 tscores[tsnum].score = rp - vbuf;
8623 break;
8624 default:
8625 tscores[tsnum].score = score * 100 + 10;
8626 break;
8627 case ESTDFSCASIS:
8628 tscores[tsnum].score = score;
8629 break;
8630 }
8631 tsnum++;
8632 }
8633 }
8634 free(vbuf);
8635 }
8636 if(CB_LISTNUM(grams) == 1 && !single && db->amode == 0 && *(unsigned char *)word < 0xe0){
8637 CB_LISTOPEN(tgrams);
8638 est_break_text(word, tgrams, TRUE, TRUE);
8639 if(CB_LISTNUM(tgrams) == 2){
8640 gram = CB_LISTVAL(tgrams, 1);
8641 nscores = est_search_union(db, gram, 1, NULL, &nsnum, NULL, TRUE, -1, NULL);
8642 for(j = 0; j < nsnum; j++){
8643 if(tsnum >= tsmax){
8644 tsmax *= 2;
8645 CB_REALLOC(tscores, tsmax * sizeof(ESTSCORE));
8646 }
8647 tscores[tsnum].id = nscores[j].id;
8648 tscores[tsnum].score = nscores[j].score;
8649 tsnum++;
8650 }
8651 free(nscores);
8652 gcnum++;
8653 }
8654 CB_LISTCLOSE(tgrams);
8655 }
8656 if(gcnum > 1){
8657 qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8658 nnum = 0;
8659 for(j = 0; j < tsnum; j++){
8660 id = tscores[j].id;
8661 score = tscores[j].score;
8662 hnum = 1;
8663 if(db->smode == ESTDFSCASIS){
8664 for(k = j + 1; k < tsnum && tscores[k].id == id; k++){
8665 hnum++;
8666 }
8667 if(hnum >= gcnum || single){
8668 tscores[nnum].id = id;
8669 tscores[nnum].score = score;
8670 nnum++;
8671 }
8672 } else {
8673 for(k = j + 1; k < tsnum && tscores[k].id == id; k++){
8674 score += tscores[k].score;
8675 hnum++;
8676 }
8677 if(hnum >= gcnum || single){
8678 tscores[nnum].id = id;
8679 tscores[nnum].score = score / hnum;
8680 nnum++;
8681 }
8682 }
8683 j = k - 1;
8684 }
8685 tsnum = nnum;
8686 }
8687 if(hints){
8688 sprintf(numbuf, "%d", tsnum * (add ? 1 : -1));
8689 cbmapput(hints, word, -1, numbuf, -1, TRUE);
8690 }
8691 CB_LISTCLOSE(grams);
8692 if(db->smode != ESTDFSCASIS && !strchr(word, ' ') && auxmin > 0)
8693 est_weight_keywords(db, word, tscores, tsnum);
8694 for(j = 0; j < tsnum; j++){
8695 if(snum >= smax){
8696 smax *= 2;
8697 CB_REALLOC(scores, smax * sizeof(ESTSCORE));
8698 }
8699 scores[snum].id = tscores[j].id;
8700 scores[snum].score = tscores[j].score;
8701 snum++;
8702 }
8703 est_rescc_put(db, ckey, cksiz, tscores, tsnum);
8704 }
8705 free(wbuf);
8706 }
8707 CB_LISTCLOSE(words);
8708 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8709 nnum = 0;
8710 for(i = 0; i < snum; i++){
8711 id = scores[i].id;
8712 score = scores[i].score;
8713 hnum = 1;
8714 for(j = i + 1; j < snum && scores[j].id == id; j++){
8715 score += scores[j].score;
8716 hnum++;
8717 }
8718 scores[nnum].id = id;
8719 scores[nnum].score = score / hnum;
8720 scores[nnum].value = NULL;
8721 nnum++;
8722 i = j - 1;
8723 }
8724 *nump = nnum;
8725 if(db->smode != ESTDFSCASIS && nnum > 0){
8726 avg = 0.0;
8727 for(i = 0; i < nnum; i++){
8728 avg += scores[i].score;
8729 }
8730 avg /= nnum;
8731 sd = 0.0;
8732 for(i = 0; i < nnum; i++){
8733 dif = avg - scores[i].score;
8734 sd += dif * dif;
8735 }
8736 sd /= nnum;
8737 sd = sqrt(sd);
8738 if(sd < 0.1){
8739 for(i = 0; i < nnum; i++){
8740 scores[i].score = ESTSCOREUNIT / 2;
8741 }
8742 } else {
8743 for(i = 0; i < nnum; i++){
8744 scores[i].score = (int)(((scores[i].score - avg) * (ESTSCOREUNIT / 10.0) / sd) +
8745 ESTSCOREUNIT / 2.0);
8746 }
8747 }
8748 }
8749 return scores;
8750 }
8751
8752
8753 /* Get scores in the result cache.
8754 `db' specifies a database object.
8755 `word' specifies a search word.
8756 `size' specifies the size of the word.
8757 `nump' specifies the pointer to which the number of elements in the result is assigned.
8758 The return value is an array whose elements are ID numbers of corresponding documents. */
est_rescc_get(ESTDB * db,const char * word,int size,int * nump)8759 static const ESTSCORE *est_rescc_get(ESTDB *db, const char *word, int size, int *nump){
8760 const char *vbuf;
8761 int vsiz;
8762 assert(db && word && size >= 0 && nump);
8763 if(!(vbuf = cbmapget(db->rescc, word, size, &vsiz))) return NULL;
8764 if(vsiz == sizeof(ESTSCORE) && ((ESTSCORE *)vbuf)->id == -1) return NULL;
8765 cbmapmove(db->rescc, word, size, FALSE);
8766 *nump = vsiz / sizeof(ESTSCORE);
8767 return (ESTSCORE *)vbuf;
8768 }
8769
8770
8771 /* Add scores into the result cache.
8772 `db' specifies a database object.
8773 `word' specifies a search word.
8774 `size' specifies the size of the word.
8775 `scores' specifies an array of scores. It is released in this function.
8776 `num' specifies the number of elements of the score array. */
est_rescc_put(ESTDB * db,const char * word,int size,ESTSCORE * scores,int num)8777 static void est_rescc_put(ESTDB *db, const char *word, int size, ESTSCORE *scores, int num){
8778 int i;
8779 assert(db && word && size >= 0 && scores && num >= 0);
8780 if(db->rcmnum < 1){
8781 free(scores);
8782 return;
8783 }
8784 cbmapput(db->rescc, word, size, (char *)scores, num * sizeof(ESTSCORE), TRUE);
8785 free(scores);
8786 if(cbmaprnum(db->rescc) > db->rcmnum){
8787 num = db->rcmnum * 0.1 + 1;
8788 cbmapiterinit(db->rescc);
8789 for(i = 0; i < num && (word = cbmapiternext(db->rescc, &size)) != NULL; i++){
8790 cbmapout(db->rescc, word, size);
8791 }
8792 }
8793 }
8794
8795
8796 /* Search the auxiliary index.
8797 `db' specifies a database object.
8798 `word' specifies a search word.
8799 `min' specifies the minimum hits to adopt the auxiliary index.
8800 `nump' specifies the pointer to which the number of elements in the result is assigned.
8801 The return value is an array of score structures of corresponding documents. */
est_search_keywords(ESTDB * db,const char * word,int min,int * nump)8802 static ESTSCORE *est_search_keywords(ESTDB *db, const char *word, int min, int *nump){
8803 ESTSCORE *scores;
8804 CBLIST *words;
8805 CBDATUM *rbuf;
8806 const int *res;
8807 int i, rnum, snum, wsiz, nnum, lid;
8808 assert(db && word && min >= 0 && nump);
8809 if(*word != ' ' && (res = (int *)vlgetcache(db->auxdb, word, -1, &rnum)) != NULL &&
8810 (rnum /= sizeof(int)) / 2 >= min){
8811 CB_MALLOC(scores, (rnum / 2) * sizeof(ESTSCORE) + 1);
8812 snum = 0;
8813 for(i = 0; i < rnum; i += 2){
8814 scores[snum].id = res[i];
8815 scores[snum].score = res[i+1];
8816 snum++;
8817 }
8818 *nump = snum;
8819 return scores;
8820 }
8821 CB_LISTOPEN(words);
8822 if(*word == ' '){
8823 word++;
8824 if(*word == 'b'){
8825 est_expand_keyword_bw(db, word + 1, words);
8826 } else if(*word == 'e'){
8827 est_expand_keyword_ew(db, word + 1, words);
8828 } else if(*word == 'r'){
8829 est_expand_keyword_rx(db, word + 1, words);
8830 }
8831 } else if(*(unsigned char *)word >= 0xe3){
8832 est_expand_keyword_bw(db, word, words);
8833 }
8834 CB_DATUMOPEN(rbuf);
8835 for(i = 0; i < CB_LISTNUM(words) &&
8836 CB_DATUMSIZE(rbuf) <= sizeof(int) * 2 * min * ESTAUXEXRAT; i++){
8837 word = CB_LISTVAL2(words, i, wsiz);
8838 if(!(res = (int *)vlgetcache(db->auxdb, word, wsiz, &rnum))) continue;
8839 CB_DATUMCAT(rbuf, (char *)res, rnum);
8840 }
8841 res = (int *)CB_DATUMPTR(rbuf);
8842 rnum = CB_DATUMSIZE(rbuf);
8843 if((rnum /= sizeof(int)) / 2 < min){
8844 CB_DATUMCLOSE(rbuf);
8845 CB_LISTCLOSE(words);
8846 return NULL;
8847 }
8848 CB_MALLOC(scores, (rnum / 2) * sizeof(ESTSCORE) + 1);
8849 snum = 0;
8850 for(i = 0; i < rnum; i += 2){
8851 scores[snum].id = res[i];
8852 scores[snum].score = res[i+1];
8853 snum++;
8854 }
8855 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8856 nnum = 0;
8857 lid = -1;
8858 for(i = 0; i < snum; i++){
8859 if(nnum > 0 && scores[i].id == lid){
8860 scores[nnum-1].score += scores[i].score;
8861 continue;
8862 }
8863 scores[nnum].id = scores[i].id;
8864 scores[nnum].score = scores[i].score;
8865 nnum++;
8866 lid = scores[i].id;
8867 }
8868 CB_DATUMCLOSE(rbuf);
8869 CB_LISTCLOSE(words);
8870 *nump = nnum;
8871 return scores;
8872 }
8873
8874
8875 /* Weight scores with the auxiliary index.
8876 `db' specifies a database object.
8877 `word' specifies a search word.
8878 `scores' specifies an array of scores of search candidates.
8879 `snum' specifies the number of the array. */
est_weight_keywords(ESTDB * db,const char * word,ESTSCORE * scores,int snum)8880 static void est_weight_keywords(ESTDB *db, const char *word, ESTSCORE *scores, int snum){
8881 ESTSCORE *kscores;
8882 const int *res;
8883 int i, knum, nnum;
8884 double rank;
8885 if(!(res = (int *)vlgetcache(db->auxdb, word, -1, &knum)) || knum < 2) return;
8886 knum /= sizeof(int);
8887 CB_MALLOC(kscores, knum / 2 * sizeof(ESTSCORE));
8888 rank = knum / 2 + 1;
8889 nnum = 0;
8890 for(i = 0; i < knum; i += 2){
8891 kscores[nnum].id = res[i];
8892 kscores[nnum].score = (pow(rank, 0.7) / 8.0 + 1.0) * 10000.0;
8893 nnum++;
8894 rank -= 1.0;
8895 }
8896 knum = nnum;
8897 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8898 qsort(kscores, knum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
8899 nnum = 0;
8900 for(i = 0; i < snum; i++){
8901 while(nnum < knum && kscores[nnum].id < scores[i].id){
8902 nnum++;
8903 }
8904 if(nnum < knum && kscores[nnum].id == scores[i].id)
8905 scores[i].score *= kscores[nnum].score / 10000.0;
8906 }
8907 free(kscores);
8908 }
8909
8910
8911 /* Get scores correspinding a ranking search with an attribute narrowing index.
8912 `db' specifies a database object.
8913 `name' specifies the name of an attribute.
8914 `nump' specifies the pointer to which the number of elements in the result is assigned.
8915 The return value is an array of score structures of corresponding documents. */
est_search_rank(ESTDB * db,const char * name,int top,int * nump)8916 static ESTSCORE *est_search_rank(ESTDB *db, const char *name, int top, int *nump){
8917 ESTATTRIDX *attridx;
8918 ESTSCORE *scores;
8919 const char *kbuf;
8920 int snum, ksiz, id;
8921 assert(db && name && nump);
8922 if(top == 0 || !(attridx = (ESTATTRIDX *)cbmapget(db->aidxs, name, -1, NULL)) ||
8923 (attridx->type != ESTIDXATTRSTR && attridx->type != ESTIDXATTRNUM)){
8924 *nump = 0;
8925 return cbmalloc(1);
8926 }
8927 snum = abs(top);
8928 if(snum > db->dnum) snum = db->dnum;
8929 CB_MALLOC(scores, snum * sizeof(ESTSCORE) + 1);
8930 snum = 0;
8931 if(top > 0){
8932 vlcurfirst(attridx->db);
8933 while(snum < top && (kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
8934 if(ksiz < sizeof(int)){
8935 vlcurnext(attridx->db);
8936 continue;
8937 }
8938 memcpy(&id, kbuf + ksiz - sizeof(int), sizeof(int));
8939 if(id < 1){
8940 vlcurnext(attridx->db);
8941 continue;
8942 }
8943 scores[snum].id = id;
8944 scores[snum].score = 0;
8945 scores[snum].value = NULL;
8946 snum++;
8947 vlcurnext(attridx->db);
8948 }
8949 } else {
8950 top *= -1;
8951 vlcurlast(attridx->db);
8952 while(snum < top && (kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
8953 if(ksiz < sizeof(int)){
8954 vlcurprev(attridx->db);
8955 continue;
8956 }
8957 memcpy(&id, kbuf + ksiz - sizeof(int), sizeof(int));
8958 if(id < 1){
8959 vlcurprev(attridx->db);
8960 continue;
8961 }
8962 scores[snum].id = id;
8963 scores[snum].score = 0;
8964 scores[snum].value = NULL;
8965 snum++;
8966 vlcurprev(attridx->db);
8967 }
8968 }
8969 *nump = snum;
8970 return scores;
8971 }
8972
8973
8974 /* Get scores correspinding an attribute expression with an attribute narrowing index.
8975 `db' specifies a database object.
8976 `expr' specifies an attribute search expression.
8977 `nump' specifies the pointer to which the number of elements in the result is assigned.
8978 The return value is an array of score structures of corresponding documents or `NULL' if no
8979 index is available. */
est_search_aidx_attr(ESTDB * db,const char * expr,int * nump)8980 static ESTSCORE *est_search_aidx_attr(ESTDB *db, const char *expr, int *nump){
8981 ESTATTRIDX *attridx;
8982 ESTSCORE *scores;
8983 CBDATUM *abuf;
8984 CBLIST *tokens;
8985 void *regex;
8986 const char *cop, *pv, *kbuf, *tbuf;
8987 unsigned char *utmp;
8988 char *name, *oper, *val, *sval, *wp, numbuf[ESTNUMBUFSIZ];
8989 int i, nsiz, vsiz, ksiz, tsiz, sign, ic, ssiz, esc, jmp, len, *ary, anum;
8990 time_t num, lower, upper;
8991 assert(db && expr && nump);
8992 name = NULL;
8993 oper = NULL;
8994 val = NULL;
8995 nsiz = 0;
8996 vsiz = 0;
8997 while(*expr > 0 && *expr <= ' '){
8998 expr++;
8999 }
9000 if((pv = strchr(expr, ' ')) != NULL){
9001 nsiz = pv - expr;
9002 name = cbmemdup(expr, nsiz);
9003 expr = pv;
9004 while(*expr > 0 && *expr <= ' '){
9005 expr++;
9006 }
9007 if((pv = strchr(expr, ' ')) != NULL){
9008 oper = cbmemdup(expr, pv - expr);
9009 expr = pv;
9010 while(*expr > 0 && *expr <= ' '){
9011 expr++;
9012 }
9013 vsiz = strlen(expr);
9014 val = cbmemdup(expr, vsiz);
9015 } else {
9016 oper = cbmemdup(expr, -1);
9017 }
9018 } else {
9019 nsiz = strlen(expr);
9020 name = cbmemdup(expr, nsiz);
9021 }
9022 if(!oper){
9023 oper = cbmemdup("", 0);
9024 }
9025 if(!val){
9026 vsiz = 0;
9027 val = cbmemdup("", 0);
9028 }
9029 cop = oper;
9030 if(*cop == '!'){
9031 sign = FALSE;
9032 cop++;
9033 } else {
9034 sign = TRUE;
9035 }
9036 if(*cop == 'I' || *cop == 'i'){
9037 ic = !est_check_cjk_only(val);
9038 cop++;
9039 } else {
9040 ic = FALSE;
9041 }
9042 regex = NULL;
9043 if(!cbstricmp(cop, ESTOPSTREQ)){
9044 cop = ESTOPSTREQ;
9045 } else if(!cbstricmp(cop, ESTOPSTRNE)){
9046 cop = ESTOPSTRNE;
9047 } else if(!cbstricmp(cop, ESTOPSTRINC)){
9048 cop = ESTOPSTRINC;
9049 } else if(!cbstricmp(cop, ESTOPSTRBW)){
9050 cop = ESTOPSTRBW;
9051 } else if(!cbstricmp(cop, ESTOPSTREW)){
9052 cop = ESTOPSTREW;
9053 } else if(!cbstricmp(cop, ESTOPSTRAND)){
9054 cop = ESTOPSTRAND;
9055 } else if(!cbstricmp(cop, ESTOPSTROR)){
9056 cop = ESTOPSTROR;
9057 } else if(!cbstricmp(cop, ESTOPSTROREQ)){
9058 cop = ESTOPSTROREQ;
9059 } else if(!cbstricmp(cop, ESTOPSTRRX)){
9060 cop = ESTOPSTRRX;
9061 regex = est_regex_new(val);
9062 } else if(!cbstricmp(cop, ESTOPNUMEQ)){
9063 cop = ESTOPNUMEQ;
9064 } else if(!cbstricmp(cop, ESTOPNUMNE)){
9065 cop = ESTOPNUMNE;
9066 } else if(!cbstricmp(cop, ESTOPNUMGT)){
9067 cop = ESTOPNUMGT;
9068 } else if(!cbstricmp(cop, ESTOPNUMGE)){
9069 cop = ESTOPNUMGE;
9070 } else if(!cbstricmp(cop, ESTOPNUMLT)){
9071 cop = ESTOPNUMLT;
9072 } else if(!cbstricmp(cop, ESTOPNUMLE)){
9073 cop = ESTOPNUMLE;
9074 } else if(!cbstricmp(cop, ESTOPNUMBT)){
9075 cop = ESTOPNUMBT;
9076 } else {
9077 cop = ESTOPSTRINC;
9078 val[0] = '\0';
9079 vsiz = 0;
9080 }
9081 num = cbstrmktime(val);
9082 if(!(attridx = (ESTATTRIDX *)cbmapget(db->aidxs, name, nsiz, NULL)) ||
9083 (attridx->type != ESTIDXATTRSTR && attridx->type != ESTIDXATTRNUM) ||
9084 (attridx->type == ESTIDXATTRNUM &&
9085 cop != ESTOPNUMEQ && cop != ESTOPNUMNE && cop != ESTOPNUMGT && cop != ESTOPNUMGE &&
9086 cop != ESTOPNUMLT && cop != ESTOPNUMLE && cop != ESTOPNUMBT)){
9087 if(regex) est_regex_delete(regex);
9088 free(val);
9089 free(oper);
9090 free(name);
9091 return NULL;
9092 }
9093 CB_DATUMOPEN(abuf);
9094 if(!sign || ic){
9095 if(ic){
9096 utmp = (unsigned char *)est_uconv_in(val, vsiz, &tsiz);
9097 est_normalize_text(utmp, tsiz, &tsiz);
9098 est_canonicalize_text(utmp, tsiz, FALSE);
9099 sval = (char *)est_uconv_out((char *)utmp, tsiz, &ssiz);
9100 free(utmp);
9101 } else {
9102 sval = NULL;
9103 ssiz = 0;
9104 }
9105 esc = INT_MAX;
9106 jmp = INT_MAX;
9107 if(sign && (cop == ESTOPSTREQ || cop == ESTOPSTRBW) && vsiz > 0){
9108 if(*sval > 0x0 && *sval < 0x7f){
9109 numbuf[0] = *sval;
9110 numbuf[1] = '\0';
9111 esc = *(unsigned char *)sval;
9112 if(*sval >= 'a' && *sval <= 'z'){
9113 numbuf[0] -= 'a' - 'A';
9114 jmp = *sval - 'a' + 'A';
9115 }
9116 vlcurjump(attridx->db, numbuf, 1, VL_JFORWARD);
9117 } else if(*(unsigned char *)sval >= 0xc0){
9118 numbuf[0] = *sval;
9119 numbuf[1] = '\0';
9120 esc = *(unsigned char *)sval;
9121 vlcurjump(attridx->db, numbuf, 1, VL_JFORWARD);
9122 } else {
9123 vlcurfirst(attridx->db);
9124 }
9125 } else {
9126 vlcurfirst(attridx->db);
9127 }
9128 while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
9129 if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
9130 cop, sign, val, vsiz, sval, ssiz, regex, num))
9131 CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
9132 if(*(unsigned char *)kbuf > jmp && *(unsigned char *)kbuf < *(unsigned char *)sval){
9133 numbuf[0] = *sval;
9134 numbuf[1] = '\0';
9135 vlcurjump(attridx->db, numbuf, 1, VL_JFORWARD);
9136 jmp = INT_MAX;
9137 } else if(*(unsigned char *)kbuf > esc){
9138 break;
9139 } else {
9140 vlcurnext(attridx->db);
9141 }
9142 }
9143 if(sval) free(sval);
9144 } else if(cop == ESTOPSTROREQ){
9145 tokens = cbsplit(val, vsiz, " ,");
9146 cblistsort(tokens);
9147 for(i = 0; i < CB_LISTNUM(tokens); i++){
9148 tbuf = CB_LISTVAL2(tokens, i, tsiz);
9149 vlcurjump(attridx->db, tbuf, tsiz, VL_JFORWARD);
9150 while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL && !strcmp(kbuf, tbuf)){
9151 CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
9152 vlcurnext(attridx->db);
9153 }
9154 }
9155 CB_LISTCLOSE(tokens);
9156 } else if(cop == ESTOPNUMBT){
9157 if((wp = strchr(val, ' ')) != NULL || (wp = strchr(val, '\t')) != NULL){
9158 *(wp++) = '\0';
9159 while(*wp == ' ' || *wp == '\t'){
9160 wp++;
9161 }
9162 lower = cbstrmktime(val);
9163 upper = cbstrmktime(wp);
9164 } else {
9165 lower = cbstrmktime(val);
9166 upper = INT_MAX;
9167 }
9168 len = sprintf(numbuf, "%.0f", (double)lower);
9169 vlcurjump(attridx->db, numbuf, len, VL_JFORWARD);
9170 while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL && cbstrmktime(kbuf) <= upper){
9171 CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
9172 vlcurnext(attridx->db);
9173 }
9174 } else {
9175 if(cop == ESTOPSTREQ || cop == ESTOPSTRBW ||
9176 cop == ESTOPNUMEQ || cop == ESTOPNUMGT || cop == ESTOPNUMGE){
9177 vlcurjump(attridx->db, val, vsiz, VL_JFORWARD);
9178 if(cop == ESTOPNUMGT){
9179 while((kbuf = vlcurkeycache(attridx->db, NULL)) != NULL && cbstrmktime(kbuf) <= num){
9180 vlcurnext(attridx->db);
9181 }
9182 }
9183 } else if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
9184 len = sprintf(numbuf, "%.0f", (double)cbstrmktime(val) + 1);
9185 vlcurjump(attridx->db, numbuf, len, VL_JBACKWARD);
9186 if(cop == ESTOPNUMLT){
9187 while((kbuf = vlcurkeycache(attridx->db, NULL)) != NULL && cbstrmktime(kbuf) >= num){
9188 vlcurprev(attridx->db);
9189 }
9190 }
9191 } else {
9192 vlcurfirst(attridx->db);
9193 }
9194 while((kbuf = vlcurkeycache(attridx->db, &ksiz)) != NULL){
9195 if(est_match_attr(kbuf, ksiz - sizeof(int) - 1,
9196 cop, TRUE, val, vsiz, NULL, 0, regex, num)){
9197 CB_DATUMCAT(abuf, kbuf + ksiz - sizeof(int), sizeof(int));
9198 } else if(cop == ESTOPSTREQ || cop == ESTOPSTRBW || cop == ESTOPNUMEQ){
9199 break;
9200 }
9201 if(cop == ESTOPNUMLT || cop == ESTOPNUMLE){
9202 vlcurprev(attridx->db);
9203 } else {
9204 vlcurnext(attridx->db);
9205 }
9206 }
9207 }
9208 ary = (int *)CB_DATUMPTR(abuf);
9209 anum = CB_DATUMSIZE(abuf) / sizeof(int);
9210 CB_MALLOC(scores, anum * sizeof(ESTSCORE) + 1);
9211 for(i = 0; i < anum; i++){
9212 scores[i].id = ary[i];
9213 scores[i].score = 0;
9214 scores[i].value = NULL;
9215 }
9216 *nump = anum;
9217 CB_DATUMCLOSE(abuf);
9218 if(regex) est_regex_delete(regex);
9219 free(val);
9220 free(oper);
9221 free(name);
9222 return scores;
9223 }
9224
9225
9226 /* Get a correspinding set of documents in pseudo indexes.
9227 `db' specifies a database object.
9228 `cond' specifies a search condition object.
9229 `scores' specifies an array of scores of search candidates.
9230 `nump' specifies the pointer to which the number of elements in the parameter and result is
9231 assigned.
9232 `ordattrs' specifies a map object into which ordering attributes are stored.
9233 The return value is an array of re-allocated score structures. */
est_search_pidxs(ESTDB * db,ESTCOND * cond,ESTSCORE * scores,int * nump,CBMAP * ordattrs)9234 static ESTSCORE *est_search_pidxs(ESTDB *db, ESTCOND *cond, ESTSCORE *scores, int *nump,
9235 CBMAP *ordattrs){
9236 ESTCATTR *list;
9237 ESTDOC *doc;
9238 const char *otype, *lbuf, *vbuf;
9239 char *oname, *wp;
9240 int i, j, k, snum, anum, id, hit, sc, miss, lsiz, vsiz;
9241 double avg, sd, dif, tune;
9242 assert(db && cond && scores && nump && ordattrs);
9243 snum = *nump;
9244 CB_REALLOC(scores, (snum + CB_LISTNUM(db->pdocs)) * sizeof(ESTSCORE) + 1);
9245 if(cond->phrase){
9246 if(cbstrfwmatch(cond->phrase, ESTOPID)){
9247 return scores;
9248 } else if(cbstrfwmatch(cond->phrase, ESTOPURI)){
9249 return scores;
9250 } else if(cbstrfwmatch(cond->phrase, ESTOPSIMILAR)){
9251 return scores;
9252 }
9253 }
9254 oname = NULL;
9255 otype = NULL;
9256 if(cond->order){
9257 oname = cbmemdup(cond->order, -1);
9258 cbstrtrim(oname);
9259 otype = ESTORDSTRA;
9260 if((wp = strchr(oname, ' ')) != NULL){
9261 *(wp++) = '\0';
9262 while(*wp == ' '){
9263 wp++;
9264 }
9265 otype = wp;
9266 }
9267 }
9268 list = NULL;
9269 anum = -1;
9270 if(cond->attrs) list = est_make_cattr_list(cond->attrs, &anum);
9271 for(i = 0; i < CB_LISTNUM(db->pdocs); i++){
9272 id = ESTPDOCIDMIN + i;
9273 hit = FALSE;
9274 sc = 0;
9275 doc = NULL;
9276 if(!cond->phrase || cond->phrase[0] == '\0'){
9277 hit = cond->attrs ? TRUE : FALSE;
9278 } else if(cbstrfwmatch(cond->phrase, ESTOPUVSET)){
9279 hit = TRUE;
9280 } else {
9281 if((doc = est_db_get_doc(db, id, 0)) != NULL){
9282 hit = est_db_score_doc(db, doc, cond, &sc);
9283 } else {
9284 hit = FALSE;
9285 }
9286 }
9287 if(hit && list){
9288 if(!doc && !(doc = est_db_get_doc(db, id, 0))){
9289 hit = FALSE;
9290 } else {
9291 miss = FALSE;
9292 for(j = 0; !miss && j < anum; j++){
9293 if(list[j].nsiz < 1) continue;
9294 if(list[j].nlist){
9295 hit = FALSE;
9296 for(k = 0; k < CB_LISTNUM(list[j].nlist); k++){
9297 lbuf = CB_LISTVAL2(list[j].nlist, k, lsiz);
9298 if(lsiz < 1) continue;
9299 if(!(vbuf = cbmapget(doc->attrs, lbuf, lsiz, &vsiz))) continue;
9300 if(est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
9301 list[j].sval, list[j].ssiz, list[j].regex, list[j].num)){
9302 hit = TRUE;
9303 break;
9304 }
9305 }
9306 if(!hit) miss = TRUE;
9307 } else if(!(vbuf = cbmapget(doc->attrs, list[j].name, list[j].nsiz, &vsiz))){
9308 miss = TRUE;
9309 } else if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign,
9310 list[j].val, list[j].vsiz, list[j].sval, list[j].ssiz,
9311 list[j].regex, list[j].num)){
9312 miss = TRUE;
9313 }
9314 }
9315 hit = !miss;
9316 }
9317 }
9318 if(hit){
9319 scores[snum].id = id;
9320 scores[snum].score = sc;
9321 scores[snum].value = NULL;
9322 snum++;
9323 if(oname && (doc || (doc = est_db_get_doc(db, id, 0)) != NULL)){
9324 if(!(vbuf = cbmapget(doc->attrs, oname, -1, &vsiz))){
9325 vbuf = "";
9326 vsiz = 0;
9327 }
9328 cbmapput(ordattrs, (char *)&id, sizeof(int), vbuf, vsiz, FALSE);
9329 }
9330 }
9331 if(doc) est_doc_delete(doc);
9332 }
9333 if(list) est_free_cattr_list(list, anum);
9334 if(oname) free(oname);
9335 if(db->smode != ESTDFSCASIS && snum > *nump){
9336 avg = 0.0;
9337 for(i = *nump; i < snum; i++){
9338 avg += scores[i].score;
9339 }
9340 avg /= snum - *nump;
9341 sd = 0.0;
9342 for(i = *nump; i < snum; i++){
9343 dif = avg - scores[i].score;
9344 sd += dif * dif;
9345 }
9346 sd /= snum - *nump;
9347 sd = sqrt(sd);
9348 if(sd < 0.1){
9349 for(i = *nump; i < snum; i++){
9350 scores[i].score = ESTSCOREUNIT / 2;
9351 }
9352 } else {
9353 for(i = *nump; i < snum; i++){
9354 scores[i].score = (int)(((scores[i].score - avg) * (ESTSCOREUNIT / 10.0) / sd) +
9355 ESTSCOREUNIT / 2.0);
9356 }
9357 }
9358 if(cond->tfidf){
9359 tune = pow(snum - *nump + 64, 0.4);
9360 for(i = *nump; i < snum; i++){
9361 scores[i].score *= 100.0 / tune;
9362 }
9363 } else {
9364 for(i = *nump; i < snum; i++){
9365 scores[i].score *= 10;
9366 }
9367 }
9368 }
9369 *nump = snum;
9370 return scores;
9371 }
9372
9373
9374 /* Narrow and sort scores of search candidates.
9375 `db' specifies a database object.
9376 `attrs' specifies a list object of narrowing attributes.
9377 `ign' specifies the offset of an attribute to be ignored.
9378 `order' specifies an expression for sorting.
9379 `distinct' specifies the name of the distinct attribute.
9380 `scores' specifies an array of scores of search candidates.
9381 `snum' specifies the number of the array.
9382 `limit' specifies the limit number to check.
9383 `restp' specifies the pointer to a variable to which rest number to be checked is assigned.
9384 `ordattrs' specifies a map object of cached ordering attributes.
9385 The return value is the new number of the array. */
est_narrow_scores(ESTDB * db,const CBLIST * attrs,int ign,const char * order,const char * distinct,ESTSCORE * scores,int snum,int limit,int * restp,CBMAP * ordattrs)9386 static int est_narrow_scores(ESTDB *db, const CBLIST *attrs, int ign,
9387 const char *order, const char *distinct, ESTSCORE *scores, int snum,
9388 int limit, int *restp, CBMAP *ordattrs){
9389 ESTCATTR *list;
9390 ESTATTRIDX *attridx;
9391 CBMAP *umap;
9392 const char *otype, *cbuf, *ibuf, *lbuf;
9393 char *oname, *wp, *mbuf, *vbuf;
9394 int i, j, k, ci, oi, anum, done, mixed, nnum, csiz, msiz;
9395 int miss, vsiz, num, isiz, lsiz, hit, onlen, dnlen;
9396 time_t tval;
9397 assert(db && scores && snum >= 0 && limit >= 0 && restp && ordattrs);
9398 *restp = 0;
9399 ci = -1;
9400 oi = -1;
9401 oname = NULL;
9402 otype = NULL;
9403 if(order){
9404 oname = cbmemdup(order, -1);
9405 cbstrtrim(oname);
9406 otype = ESTORDSTRA;
9407 if((wp = strchr(oname, ' ')) != NULL){
9408 *(wp++) = '\0';
9409 while(*wp == ' '){
9410 wp++;
9411 }
9412 otype = wp;
9413 }
9414 }
9415 if(attrs){
9416 list = est_make_cattr_list(attrs, &anum);
9417 if(cbmaprnum(db->aidxs) > 0){
9418 done = TRUE;
9419 mixed = FALSE;
9420 for(i = 0; i < anum; i++){
9421 if(i == ign) continue;
9422 if(!(attridx = (ESTATTRIDX *)cbmapget(db->aidxs, list[i].name, list[i].nsiz, NULL)) ||
9423 (attridx->type == ESTIDXATTRNUM &&
9424 list[i].cop != ESTOPNUMEQ && list[i].cop != ESTOPNUMNE &&
9425 list[i].cop != ESTOPNUMGT && list[i].cop != ESTOPNUMGE &&
9426 list[i].cop != ESTOPNUMLT && list[i].cop != ESTOPNUMLE &&
9427 list[i].cop != ESTOPNUMBT) ||
9428 (attridx->type != ESTIDXATTRSEQ && snum < ESTAISNUMMIN)){
9429 done = FALSE;
9430 continue;
9431 }
9432 switch(attridx->type){
9433 case ESTIDXATTRSTR:
9434 case ESTIDXATTRNUM:
9435 snum = est_aidx_attr_narrow(attridx->db, db->pdocs, list[i].cop, list[i].sign,
9436 list[i].val, list[i].vsiz, list[i].sval, list[i].ssiz,
9437 list[i].regex, list[i].num, scores, snum);
9438 mixed = TRUE;
9439 break;
9440 default:
9441 if(done && i == anum - 1 && !order && mixed){
9442 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
9443 mixed = FALSE;
9444 }
9445 snum = est_aidx_seq_narrow(attridx->db, db->pdocs, list[i].cop, list[i].sign,
9446 list[i].val, list[i].vsiz, list[i].sval, list[i].ssiz,
9447 list[i].regex, list[i].num, scores, snum,
9448 done && i == anum - 1 ? limit : INT_MAX, restp);
9449 break;
9450 }
9451 list[i].cop = ESTOPDUMMY;
9452 }
9453 if(mixed && !order) qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
9454 } else {
9455 done = FALSE;
9456 }
9457 if(db->spacc){
9458 for(i = 0; i < anum; i++){
9459 if(!strcmp(list[i].name, db->scname)){
9460 ci = i;
9461 break;
9462 }
9463 }
9464 }
9465 if(oname){
9466 for(i = 0; i < anum; i++){
9467 if(!strcmp(list[i].name, oname)){
9468 oi = i;
9469 break;
9470 }
9471 }
9472 }
9473 if(!done){
9474 nnum = 0;
9475 for(i = 0; i < snum; i++){
9476 if(nnum >= limit){
9477 *restp = snum - i;
9478 break;
9479 }
9480 scores[i].value = NULL;
9481 if(ci >= 0){
9482 if((cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL)
9483 cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
9484 } else {
9485 cbuf = NULL;
9486 csiz = 0;
9487 }
9488 mbuf = NULL;
9489 if(scores[i].id >= ESTPDOCIDMIN){
9490 scores[nnum++] = scores[i];
9491 } else if((cbuf && anum == 1) ||
9492 (mbuf = est_crget(db->attrdb, db->zmode, scores[i].id, &msiz)) != NULL){
9493 miss = FALSE;
9494 for(j = 0; !miss && j < anum; j++){
9495 if(list[j].nsiz < 1) continue;
9496 if(list[j].nlist){
9497 hit = FALSE;
9498 for(k = 0; k < CB_LISTNUM(list[j].nlist); k++){
9499 lbuf = CB_LISTVAL2(list[j].nlist, k, lsiz);
9500 if(lsiz < 1) continue;
9501 if(!(vbuf = cbmaploadone(mbuf, msiz, lbuf, lsiz, &vsiz))) continue;
9502 if(est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign, list[j].val, list[j].vsiz,
9503 list[j].sval, list[j].ssiz, list[j].regex, list[j].num)){
9504 hit = TRUE;
9505 free(vbuf);
9506 break;
9507 }
9508 free(vbuf);
9509 }
9510 if(!hit) miss = TRUE;
9511 vbuf = NULL;
9512 } else {
9513 if(mbuf){
9514 vbuf = cbmaploadone(mbuf, msiz, list[j].name, list[j].nsiz, &vsiz);
9515 } else if(csiz != 1 || cbuf[0] != '\0'){
9516 vbuf = cbmemdup(cbuf, csiz);
9517 vsiz = csiz;
9518 } else {
9519 vbuf = NULL;
9520 }
9521 if(list[j].oper[0] == '\0'){
9522 if(!vbuf) miss = TRUE;
9523 } else {
9524 if(!vbuf){
9525 vbuf = cbmemdup("", 0);
9526 vsiz = 0;
9527 }
9528 if(!est_match_attr(vbuf, vsiz, list[j].cop, list[j].sign,
9529 list[j].val, list[j].vsiz, list[j].sval, list[j].ssiz,
9530 list[j].regex, list[j].num)) miss = TRUE;
9531 }
9532 }
9533 if(j == ci && !cbuf){
9534 if(vbuf){
9535 cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
9536 } else {
9537 cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
9538 }
9539 if(cbmaprnum(db->spacc) > db->scmnum){
9540 num = db->scmnum * 0.1 + 1;
9541 cbmapiterinit(db->spacc);
9542 for(k = 0; k < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; k++){
9543 cbmapout(db->spacc, ibuf, isiz);
9544 }
9545 }
9546 }
9547 if(j == oi){
9548 scores[i].value = vbuf;
9549 } else {
9550 free(vbuf);
9551 }
9552 }
9553 if(miss){
9554 free(scores[i].value);
9555 } else {
9556 scores[nnum++] = scores[i];
9557 }
9558 }
9559 free(mbuf);
9560 }
9561 snum = nnum;
9562 } else {
9563 for(i = 0; i < snum; i++){
9564 scores[i].value = NULL;
9565 }
9566 }
9567 est_free_cattr_list(list, anum);
9568 } else {
9569 for(i = 0; i < snum; i++){
9570 scores[i].value = NULL;
9571 }
9572 }
9573 if(oname){
9574 if(!cbstricmp(oname, ESTORDIDA)){
9575 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
9576 } else if(!cbstricmp(oname, ESTORDIDD)){
9577 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_desc);
9578 } else if(!cbstricmp(oname, ESTORDSCA)){
9579 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_asc);
9580 } else if(!cbstricmp(oname, ESTORDSCD)){
9581 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
9582 } else {
9583 ci = db->spacc && !strcmp(oname, db->scname);
9584 onlen = strlen(oname);
9585 attridx = (ESTATTRIDX *)cbmapget(db->aidxs, oname, onlen, NULL);
9586 if(attridx && attridx->type != ESTIDXATTRSEQ) attridx = NULL;
9587 for(i = 0; i < snum; i++){
9588 if(scores[i].value) continue;
9589 if(ci &&
9590 (cbuf = cbmapget(db->spacc, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
9591 cbmapmove(db->spacc, (char *)&(scores[i].id), sizeof(int), FALSE);
9592 if(csiz == 1 && cbuf[0] == '\0'){
9593 scores[i].value = cbmemdup("", 0);
9594 } else {
9595 scores[i].value = cbmemdup(cbuf, csiz);
9596 }
9597 continue;
9598 }
9599 if((cbuf = cbmapget(ordattrs, (char *)&(scores[i].id), sizeof(int), &csiz)) != NULL){
9600 scores[i].value = cbmemdup(cbuf, csiz);
9601 continue;
9602 }
9603 if(attridx){
9604 if(!(vbuf = est_aidx_seq_get(attridx->db, scores[i].id, &vsiz))) vbuf = cbmemdup("", 0);
9605 scores[i].value = vbuf;
9606 continue;
9607 }
9608 if((mbuf = est_crget(db->attrdb, db->zmode, scores[i].id, &msiz)) != NULL){
9609 if((vbuf = cbmaploadone(mbuf, msiz, oname, onlen, &vsiz)) != NULL){
9610 if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), vbuf, vsiz, FALSE);
9611 scores[i].value = vbuf;
9612 } else {
9613 if(ci) cbmapput(db->spacc, (char *)&(scores[i].id), sizeof(int), "", 1, FALSE);
9614 scores[i].value = cbmemdup("", 0);
9615 }
9616 if(ci && cbmaprnum(db->spacc) > db->scmnum){
9617 num = db->scmnum * 0.1 + 1;
9618 cbmapiterinit(db->spacc);
9619 for(j = 0; j < num && (ibuf = cbmapiternext(db->spacc, &isiz)) != NULL; j++){
9620 cbmapout(db->spacc, ibuf, isiz);
9621 }
9622 }
9623 free(mbuf);
9624 } else {
9625 scores[i].value = cbmemdup("", 0);
9626 }
9627 }
9628 if(!cbstricmp(otype, ESTORDSTRA)){
9629 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_asc);
9630 } else if(!cbstricmp(otype, ESTORDSTRD)){
9631 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_str_desc);
9632 } else if(!cbstricmp(otype, ESTORDNUMA)){
9633 for(i = 0; i < snum; i++){
9634 tval = cbstrmktime(scores[i].value);
9635 free(scores[i].value);
9636 scores[i].value = (void *)tval;
9637 }
9638 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_asc);
9639 for(i = 0; i < snum; i++){
9640 scores[i].value = NULL;
9641 }
9642 } else if(!cbstricmp(otype, ESTORDNUMD)){
9643 for(i = 0; i < snum; i++){
9644 tval = cbstrmktime(scores[i].value);
9645 free(scores[i].value);
9646 scores[i].value = (void *)tval;
9647 }
9648 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_num_desc);
9649 for(i = 0; i < snum; i++){
9650 scores[i].value = NULL;
9651 }
9652 }
9653 for(i = 0; i < snum; i++){
9654 free(scores[i].value);
9655 }
9656 }
9657 free(oname);
9658 }
9659 if(distinct){
9660 if(!order && *distinct != '~')
9661 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
9662 if(*distinct == '~') distinct++;
9663 dnlen = strlen(distinct);
9664 umap = cbmapopenex(snum + 1);
9665 attridx = (ESTATTRIDX *)cbmapget(db->aidxs, distinct, dnlen, NULL);
9666 if(attridx && attridx->type != ESTIDXATTRSEQ) attridx = NULL;
9667 nnum = 0;
9668 for(i = 0; i < snum; i++){
9669 if(scores[i].id >= ESTPDOCIDMIN){
9670 if(!(vbuf = est_db_get_doc_attr(db, scores[i].id, distinct))) vbuf = cbmemdup("", 0);
9671 vsiz = strlen(vbuf);
9672 } else if(attridx){
9673 if(!(vbuf = est_aidx_seq_get(attridx->db, scores[i].id, &vsiz))){
9674 vbuf = cbmemdup("", 0);
9675 vsiz = 0;
9676 }
9677 } else {
9678 if((mbuf = est_crget(db->attrdb, db->zmode, scores[i].id, &msiz)) != NULL){
9679 if(!(vbuf = cbmaploadone(mbuf, msiz, distinct, dnlen, &vsiz))){
9680 vbuf = cbmemdup("", 0);
9681 vsiz = 0;
9682 }
9683 free(mbuf);
9684 } else {
9685 vbuf = cbmemdup("", 0);
9686 vsiz = 0;
9687 }
9688 }
9689 if(cbmapput(umap, vbuf, vsiz, "", 0, FALSE)) scores[nnum++] = scores[i];
9690 free(vbuf);
9691 }
9692 snum = nnum;
9693 cbmapclose(umap);
9694 }
9695 return snum;
9696 }
9697
9698
9699 /* Make a list of condition attributes.
9700 `attrs' specifies a list object of attribute expressions.
9701 `nump' specifies the pointer to which the number of elements in the result is assigned.
9702 The return value is a list of condition attributes. */
est_make_cattr_list(const CBLIST * attrs,int * nump)9703 static ESTCATTR *est_make_cattr_list(const CBLIST *attrs, int *nump){
9704 ESTCATTR *list;
9705 const char *rp, *pv;
9706 unsigned char *utmp;
9707 int i, anum, tsiz;
9708 assert(attrs && nump);
9709 anum = CB_LISTNUM(attrs);
9710 CB_MALLOC(list, sizeof(ESTCATTR) * anum + 1);
9711 for(i = 0; i < anum; i++){
9712 list[i].name = NULL;
9713 list[i].oper = NULL;
9714 list[i].val = NULL;
9715 rp = CB_LISTVAL(attrs, i);
9716 while(*rp > 0 && *rp <= ' '){
9717 rp++;
9718 }
9719 if((pv = strchr(rp, ' ')) != NULL){
9720 list[i].nsiz = pv - rp;
9721 list[i].name = cbmemdup(rp, list[i].nsiz);
9722 rp = pv;
9723 while(*rp > 0 && *rp <= ' '){
9724 rp++;
9725 }
9726 if((pv = strchr(rp, ' ')) != NULL){
9727 list[i].oper = cbmemdup(rp, pv - rp);
9728 rp = pv;
9729 while(*rp > 0 && *rp <= ' '){
9730 rp++;
9731 }
9732 list[i].vsiz = strlen(rp);
9733 list[i].val = cbmemdup(rp, list[i].vsiz);
9734 } else {
9735 list[i].oper = cbmemdup(rp, -1);
9736 }
9737 } else {
9738 list[i].nsiz = strlen(rp);
9739 list[i].name = cbmemdup(rp, list[i].nsiz);
9740 }
9741 if(strchr(list[i].name, ',')){
9742 list[i].nlist = cbsplit(list[i].name, list[i].nsiz, ",");
9743 } else {
9744 list[i].nlist = NULL;
9745 }
9746 if(!list[i].oper){
9747 list[i].oper = cbmemdup("", 0);
9748 }
9749 if(!list[i].val){
9750 list[i].vsiz = 0;
9751 list[i].val = cbmemdup("", 0);
9752 }
9753 }
9754 for(i = 0; i < anum; i++){
9755 rp = list[i].oper;
9756 if(*rp == '!'){
9757 list[i].sign = FALSE;
9758 rp++;
9759 } else {
9760 list[i].sign = TRUE;
9761 }
9762 if(*rp == 'I' || *rp == 'i'){
9763 if(est_check_cjk_only(list[i].val)){
9764 list[i].sval = NULL;
9765 list[i].ssiz = 0;
9766 } else {
9767 utmp = (unsigned char *)est_uconv_in(list[i].val, list[i].vsiz, &tsiz);
9768 est_normalize_text(utmp, tsiz, &tsiz);
9769 est_canonicalize_text(utmp, tsiz, FALSE);
9770 list[i].sval = (char *)est_uconv_out((char *)utmp, tsiz, &(list[i].ssiz));
9771 free(utmp);
9772 }
9773 rp++;
9774 } else {
9775 list[i].sval = NULL;
9776 list[i].ssiz = 0;
9777 }
9778 list[i].regex = NULL;
9779 list[i].num = cbstrmktime(list[i].val);
9780 if(!cbstricmp(rp, ESTOPSTREQ)){
9781 list[i].cop = ESTOPSTREQ;
9782 } else if(!cbstricmp(rp, ESTOPSTRNE)){
9783 list[i].cop = ESTOPSTRNE;
9784 } else if(!cbstricmp(rp, ESTOPSTRINC)){
9785 list[i].cop = ESTOPSTRINC;
9786 } else if(!cbstricmp(rp, ESTOPSTRBW)){
9787 list[i].cop = ESTOPSTRBW;
9788 } else if(!cbstricmp(rp, ESTOPSTREW)){
9789 list[i].cop = ESTOPSTREW;
9790 } else if(!cbstricmp(rp, ESTOPSTRAND)){
9791 list[i].cop = ESTOPSTRAND;
9792 } else if(!cbstricmp(rp, ESTOPSTROR)){
9793 list[i].cop = ESTOPSTROR;
9794 } else if(!cbstricmp(rp, ESTOPSTROREQ)){
9795 list[i].cop = ESTOPSTROREQ;
9796 } else if(!cbstricmp(rp, ESTOPSTRRX)){
9797 list[i].cop = ESTOPSTRRX;
9798 list[i].regex = list[i].sval ? est_regex_new(list[i].sval) : est_regex_new(list[i].val);
9799 } else if(!cbstricmp(rp, ESTOPNUMEQ)){
9800 list[i].cop = ESTOPNUMEQ;
9801 } else if(!cbstricmp(rp, ESTOPNUMNE)){
9802 list[i].cop = ESTOPNUMNE;
9803 } else if(!cbstricmp(rp, ESTOPNUMGT)){
9804 list[i].cop = ESTOPNUMGT;
9805 } else if(!cbstricmp(rp, ESTOPNUMGE)){
9806 list[i].cop = ESTOPNUMGE;
9807 } else if(!cbstricmp(rp, ESTOPNUMLT)){
9808 list[i].cop = ESTOPNUMLT;
9809 } else if(!cbstricmp(rp, ESTOPNUMLE)){
9810 list[i].cop = ESTOPNUMLE;
9811 } else if(!cbstricmp(rp, ESTOPNUMBT)){
9812 list[i].cop = ESTOPNUMBT;
9813 } else {
9814 list[i].cop = ESTOPSTRINC;
9815 list[i].val[0] = '\0';
9816 list[i].vsiz = 0;
9817 if(list[i].sval){
9818 list[i].sval[0] = '\0';
9819 list[i].ssiz = 0;
9820 }
9821 }
9822 }
9823 *nump = anum;
9824 return list;
9825 }
9826
9827
9828 /* Release resources of a list of condition attributes.
9829 `list' specifies a list of condition attributes.
9830 `anum' specifies the number of elements of the list. */
est_free_cattr_list(ESTCATTR * list,int anum)9831 static void est_free_cattr_list(ESTCATTR *list, int anum){
9832 int i;
9833 assert(list && anum >= 0);
9834 for(i = 0; i < anum; i++){
9835 if(list[i].regex) est_regex_delete(list[i].regex);
9836 free(list[i].sval);
9837 free(list[i].val);
9838 free(list[i].oper);
9839 if(list[i].nlist) CB_LISTCLOSE(list[i].nlist);
9840 free(list[i].name);
9841 }
9842 free(list);
9843 }
9844
9845
9846 /* Narrow and sort scores of search candidates.
9847 `db' specifies a database object.
9848 `scores' specifies an array of scores of search candidates.
9849 `snum' specifies the number of the array.
9850 `num' specifies the number of documents to be shown.
9851 `max' specifies the maximum number of shown documents.
9852 `vnum' specifies the number of dimensions of the vector.
9853 `tfidf' specifies whether to perform TF-IDF tuning.
9854 `limit' specifies the upper limit of similarity for documents to survive.
9855 `opts' specifies optoins for eclipse.
9856 `shadows' specifies a map object to store shadow document information.
9857 The return value is the new number of the array. */
est_eclipse_scores(ESTDB * db,ESTSCORE * scores,int snum,int num,int vnum,int tfidf,double limit,CBMAP * shadows)9858 static int est_eclipse_scores(ESTDB *db, ESTSCORE *scores, int snum, int num,
9859 int vnum, int tfidf, double limit, CBMAP *shadows){
9860 CBMAP *svmap, *tvmap;
9861 const char *suri, *turi;
9862 char *tmp;
9863 int i, j, ubase, simurl, max, *svec, *tvec, pair[2], nnum;
9864 double dval;
9865 assert(db && scores && snum >= 0 && num >= 0 && vnum > 0 && limit > 0.0 && shadows);
9866 ubase = FALSE;
9867 simurl = FALSE;
9868 if(limit == ESTECLSERV || limit == ESTECLDIR || limit == ESTECLFILE){
9869 ubase = TRUE;
9870 } else if(limit >= ESTECLSIMURL){
9871 simurl = TRUE;
9872 limit -= ESTECLSIMURL;
9873 if(limit < 0.01) limit = 0.01;
9874 if(limit > 1.0) limit = 1.0;
9875 }
9876 nnum = 0;
9877 if(ubase){
9878 if(limit == ESTECLSERV){
9879 max = num * 14.8 + 8;
9880 } else if(limit == ESTECLDIR){
9881 max = num * 6.8 + 8;
9882 } else {
9883 max = num * 4.8 + 8;
9884 }
9885 if(max > snum) max = snum;
9886 for(i = 0; i < max; i++){
9887 scores[i].value = est_db_get_doc_attr(db, scores[i].id, ESTDATTRURI);
9888 }
9889 for(i = 0; i < max; i++){
9890 if(!scores[i].value) continue;
9891 for(j = i + 1; j < max; j++){
9892 dval = 0.0;
9893 if(scores[j].value){
9894 switch(est_url_sameness(scores[i].value, scores[j].value)){
9895 case 1:
9896 dval = ESTECLSERV;
9897 break;
9898 case 2:
9899 dval = ESTECLDIR;
9900 break;
9901 case 3:
9902 dval = ESTECLFILE;
9903 break;
9904 }
9905 }
9906 if(dval >= limit){
9907 free(scores[j].value);
9908 scores[j].value = NULL;
9909 pair[0] = scores[j].id;
9910 pair[1] = 0;
9911 cbmapputcat(shadows, (char *)&(scores[i].id), sizeof(int),
9912 (char *)pair, sizeof(int) * 2);
9913 }
9914 }
9915 }
9916 for(i = 0; i < max; i++){
9917 if(scores[i].value){
9918 free(scores[i].value);
9919 scores[nnum++] = scores[i];
9920 }
9921 }
9922 for(i = max; i < snum; i++){
9923 scores[nnum++] = scores[i];
9924 }
9925 } else {
9926 max = limit < 0.1 ? snum : num * ((2.4 / (limit - 0.05)) + 0.8) + 8;
9927 if(simurl) max *= 1.4;
9928 if(max > snum) max = snum;
9929 CB_MALLOC(svec, vnum * sizeof(int));
9930 CB_MALLOC(tvec, vnum * sizeof(int));
9931 for(i = 0; i < max; i++){
9932 if((svmap = est_get_tvmap(db, scores[i].id, vnum, tfidf)) != NULL){
9933 scores[i].value = (char *)svmap;
9934 if(simurl && (tmp = est_db_get_doc_attr(db, scores[i].id, ESTDATTRURI)) != NULL){
9935 cbmapput(svmap, "", 0, tmp, -1, TRUE);
9936 free(tmp);
9937 }
9938 } else {
9939 scores[i].value = NULL;
9940 }
9941 }
9942 for(i = 0; i < max; i++){
9943 svmap = (CBMAP *)(scores[i].value);
9944 if(!svmap || cbmaprnum(svmap) < 1) continue;
9945 suri = cbmapget((CBMAP *)scores[i].value, "", -1, NULL);
9946 if(num-- < 1) continue;
9947 est_vector_set_seed(svmap, svec, vnum);
9948 for(j = i + 1; j < max; j++){
9949 tvmap = (CBMAP *)(scores[j].value);
9950 if(!tvmap || cbmaprnum(tvmap) < 1) continue;
9951 est_vector_set_target(svmap, tvmap, tvec, vnum);
9952 dval = est_vector_cosine(svec, tvec, vnum);
9953 if(dval > 0.01 && suri &&
9954 (turi = cbmapget((CBMAP *)scores[j].value, "", -1, NULL)) != NULL){
9955 switch(est_url_sameness(suri, turi)){
9956 default:
9957 dval = pow(cos(acos(dval) * (1.0 - pow(dval, 9.9))), 1.07);
9958 break;
9959 case 1:
9960 dval = pow(cos(acos(dval) * (1.0 - pow(dval, 4.1))), 1.05);
9961 break;
9962 case 2:
9963 dval = pow(cos(acos(dval) * (1.0 - pow(dval, 2.9))), 1.03);
9964 break;
9965 case 3:
9966 dval = pow(cos(acos(dval) * (1.0 - pow(dval, 2.1))), 1.01);
9967 break;
9968 }
9969 }
9970 if(dval > limit){
9971 cbmapclose(tvmap);
9972 scores[j].value = NULL;
9973 pair[0] = scores[j].id;
9974 pair[1] = (int)(dval * 10000.0);
9975 cbmapputcat(shadows, (char *)&(scores[i].id), sizeof(int),
9976 (char *)pair, sizeof(int) * 2);
9977 }
9978 }
9979 }
9980 for(i = 0; i < max; i++){
9981 if(scores[i].value){
9982 cbmapclose((CBMAP *)(scores[i].value));
9983 scores[nnum++] = scores[i];
9984 }
9985 }
9986 for(i = max; i < snum; i++){
9987 scores[nnum++] = scores[i];
9988 }
9989 free(tvec);
9990 free(svec);
9991 }
9992 return nnum;
9993 }
9994
9995
9996 /* Check whether a score matches an attribute condition.
9997 `tval' specifies the target value;
9998 `tsiz' specifies the size of the target value
9999 `cop' specifies the pointer to the operator.
10000 `sign' specifies the sign of operation.
10001 `oval' specifies the operation value.
10002 `osiz' specifies the size of the operation value
10003 `sval' specifies the operation value of small cases.
10004 `ssiz' specifies the size of the operation value of small cases.
10005 `regex' specifies the regular expressions.
10006 `onum' specifies the numeric value.
10007 The return value is true if it does match, else it is false. */
est_match_attr(const char * tval,int tsiz,const char * cop,int sign,const char * oval,int osiz,const char * sval,int ssiz,const void * regex,int onum)10008 static int est_match_attr(const char *tval, int tsiz, const char *cop, int sign,
10009 const char *oval, int osiz, const char *sval, int ssiz,
10010 const void *regex, int onum){
10011 unsigned char *eval;
10012 char *cval;
10013 int csiz, esiz, hit;
10014 assert(tval && tsiz >= 0 && oval && osiz >= 0);
10015 cval = NULL;
10016 if(sval){
10017 eval = (unsigned char *)est_uconv_in(tval, tsiz, &esiz);
10018 est_normalize_text(eval, esiz, &esiz);
10019 est_canonicalize_text(eval, esiz, FALSE);
10020 cval = (char *)est_uconv_out((char *)eval, esiz, &csiz);
10021 free(eval);
10022 tval = cval;
10023 tsiz = csiz;
10024 oval = sval;
10025 osiz = ssiz;
10026 }
10027 if(cop == ESTOPSTREQ){
10028 hit = !strcmp(tval, oval);
10029 } else if(cop == ESTOPSTRNE){
10030 hit = strcmp(tval, oval) != 0;
10031 } else if(cop == ESTOPSTRINC){
10032 hit = strstr(tval, oval) != NULL;
10033 } else if(cop == ESTOPSTRBW){
10034 hit = cbstrfwmatch(tval, oval);
10035 } else if(cop == ESTOPSTREW){
10036 hit = cbstrbwmatch(tval, oval);
10037 } else if(cop == ESTOPSTRAND){
10038 hit = est_check_strand(tval, oval);
10039 } else if(cop == ESTOPSTROR){
10040 hit = est_check_stror(tval, oval);
10041 } else if(cop == ESTOPSTROREQ){
10042 hit = est_check_stroreq(tval, oval);
10043 } else if(cop == ESTOPSTRRX){
10044 hit = regex ? est_regex_match(regex, tval) : FALSE;
10045 } else if(cop == ESTOPNUMEQ){
10046 hit = cbstrmktime(tval) == onum;
10047 } else if(cop == ESTOPNUMNE){
10048 hit = cbstrmktime(tval) != onum;
10049 } else if(cop == ESTOPNUMGT){
10050 hit = cbstrmktime(tval) > onum;
10051 } else if(cop == ESTOPNUMGE){
10052 hit = cbstrmktime(tval) >= onum;
10053 } else if(cop == ESTOPNUMLT){
10054 hit = cbstrmktime(tval) < onum;
10055 } else if(cop == ESTOPNUMLE){
10056 hit = cbstrmktime(tval) <= onum;
10057 } else if(cop == ESTOPNUMBT){
10058 hit = est_check_numbt(tval, oval);
10059 } else if(cop == ESTOPDUMMY){
10060 hit = TRUE;
10061 } else {
10062 hit = FALSE;
10063 }
10064 free(cval);
10065 return sign ? hit : !hit;
10066 }
10067
10068
10069 /* Check whether a string includes all tokens in another string.
10070 `tval' specifies the target value;
10071 `oval' specifies the operation value;
10072 The return value is the result of the check. */
est_check_strand(const char * tval,const char * oval)10073 static int est_check_strand(const char *tval, const char *oval){
10074 const char *sp, *ep, *rp, *pp, *qp;
10075 int hit;
10076 assert(tval && oval);
10077 sp = oval;
10078 while(*sp != '\0'){
10079 while(*sp == ' ' || *sp == ','){
10080 sp++;
10081 }
10082 ep = sp;
10083 while(*ep != '\0' && *ep != ' ' && *ep != ','){
10084 ep++;
10085 }
10086 if(ep > sp){
10087 hit = FALSE;
10088 for(rp = tval; *rp != '\0'; rp++){
10089 for(pp = sp, qp = rp; pp < ep; pp++, qp++){
10090 if(*pp != *qp) break;
10091 }
10092 if(pp == ep && (*qp == '\0' || *qp == ' ' || *qp == ',')){
10093 hit = TRUE;
10094 break;
10095 }
10096 }
10097 if(!hit) return FALSE;
10098 }
10099 sp = ep;
10100 }
10101 return TRUE;
10102 }
10103
10104
10105 /* Check whether a string includes at least one token in another string.
10106 `tval' specifies the target value;
10107 `oval' specifies the operation value;
10108 The return value is the result of the check. */
est_check_stror(const char * tval,const char * oval)10109 static int est_check_stror(const char *tval, const char *oval){
10110 const char *sp, *ep, *rp, *pp, *qp;
10111 int hit;
10112 assert(tval && oval);
10113 sp = oval;
10114 while(*sp != '\0'){
10115 while(*sp == ' ' || *sp == ','){
10116 sp++;
10117 }
10118 ep = sp;
10119 while(*ep != '\0' && *ep != ' ' && *ep != ','){
10120 ep++;
10121 }
10122 if(ep > sp){
10123 hit = FALSE;
10124 for(rp = tval; *rp != '\0'; rp++){
10125 for(pp = sp, qp = rp; pp < ep; pp++, qp++){
10126 if(*pp != *qp) break;
10127 }
10128 if(pp == ep && (*qp == '\0' || *qp == ' ' || *qp == ',')){
10129 hit = TRUE;
10130 break;
10131 }
10132 }
10133 if(hit) return TRUE;
10134 }
10135 sp = ep;
10136 }
10137 return FALSE;
10138 }
10139
10140
10141 /* Check whether a string is equal to at least one token in another string.
10142 `tval' specifies the target value;
10143 `oval' specifies the operation value;
10144 The return value is the result of the check. */
est_check_stroreq(const char * tval,const char * oval)10145 static int est_check_stroreq(const char *tval, const char *oval){
10146 const char *sp, *ep, *rp;
10147 assert(tval && oval);
10148 sp = oval;
10149 while(*sp != '\0'){
10150 while(*sp == ' ' || *sp == ','){
10151 sp++;
10152 }
10153 ep = sp;
10154 while(*ep != '\0' && *ep != ' ' && *ep != ','){
10155 ep++;
10156 }
10157 if(ep > sp){
10158 for(rp = tval; *rp != '\0'; rp++){
10159 if(*sp != *rp || sp >= ep) break;
10160 sp++;
10161 }
10162 if(*rp == '\0' && sp == ep) return TRUE;
10163 }
10164 sp = ep;
10165 }
10166 return FALSE;
10167 }
10168
10169
10170 /* Check whether a decimal string is between two tokens in another string.
10171 `tval' specifies the target value;
10172 `oval' specifies the operation value;
10173 The return value is the result of the check. */
est_check_numbt(const char * tval,const char * oval)10174 static int est_check_numbt(const char *tval, const char *oval){
10175 time_t val, lower, upper, swap;
10176 char numbuf[ESTNUMBUFSIZ];
10177 int i;
10178 for(i = 0; i < ESTNUMBUFSIZ && oval[i] != '\0' && oval[i] != ' ' && oval[i] != '\t'; i++){
10179 numbuf[i] = oval[i];
10180 }
10181 numbuf[i] = '\0';
10182 oval += i;
10183 while(*oval == ' ' || *oval == '\t'){
10184 oval++;
10185 }
10186 if(*oval == '\0') return FALSE;
10187 val = cbstrmktime(tval);
10188 lower = cbstrmktime(numbuf);
10189 upper = cbstrmktime(oval);
10190 if(lower > upper){
10191 swap = lower;
10192 lower = upper;
10193 upper = swap;
10194 }
10195 return val >= lower && val <= upper;
10196 }
10197
10198
10199 /* Compare two keywords by scores in descending order.
10200 `ap' specifies the pointer to one keyword.
10201 `bp' specifies the pointer to the other keyword.
10202 The return value is negative if one is small, positive if one is big, 0 if both are equal. */
est_keysc_compare(const void * ap,const void * bp)10203 static int est_keysc_compare(const void *ap, const void *bp){
10204 assert(ap && bp);
10205 return ((ESTKEYSC *)bp)->pt - ((ESTKEYSC *)ap)->pt;
10206 }
10207
10208
10209 /* Get a similar set of documents in a database.
10210 `db' specifies a database object.
10211 `svmap' specifies a map object of a seed vector.
10212 `nump' specifies the pointer to which the number of elements in the result is assigned.
10213 `knum' specifies the number of keywords to get candidates.
10214 `unum' specifies the number of adopted documents for a keyword.
10215 `tfidf' specifies whether to perform TF-IDF tuning.
10216 `nmin' specifies the minimum value for narrowing.
10217 `auxmin' specifies the minimum hits to adopt the auxiliary index. If it is not more than 0,
10218 the auxiliary index is not used.
10219 `auxwords' specifies a map object where keywords used with the auxiliary index are stored. If
10220 it is `NULL', it is not used.
10221 The return value is an array of score structures of corresponding documents. */
est_search_similar(ESTDB * db,CBMAP * svmap,int * nump,int knum,int unum,int mnum,int tfidf,double nmin,int auxmin,CBMAP * auxwords)10222 static ESTSCORE *est_search_similar(ESTDB *db, CBMAP *svmap, int *nump,
10223 int knum, int unum, int mnum, int tfidf,
10224 double nmin, int auxmin, CBMAP *auxwords){
10225 ESTSCORE *scores, *tscores;
10226 CBMAP *tvmap;
10227 const char *word;
10228 int i, j, vnum, snum, tmax, tsnum, nnum, lid, *svec, *tvec;
10229 double dval;
10230 assert(db && svmap && nump && knum >= 0 && unum >= 0 && nmin >= 0.0);
10231 CB_MALLOC(scores, sizeof(ESTSCORE) * (unum * knum + CB_LISTNUM(db->pdocs)) + 1);
10232 snum = 0;
10233 if((vnum = cbmaprnum(svmap)) < 1) vnum = 1;
10234 cbmapiterinit(svmap);
10235 tmax = unum;
10236 for(i = 0; (i < knum || (i < knum * 2 && snum < unum * 2)) &&
10237 (word = cbmapiternext(svmap, NULL)) != NULL; i++){
10238 while(*word > '\0' && *word <= ' '){
10239 word++;
10240 }
10241 tscores = est_search_union(db, word, 1, NULL, &tsnum, NULL, TRUE, auxmin, auxwords);
10242 qsort(tscores, tsnum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
10243 for(j = 0; j < tmax && j < tsnum; j++){
10244 scores[snum].id = tscores[j].id;
10245 scores[snum].score = tscores[j].score * (knum * 2.2 - i);
10246 snum++;
10247 }
10248 free(tscores);
10249 tmax -= unum / knum / 1.25;
10250 if(tmax < unum / 4) tmax = unum / 4;
10251 }
10252 for(i = 0; i < CB_LISTNUM(db->pdocs); i++){
10253 scores[snum].id = ESTPDOCIDMIN + i;
10254 scores[snum].score = 1;
10255 snum++;
10256 }
10257 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_id_asc);
10258 nnum = 0;
10259 lid = -1;
10260 for(i = 0; i < snum; i++){
10261 if(nnum > 0 && scores[i].id == lid){
10262 scores[nnum-1].score += scores[i].score;
10263 continue;
10264 }
10265 scores[nnum].id = scores[i].id;
10266 scores[nnum].score = scores[i].score;
10267 nnum++;
10268 lid = scores[i].id;
10269 }
10270 snum = nnum;
10271 qsort(scores, snum, sizeof(ESTSCORE), est_score_compare_by_score_desc);
10272 nnum = 0;
10273 CB_MALLOC(svec, vnum * sizeof(int));
10274 CB_MALLOC(tvec, vnum * sizeof(int));
10275 est_vector_set_seed(svmap, svec, vnum);
10276 for(i = 0; i < snum && nnum < mnum; i++){
10277 tvmap = est_get_tvmap(db, scores[i].id, vnum, tfidf);
10278 if(tvmap){
10279 est_vector_set_target(svmap, tvmap, tvec, vnum);
10280 if((dval = est_vector_cosine(svec, tvec, vnum)) >= nmin){
10281 scores[nnum].id = scores[i].id;
10282 scores[nnum].score = (int)(dval * 10000);
10283 if(scores[nnum].score == 9999) scores[nnum].score = 10000;
10284 scores[nnum].value = NULL;
10285 nnum++;
10286 }
10287 cbmapclose(tvmap);
10288 }
10289 }
10290 free(tvec);
10291 free(svec);
10292 snum = nnum;
10293 *nump = snum;
10294 return scores;
10295 }
10296
10297
10298 /* Create a map object of a vector for similar search from a phrase.
10299 `phrase' specifies a search phrase for similar search.
10300 The return value is a map object of the seed vector. */
est_phrase_vector(const char * phrase)10301 static CBMAP *est_phrase_vector(const char *phrase){
10302 ESTKEYSC *scores;
10303 CBMAP *svmap;
10304 CBLIST *list;
10305 const char *pv, *rp;
10306 char *utext, *rtext;
10307 int i, num, len, size;
10308 svmap = cbmapopenex(ESTMINIBNUM);
10309 CB_LISTOPEN(list);
10310 while(*phrase != '\0'){
10311 if(*phrase == ESTOPWITH[0] && cbstrfwmatch(phrase, ESTOPWITH)){
10312 phrase += strlen(ESTOPWITH);
10313 pv = phrase;
10314 while(*phrase != '\0'){
10315 if(*phrase <= ' ' && cbstrfwmatch(phrase + 1, ESTOPWITH)){
10316 phrase++;
10317 break;
10318 }
10319 phrase++;
10320 }
10321 CB_LISTPUSH(list, pv, phrase - pv);
10322 } else {
10323 phrase++;
10324 }
10325 }
10326 for(i = 0; i < CB_LISTNUM(list); i++){
10327 pv = CB_LISTVAL(list, i);
10328 while(*pv > '\0' && *pv <= ' '){
10329 pv++;
10330 }
10331 num = strtol(pv, (char **)&rp, 10);
10332 if(rp && (len = rp - pv) > 0 && num >= 0){
10333 utext = est_uconv_in(rp, strlen(rp), &size);
10334 est_normalize_text((unsigned char *)utext, size, &size);
10335 est_canonicalize_text((unsigned char *)utext, size, FALSE);
10336 rtext = est_uconv_out(utext, size, NULL);
10337 cbstrsqzspc(rtext);
10338 if(rtext[0] != '\0') cbmapput(svmap, rtext, -1, pv, len, FALSE);
10339 free(rtext);
10340 free(utext);
10341 }
10342 }
10343 CB_LISTCLOSE(list);
10344 CB_MALLOC(scores, cbmaprnum(svmap) * sizeof(ESTKEYSC) + 1);
10345 cbmapiterinit(svmap);
10346 for(i = 0; (rp = cbmapiternext(svmap, &len)) != NULL; i++){
10347 scores[i].word = rp;
10348 scores[i].wsiz = len;
10349 scores[i].pt = atoi(cbmapiterval(rp, NULL));
10350 }
10351 qsort(scores, i, sizeof(ESTKEYSC), est_keysc_compare);
10352 for(i--; i >= 0; i--){
10353 cbmapmove(svmap, scores[i].word, scores[i].wsiz, TRUE);
10354 }
10355 free(scores);
10356 return svmap;
10357 }
10358
10359
10360 /* Get the target vector of a document dynamically.
10361 `db' specifies a database object.
10362 `id' specifies the ID of a document.
10363 `vnum' specifies the number of dimensions of the vector.
10364 `tfidf' specifies whether to perform TF-IDF tuning.
10365 The return value is a map object of the target vector. */
est_get_tvmap(ESTDB * db,int id,int vnum,int tfidf)10366 static CBMAP *est_get_tvmap(ESTDB *db, int id, int vnum, int tfidf){
10367 ESTDOC *doc;
10368 CBMAP *tvmap;
10369 assert(db && id > 0);
10370 if((tvmap = est_db_get_keywords(db, id)) != NULL) return tvmap;
10371 if(!(doc = est_db_get_doc(db, id, 0))) return NULL;
10372 tvmap = est_db_etch_doc(tfidf ? db : NULL, doc, vnum);
10373 est_doc_delete(doc);
10374 if(dpwritable(db->metadb)) est_db_put_keywords(db, id, tvmap, 1.0);
10375 return tvmap;
10376 }
10377
10378
/* Calculate sameness of two URLs.
   `aurl' specifies one URL.
   `burl' specifies the other URL.
   The return value is 0 if the both have different servers, 1 if the both have the same server,
   2 if the both have the same parent directory, 3 if the both have the same file.  A URL
   without "://" is never the same as another.  The part after `?' is ignored and a URL
   without the path is regarded as having the path "/". */
static int est_url_sameness(const char *aurl, const char *burl){
  const char *ahost, *bhost, *apath, *bpath;
  size_t alen, blen;
  assert(aurl && burl);
  ahost = strstr(aurl, "://");
  bhost = strstr(burl, "://");
  if(!ahost || !bhost) return 0;
  ahost += 3;
  bhost += 3;
  /* compare the server parts, which end at the first slash */
  alen = strcspn(ahost, "/");
  blen = strcspn(bhost, "/");
  if(alen != blen || memcmp(ahost, bhost, alen) != 0) return 0;
  /* compare the paths except for the query strings */
  apath = ahost[alen] == '\0' ? "/" : ahost + alen;
  bpath = bhost[blen] == '\0' ? "/" : bhost + blen;
  alen = strcspn(apath, "?");
  blen = strcspn(bpath, "?");
  if(alen == blen && memcmp(apath, bpath, alen) == 0) return 3;
  /* compare the directories, which end just before the last slash */
  while(alen > 0 && apath[alen-1] != '/'){
    alen--;
  }
  if(alen > 0) alen--;
  while(blen > 0 && bpath[blen-1] != '/'){
    blen--;
  }
  if(blen > 0) blen--;
  if(alen == blen && memcmp(apath, bpath, alen) == 0) return 2;
  return 1;
}
10421
10422
10423 /* Close the handle to the file of random number generator. */
est_random_fclose(void)10424 static void est_random_fclose(void){
10425 if(est_random_ifp) fclose(est_random_ifp);
10426 }
10427
10428
10429 /* Dispatch a signal to the corresponding handler.
10430 Signum specifies the number of catched signal. */
est_signal_dispatch(int signum)10431 static int est_signal_dispatch(int signum){
10432 #if defined(_SYS_MSVC_) || defined(_SYS_MINGW_)
10433 switch(signum){
10434 case CTRL_C_EVENT: case CTRL_BREAK_EVENT: case CTRL_CLOSE_EVENT:
10435 signum = 2;
10436 break;
10437 case CTRL_LOGOFF_EVENT: case CTRL_SHUTDOWN_EVENT:
10438 signum = 15;
10439 break;
10440 }
10441 if(est_signal_handlers[signum]) est_signal_handlers[signum](signum);
10442 return TRUE;
10443 #else
10444 assert(signum >= 0);
10445 if(est_signal_handlers[signum]) est_signal_handlers[signum](signum);
10446 return TRUE;
10447 #endif
10448 }
10449
10450
10451
10452 /* END OF FILE */
10453