1 /*************************************************************************************************
2 * Implementation of Odeum
3 * Copyright (C) 2000-2007 Mikio Hirabayashi
4 * This file is part of QDBM, Quick Database Manager.
5 * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU
6 * Lesser General Public License as published by the Free Software Foundation; either version
7 * 2.1 of the License or any later version. QDBM is distributed in the hope that it will be
8 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
9 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
10 * details.
11 * You should have received a copy of the GNU Lesser General Public License along with QDBM; if
12 * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
13 * 02111-1307 USA.
14 *************************************************************************************************/
15
16
17 #define QDBM_INTERNAL 1
18
19 #include "odeum.h"
20 #include "myconf.h"
21
22 #define OD_NAMEMAX 256 /* max size of a database name */
23 #define OD_DIRMODE 00755 /* permission of a creating directory */
24 #define OD_PATHBUFSIZ 1024 /* size of a path buffer */
25 #define OD_NUMBUFSIZ 32 /* size of a buffer for a number */
26 #define OD_MAPPBNUM 127 /* bucket size of a petit map handle */
27 #define OD_DOCSNAME "docs" /* name of the database for documents */
28 #define OD_INDEXNAME "index" /* name of the database for inverted index */
29 #define OD_RDOCSNAME "rdocs" /* name of the database for reverse dictionary */
30 #define OD_DOCSBNUM 2039 /* initial bucket number of document database */
31 #define OD_DOCSDNUM 17 /* division number of document database */
32 #define OD_DOCSALIGN -4 /* alignment of document database */
33 #define OD_DOCSFBP 32 /* size of free block pool of document database */
34 #define OD_INDEXBNUM 32749 /* initial bucket number of inverted index */
35 #define OD_INDEXDNUM 7 /* division number of inverted index */
36 #define OD_INDEXALIGN -2 /* alignment of inverted index */
37 #define OD_INDEXFBP 32 /* size of free block pool of inverted index */
38 #define OD_RDOCSLRM 81 /* records in a leaf node of reverse dictionary */
39 #define OD_RDOCSNIM 192 /* records in a non-leaf node of reverse dictionary */
40 #define OD_RDOCSLCN 128 /* number of leaf cache of reverse dictionary */
41 #define OD_RDOCSNCN 32 /* number of non-leaf cache of reverse dictionary */
42 #define OD_CACHEBNUM 262139 /* number of buckets for dirty buffers */
43 #define OD_CACHESIZ 8388608 /* max bytes to use memory for dirty buffers */
44 #define OD_CFLIVERAT 0.8 /* ratio of usable cache region */
45 #define OD_CFBEGSIZ 2048 /* beginning size of flushing frequent words */
46 #define OD_CFENDSIZ 64 /* lower limit of flushing frequent words */
47 #define OD_CFRFRAT 0.2 /* ratio of flushing rare words a time */
48 #define OD_OTCBBUFSIZ 1024 /* size of a buffer for call back functions */
49 #define OD_OTPERWORDS 10000 /* frequency of call back in merging index */
50 #define OD_OTPERDOCS 1000 /* frequency of call back in merging docs */
51 #define OD_MDBRATIO 2.5 /* ratio of bucket number and document number */
52 #define OD_MIBRATIO 1.5 /* ratio of bucket number and word number */
53 #define OD_MIARATIO 0.75 /* ratio of alignment to the first words */
54 #define OD_MIWUNIT 32 /* writing unit of merging inverted index */
55 #define OD_DMAXEXPR "dmax" /* key of max number of the document ID */
56 #define OD_DNUMEXPR "dnum" /* key of number of the documents */
57 #define OD_URIEXPR "1" /* map key of URI */
58 #define OD_ATTRSEXPR "2" /* map key of attributes */
59 #define OD_NWORDSEXPR "3" /* map key of normal words */
60 #define OD_AWORDSEXPR "4" /* map key of as-is words */
61 #define OD_WTOPRATE 0.1 /* ratio of top words */
62 #define OD_WTOPBONUS 5000 /* bonus points of top words */
63 #define OD_KEYCRATIO 1.75 /* ratio of number to max of keyword candidates */
64 #define OD_WOCCRPOINT 10000 /* points per occurence */
65 #define OD_SPACECHARS "\t\n\v\f\r " /* space characters */
66 #define OD_DELIMCHARS "!\"#$%&'()*/<=>?[\\]^`{|}~" /* delimiter characters */
67 #define OD_GLUECHARS "+,-.:;@" /* glueing characters */
68 #define OD_MAXWORDLEN 48 /* max length of a word */
69
70 typedef struct { /* type of structure for word counting */
71 const char *word; /* pointer to the word */
72 int num; /* frequency of the word */
73 } ODWORD;
74
75 enum { /* enumeration for events binded to each character */
76 OD_EVWORD, /* word */
77 OD_EVSPACE, /* space */
78 OD_EVDELIM, /* delimiter */
79 OD_EVGLUE /* glue */
80 };
81
82
83 /* private global variables */
84 int odindexbnum = OD_INDEXBNUM;
85 int odindexdnum = OD_INDEXDNUM;
86 int odcachebnum = OD_CACHEBNUM;
87 int odcachesiz = OD_CACHESIZ;
88 void (*odotcb)(const char *, ODEUM *, const char *) = NULL;
89
90
91 /* private function prototypes */
92 static ODEUM *odopendb(const char *name, int omode, int docsbnum, int indexbnum,
93 const char *fname);
94 static int odcacheflush(ODEUM *odeum, const char *fname);
95 static int odcacheflushfreq(ODEUM *odeum, const char *fname, int min);
96 static int odcacheflushrare(ODEUM *odeum, const char *fname, double ratio);
97 static int odsortindex(ODEUM *odeum, const char *fname);
98 static int odsortcompare(const void *a, const void *b);
99 static int odpurgeindex(ODEUM *odeum, const char *fname);
100 static CBMAP *odpairsmap(const ODPAIR *pairs, int num);
101 static int odwordcompare(const void *a, const void *b);
102 static int odmatchoperator(ODEUM *odeum, CBLIST *tokens);
103 static ODPAIR *odparsesubexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np,
104 CBLIST *errors);
105 static ODPAIR *odparseexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np,
106 CBLIST *errors);
107 static void odfixtokens(ODEUM *odeum, CBLIST *tokens);
108 static void odcleannormalized(ODEUM *odeum, CBLIST *nwords);
109
110
111
112 /*************************************************************************************************
113 * public objects
114 *************************************************************************************************/
115
116
117 /* Get a database handle. */
odopen(const char * name,int omode)118 ODEUM *odopen(const char *name, int omode){
119 assert(name);
120 return odopendb(name, omode, OD_DOCSBNUM, odindexbnum, "odopen");
121 }
122
123
124 /* Close a database handle. */
odclose(ODEUM * odeum)125 int odclose(ODEUM *odeum){
126 char numbuf[OD_NUMBUFSIZ];
127 int err;
128 assert(odeum);
129 err = FALSE;
130 if(odotcb) odotcb("odclose", odeum, "closing the connection");
131 if(odeum->wmode){
132 if(odotcb) odotcb("odclose", odeum, "writing meta information");
133 sprintf(numbuf, "%d", odeum->dmax);
134 if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)) err = TRUE;
135 sprintf(numbuf, "%d", odeum->dnum);
136 if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)) err = TRUE;
137 if(!odcacheflushfreq(odeum, "odclose", OD_CFENDSIZ)) err = TRUE;
138 if(!odcacheflushrare(odeum, "odclose", OD_CFRFRAT)) err = TRUE;
139 if(!odcacheflush(odeum, "odclose")) err = TRUE;
140 if(!odsortindex(odeum, "odclose")) err = TRUE;
141 cbmapclose(odeum->cachemap);
142 cbmapclose(odeum->sortmap);
143 }
144 if(!vlclose(odeum->rdocsdb)) err = TRUE;
145 if(!crclose(odeum->indexdb)) err = TRUE;
146 if(!crclose(odeum->docsdb)) err = TRUE;
147 free(odeum->name);
148 free(odeum);
149 return err ? FALSE : TRUE;
150 }
151
152
153 /* Store a document. */
odput(ODEUM * odeum,ODDOC * doc,int wmax,int over)154 int odput(ODEUM *odeum, ODDOC *doc, int wmax, int over){
155 char *tmp, *zbuf;
156 const char *word, *ctmp;
157 int i, docid, tsiz, wsiz, wnum, tmax, num, zsiz;
158 double ival;
159 ODPAIR pair;
160 CBMAP *map;
161 CBLIST *tlist;
162 assert(odeum);
163 if(odeum->fatal){
164 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
165 return FALSE;
166 }
167 if(!odeum->wmode){
168 dpecodeset(DP_EMODE, __FILE__, __LINE__);
169 return FALSE;
170 }
171 if((tmp = vlget(odeum->rdocsdb, doc->uri, -1, &tsiz)) != NULL){
172 if(!over){
173 free(tmp);
174 dpecodeset(DP_EKEEP, __FILE__, __LINE__);
175 return FALSE;
176 }
177 if(tsiz != sizeof(int) || !odoutbyid(odeum, *(int *)tmp)){
178 free(tmp);
179 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
180 odeum->fatal = TRUE;
181 return FALSE;
182 }
183 free(tmp);
184 }
185 odeum->dmax++;
186 odeum->dnum++;
187 docid = odeum->dmax;
188 map = cbmapopen();
189 cbmapput(map, OD_URIEXPR, sizeof(OD_URIEXPR), doc->uri, -1, TRUE);
190 tmp = cbmapdump(doc->attrs, &tsiz);
191 cbmapput(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), tmp, tsiz, TRUE);
192 free(tmp);
193 if(wmax < 0 || wmax > cblistnum(doc->nwords)) wmax = cblistnum(doc->nwords);
194 tlist = cblistopen();
195 for(i = 0; i < wmax; i++){
196 ctmp = cblistval(doc->nwords, i, &wsiz);
197 cblistpush(tlist, ctmp, wsiz);
198 }
199 tmp = cblistdump(tlist, &tsiz);
200 cbmapput(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), tmp, tsiz, TRUE);
201 free(tmp);
202 cblistclose(tlist);
203 tlist = cblistopen();
204 for(i = 0; i < wmax; i++){
205 ctmp = cblistval(doc->awords, i, &wsiz);
206 if(strcmp(ctmp, cblistval(doc->nwords, i, NULL))){
207 cblistpush(tlist, ctmp, wsiz);
208 } else {
209 cblistpush(tlist, "\0", 1);
210 }
211 }
212 tmp = cblistdump(tlist, &tsiz);
213 cbmapput(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), tmp, tsiz, TRUE);
214 free(tmp);
215 cblistclose(tlist);
216 tmp = cbmapdump(map, &tsiz);
217 cbmapclose(map);
218 if(_qdbm_deflate){
219 if(!(zbuf = _qdbm_deflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){
220 free(tmp);
221 dpecodeset(DP_EMISC, __FILE__, __LINE__);
222 odeum->fatal = TRUE;
223 return FALSE;
224 }
225 free(tmp);
226 tmp = zbuf;
227 tsiz = zsiz;
228 }
229 if(!crput(odeum->docsdb, (char *)&docid, sizeof(int), tmp, tsiz, CR_DKEEP)){
230 free(tmp);
231 if(dpecode == DP_EKEEP) dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
232 odeum->fatal = TRUE;
233 return FALSE;
234 }
235 free(tmp);
236 if(!vlput(odeum->rdocsdb, doc->uri, -1, (char *)&docid, sizeof(int), VL_DOVER)){
237 odeum->fatal = TRUE;
238 return FALSE;
239 }
240 map = cbmapopen();
241 wnum = cblistnum(doc->nwords);
242 tmax = (int)(wnum * OD_WTOPRATE);
243 for(i = 0; i < wnum; i++){
244 word = cblistval(doc->nwords, i, &wsiz);
245 if(wsiz < 1) continue;
246 if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){
247 num = *(int *)ctmp + OD_WOCCRPOINT;
248 } else {
249 num = i <= tmax ? OD_WTOPBONUS + OD_WOCCRPOINT : OD_WOCCRPOINT;
250 }
251 cbmapput(map, word, wsiz, (char *)&num, sizeof(int), TRUE);
252 }
253 ival = odlogarithm(wnum);
254 ival = (ival * ival * ival) / 8.0;
255 if(ival < 8.0) ival = 8.0;
256 cbmapiterinit(map);
257 while((word = cbmapiternext(map, &wsiz)) != NULL){
258 pair.id = docid;
259 pair.score = (int)(*(int *)cbmapget(map, word, wsiz, NULL) / ival);
260 cbmapputcat(odeum->cachemap, word, wsiz, (char *)&pair, sizeof(pair));
261 cbmapmove(odeum->cachemap, word, wsiz, FALSE);
262 odeum->cacheasiz += sizeof(pair);
263 cbmapput(odeum->sortmap, word, wsiz, "", 0, FALSE);
264 }
265 cbmapclose(map);
266 if(odeum->cacheasiz > odcachesiz){
267 for(i = OD_CFBEGSIZ; odeum->cacheasiz > odcachesiz * OD_CFLIVERAT && i >= OD_CFENDSIZ;
268 i /= 2){
269 if(!odcacheflushfreq(odeum, "odput", i)) return FALSE;
270 }
271 while(odeum->cacheasiz > odcachesiz * OD_CFLIVERAT){
272 if(!odcacheflushrare(odeum, "odput", OD_CFRFRAT)) return FALSE;
273 }
274 }
275 doc->id = docid;
276 odeum->ldid = docid;
277 return TRUE;
278 }
279
280
281 /* Delete a document by a URL. */
odout(ODEUM * odeum,const char * uri)282 int odout(ODEUM *odeum, const char *uri){
283 char *tmp;
284 int tsiz, docid;
285 assert(odeum && uri);
286 if(odeum->fatal){
287 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
288 return FALSE;
289 }
290 if(!odeum->wmode){
291 dpecodeset(DP_EMODE, __FILE__, __LINE__);
292 return FALSE;
293 }
294 if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){
295 if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
296 return FALSE;
297 }
298 if(tsiz != sizeof(int)){
299 free(tmp);
300 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
301 odeum->fatal = TRUE;
302 return FALSE;
303 }
304 docid = *(int *)tmp;
305 free(tmp);
306 return odoutbyid(odeum, docid);
307 }
308
309
310 /* Delete a document specified by an ID number. */
odoutbyid(ODEUM * odeum,int id)311 int odoutbyid(ODEUM *odeum, int id){
312 char *tmp, *zbuf;
313 const char *uritmp;
314 int tsiz, uritsiz, zsiz;
315 CBMAP *map;
316 assert(odeum && id > 0);
317 if(odeum->fatal){
318 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
319 return FALSE;
320 }
321 if(!odeum->wmode){
322 dpecodeset(DP_EMODE, __FILE__, __LINE__);
323 return FALSE;
324 }
325 if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){
326 if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
327 return FALSE;
328 }
329 if(_qdbm_inflate){
330 if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){
331 free(tmp);
332 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
333 odeum->fatal = TRUE;
334 return FALSE;
335 }
336 free(tmp);
337 tmp = zbuf;
338 tsiz = zsiz;
339 }
340 map = cbmapload(tmp, tsiz);
341 free(tmp);
342 uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz);
343 if(!uritmp || !vlout(odeum->rdocsdb, uritmp, uritsiz)){
344 cbmapclose(map);
345 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
346 odeum->fatal = TRUE;
347 return FALSE;
348 }
349 cbmapclose(map);
350 if(!crout(odeum->docsdb, (char *)&id, sizeof(int))){
351 odeum->fatal = TRUE;
352 return FALSE;
353 }
354 odeum->dnum--;
355 return TRUE;
356 }
357
358
359 /* Retrieve a document by a URI. */
odget(ODEUM * odeum,const char * uri)360 ODDOC *odget(ODEUM *odeum, const char *uri){
361 char *tmp;
362 int tsiz, docid;
363 assert(odeum && uri);
364 if(odeum->fatal){
365 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
366 return NULL;
367 }
368 if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){
369 if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
370 return NULL;
371 }
372 if(tsiz != sizeof(int)){
373 free(tmp);
374 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
375 odeum->fatal = TRUE;
376 return NULL;
377 }
378 docid = *(int *)tmp;
379 free(tmp);
380 return odgetbyid(odeum, docid);
381 }
382
383
384 /* Retrieve a document by an ID number. */
odgetbyid(ODEUM * odeum,int id)385 ODDOC *odgetbyid(ODEUM *odeum, int id){
386 char *tmp, *zbuf;
387 const char *uritmp, *attrstmp, *nwordstmp, *awordstmp, *asis, *normal;
388 int i, tsiz, uritsiz, attrstsiz, nwordstsiz, awordstsiz, zsiz, asiz, nsiz;
389 ODDOC *doc;
390 CBMAP *map;
391 assert(odeum);
392 if(odeum->fatal){
393 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
394 return NULL;
395 }
396 if(id < 1){
397 dpecodeset(DP_ENOITEM, __FILE__, __LINE__);
398 return NULL;
399 }
400 if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){
401 if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
402 return NULL;
403 }
404 if(_qdbm_inflate){
405 if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){
406 free(tmp);
407 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
408 odeum->fatal = TRUE;
409 return NULL;
410 }
411 free(tmp);
412 tmp = zbuf;
413 tsiz = zsiz;
414 }
415 map = cbmapload(tmp, tsiz);
416 free(tmp);
417 uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz);
418 attrstmp = cbmapget(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), &attrstsiz);
419 nwordstmp = cbmapget(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), &nwordstsiz);
420 awordstmp = cbmapget(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), &awordstsiz);
421 if(!uritmp || !attrstmp || !nwordstmp || !awordstmp){
422 cbmapclose(map);
423 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
424 odeum->fatal = TRUE;
425 return NULL;
426 }
427 doc = cbmalloc(sizeof(ODDOC));
428 doc->id = id;
429 doc->uri = cbmemdup(uritmp, uritsiz);
430 doc->attrs = cbmapload(attrstmp, attrstsiz);
431 doc->nwords = cblistload(nwordstmp, nwordstsiz);
432 doc->awords = cblistload(awordstmp, awordstsiz);
433 cbmapclose(map);
434 for(i = 0; i < cblistnum(doc->awords); i++){
435 asis = cblistval(doc->awords, i, &asiz);
436 if(asiz == 1 && asis[0] == '\0'){
437 normal = cblistval(doc->nwords, i, &nsiz);
438 cblistover(doc->awords, i, normal, nsiz);
439 }
440 }
441 return doc;
442 }
443
444
445 /* Retrieve the ID of the document specified by a URI. */
odgetidbyuri(ODEUM * odeum,const char * uri)446 int odgetidbyuri(ODEUM *odeum, const char *uri){
447 char *tmp;
448 int tsiz, docid;
449 assert(odeum && uri);
450 if(odeum->fatal){
451 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
452 return -1;
453 }
454 if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){
455 if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
456 return -1;
457 }
458 if(tsiz != sizeof(int)){
459 free(tmp);
460 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
461 odeum->fatal = TRUE;
462 return -1;
463 }
464 docid = *(int *)tmp;
465 free(tmp);
466 return docid;
467 }
468
469
470 /* Check whether the document specified by an ID number exists. */
odcheck(ODEUM * odeum,int id)471 int odcheck(ODEUM *odeum, int id){
472 assert(odeum);
473 if(odeum->fatal){
474 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
475 return FALSE;
476 }
477 if(id < 1){
478 dpecodeset(DP_ENOITEM, __FILE__, __LINE__);
479 return FALSE;
480 }
481 return crvsiz(odeum->docsdb, (char *)&id, sizeof(int)) != -1;
482 }
483
484
485 /* Search the inverted index for documents including a word. */
odsearch(ODEUM * odeum,const char * word,int max,int * np)486 ODPAIR *odsearch(ODEUM *odeum, const char *word, int max, int *np){
487 char *tmp;
488 int tsiz;
489 assert(odeum && word && np);
490 if(odeum->fatal){
491 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
492 return NULL;
493 }
494 if(odeum->wmode && cbmaprnum(odeum->sortmap) > 0 &&
495 (!odcacheflush(odeum, "odsearch") || !odsortindex(odeum, "odsearch"))){
496 odeum->fatal = TRUE;
497 return NULL;
498 }
499 max = max < 0 ? -1 : max * sizeof(ODPAIR);
500 if(!(tmp = crget(odeum->indexdb, word, -1, 0, max, &tsiz))){
501 if(dpecode != DP_ENOITEM){
502 odeum->fatal = TRUE;
503 return NULL;
504 }
505 *np = 0;
506 return cbmalloc(1);
507 }
508 *np = tsiz / sizeof(ODPAIR);
509 return (ODPAIR *)tmp;
510 }
511
512
513 /* Get the number of documents including a word. */
odsearchdnum(ODEUM * odeum,const char * word)514 int odsearchdnum(ODEUM *odeum, const char *word){
515 int rv;
516 assert(odeum && word);
517 if(odeum->fatal){
518 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
519 return -1;
520 }
521 rv = crvsiz(odeum->indexdb, word, -1);
522 return rv < 0 ? -1 : rv / sizeof(ODPAIR);
523 }
524
525
526 /* Initialize the iterator of a database handle. */
oditerinit(ODEUM * odeum)527 int oditerinit(ODEUM *odeum){
528 assert(odeum);
529 if(odeum->fatal){
530 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
531 return FALSE;
532 }
533 return criterinit(odeum->docsdb);
534 }
535
536
537 /* Get the next key of the iterator. */
oditernext(ODEUM * odeum)538 ODDOC *oditernext(ODEUM *odeum){
539 char *tmp;
540 int tsiz, docsid;
541 ODDOC *doc;
542 assert(odeum);
543 if(odeum->fatal){
544 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
545 return NULL;
546 }
547 doc = NULL;
548 while(TRUE){
549 if(!(tmp = criternext(odeum->docsdb, &tsiz))){
550 if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
551 return NULL;
552 }
553 if(tsiz != sizeof(int)){
554 free(tmp);
555 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
556 odeum->fatal = TRUE;
557 return NULL;
558 }
559 docsid = *(int *)tmp;
560 free(tmp);
561 if((doc = odgetbyid(odeum, docsid)) != NULL) break;
562 if(dpecode != DP_ENOITEM){
563 odeum->fatal = TRUE;
564 return NULL;
565 }
566 }
567 return doc;
568 }
569
570
571 /* Synchronize updating contents with the files and the devices. */
odsync(ODEUM * odeum)572 int odsync(ODEUM *odeum){
573 char numbuf[OD_NUMBUFSIZ];
574 assert(odeum);
575 if(odeum->fatal){
576 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
577 return FALSE;
578 }
579 if(!odeum->wmode){
580 dpecodeset(DP_EMODE, __FILE__, __LINE__);
581 return FALSE;
582 }
583 if(odotcb) odotcb("odsync", odeum, "writing meta information");
584 sprintf(numbuf, "%d", odeum->dmax);
585 if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)){
586 odeum->fatal = TRUE;
587 return FALSE;
588 }
589 sprintf(numbuf, "%d", odeum->dnum);
590 if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)){
591 odeum->fatal = TRUE;
592 return FALSE;
593 }
594 if(!odcacheflush(odeum, "odsync")){
595 odeum->fatal = TRUE;
596 return FALSE;
597 }
598 if(!odsortindex(odeum, "odsync")){
599 odeum->fatal = TRUE;
600 return FALSE;
601 }
602 if(odotcb) odotcb("odsync", odeum, "synchronizing the document database");
603 if(!crsync(odeum->docsdb)){
604 odeum->fatal = TRUE;
605 return FALSE;
606 }
607 if(odotcb) odotcb("odsync", odeum, "synchronizing the inverted index");
608 if(!crsync(odeum->indexdb)){
609 odeum->fatal = TRUE;
610 return FALSE;
611 }
612 if(odotcb) odotcb("odsync", odeum, "synchronizing the reverse dictionary");
613 if(!vlsync(odeum->rdocsdb)){
614 odeum->fatal = TRUE;
615 return FALSE;
616 }
617 return TRUE;
618 }
619
620
621 /* Optimize a database. */
odoptimize(ODEUM * odeum)622 int odoptimize(ODEUM *odeum){
623 assert(odeum);
624 if(odeum->fatal){
625 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
626 return FALSE;
627 }
628 if(!odeum->wmode){
629 dpecodeset(DP_EMODE, __FILE__, __LINE__);
630 return FALSE;
631 }
632 if(!odcacheflush(odeum, "odoptimize")){
633 odeum->fatal = TRUE;
634 return FALSE;
635 }
636 if(odeum->ldid < 1 || odeum->ldid != odeum->dnum){
637 if(!odpurgeindex(odeum, "odoptimize")){
638 odeum->fatal = TRUE;
639 return FALSE;
640 }
641 }
642 if(odeum->ldid > 0){
643 if(!odsortindex(odeum, "odoptimize")){
644 odeum->fatal = TRUE;
645 return FALSE;
646 }
647 }
648 if(odotcb) odotcb("odoptimize", odeum, "optimizing the document database");
649 if(!croptimize(odeum->docsdb, -1)){
650 odeum->fatal = TRUE;
651 return FALSE;
652 }
653 if(odotcb) odotcb("odoptimize", odeum, "optimizing the inverted index");
654 if(!croptimize(odeum->indexdb, -1)){
655 odeum->fatal = TRUE;
656 return FALSE;
657 }
658 if(odotcb) odotcb("odoptimize", odeum, "optimizing the reverse dictionary");
659 if(!vloptimize(odeum->rdocsdb)){
660 odeum->fatal = TRUE;
661 return FALSE;
662 }
663 return TRUE;
664 }
665
666
667 /* Get the name of a database. */
odname(ODEUM * odeum)668 char *odname(ODEUM *odeum){
669 assert(odeum);
670 if(odeum->fatal){
671 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
672 return NULL;
673 }
674 return cbmemdup(odeum->name, -1);
675 }
676
677
678 /* Get the total size of database files. */
odfsiz(ODEUM * odeum)679 double odfsiz(ODEUM *odeum){
680 double fsiz, rv;
681 assert(odeum);
682 if(odeum->fatal){
683 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
684 return -1;
685 }
686 fsiz = 0;
687 if((rv = crfsizd(odeum->docsdb)) < 0) return -1.0;
688 fsiz += rv;
689 if((rv = crfsizd(odeum->indexdb)) < 0) return -1.0;
690 fsiz += rv;
691 if((rv = vlfsiz(odeum->rdocsdb)) == -1) return -1.0;
692 fsiz += rv;
693 return fsiz;
694 }
695
696
697 /* Get the total number of the elements of the bucket arrays for the inverted index. */
odbnum(ODEUM * odeum)698 int odbnum(ODEUM *odeum){
699 assert(odeum);
700 if(odeum->fatal){
701 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
702 return -1;
703 }
704 return crbnum(odeum->indexdb);
705 }
706
707
708 /* Get the total number of the used elements of the bucket arrays in the inverted index. */
odbusenum(ODEUM * odeum)709 int odbusenum(ODEUM *odeum){
710 assert(odeum);
711 if(odeum->fatal){
712 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
713 return -1;
714 }
715 return crbusenum(odeum->indexdb);
716 }
717
718
719 /* Get the number of the documents stored in a database. */
oddnum(ODEUM * odeum)720 int oddnum(ODEUM *odeum){
721 assert(odeum);
722 if(odeum->fatal){
723 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
724 return -1;
725 }
726 return odeum->dnum;
727 }
728
729
730 /* Get the number of the words stored in a database. */
odwnum(ODEUM * odeum)731 int odwnum(ODEUM *odeum){
732 assert(odeum);
733 if(odeum->fatal){
734 dpecodeset(DP_EFATAL, __FILE__, __LINE__);
735 return -1;
736 }
737 return crrnum(odeum->indexdb);
738 }
739
740
741 /* Check whether a database handle is a writer or not. */
odwritable(ODEUM * odeum)742 int odwritable(ODEUM *odeum){
743 assert(odeum);
744 return odeum->wmode;
745 }
746
747
748 /* Check whether a database has a fatal error or not. */
odfatalerror(ODEUM * odeum)749 int odfatalerror(ODEUM *odeum){
750 assert(odeum);
751 return odeum->fatal;
752 }
753
754
755 /* Get the inode number of a database directory. */
odinode(ODEUM * odeum)756 int odinode(ODEUM *odeum){
757 assert(odeum);
758 return odeum->inode;
759 }
760
761
762 /* Get the last modified time of a database. */
odmtime(ODEUM * odeum)763 time_t odmtime(ODEUM *odeum){
764 assert(odeum);
765 return crmtime(odeum->indexdb);
766 }
767
768
769 /* Merge plural database directories. */
odmerge(const char * name,const CBLIST * elemnames)770 int odmerge(const char *name, const CBLIST *elemnames){
771 ODEUM *odeum, **elems;
772 CURIA *curia, *ecuria;
773 VILLA *villa, *evilla;
774 ODPAIR *pairs;
775 char *word, *kbuf, *vbuf, *dbuf, otmsg[OD_OTCBBUFSIZ];
776 char *wpunit[OD_MIWUNIT], *vpunit[OD_MIWUNIT];
777 int i, j, k, num, dnum, wnum, dbnum, ibnum, tnum, wsunit[OD_MIWUNIT], vsunit[OD_MIWUNIT];
778 int err, *bases, sum, max, wsiz, ksiz, vsiz, uend, unum, pnum, align, id, nid, dsiz;
779 assert(name && elemnames);
780 num = cblistnum(elemnames);
781 elems = cbmalloc(num * sizeof(ODEUM *) + 1);
782 dnum = 0;
783 wnum = 0;
784 for(i = 0; i < num; i++){
785 if(!(elems[i] = odopen(cblistval(elemnames, i, NULL), OD_OREADER))){
786 for(i -= 1; i >= 0; i--){
787 odclose(elems[i]);
788 }
789 free(elems);
790 return FALSE;
791 }
792 dnum += oddnum(elems[i]);
793 wnum += odwnum(elems[i]);
794 }
795 dbnum = (int)(dnum * OD_MDBRATIO / OD_DOCSDNUM);
796 ibnum = (int)(wnum * OD_MIBRATIO / odindexdnum);
797 if(!(odeum = odopendb(name, OD_OWRITER | OD_OCREAT | OD_OTRUNC, dbnum, ibnum, "odmerge"))){
798 for(i = 0; i < num; i++){
799 odclose(elems[i]);
800 }
801 free(elems);
802 return FALSE;
803 }
804 err = FALSE;
805 if(odotcb) odotcb("odmerge", odeum, "calculating the base ID numbers");
806 bases = cbmalloc(num * sizeof(int) + 1);
807 sum = 0;
808 for(i = 0; i < num; i++){
809 ecuria = elems[i]->docsdb;
810 max = 0;
811 if(!criterinit(ecuria) && dpecode != DP_ENOITEM) err = TRUE;
812 while((kbuf = criternext(ecuria, &ksiz)) != NULL){
813 if(ksiz == sizeof(int)){
814 if(*(int *)kbuf > max) max = *(int *)kbuf;
815 }
816 free(kbuf);
817 }
818 bases[i] = sum;
819 sum += max;
820 }
821 curia = odeum->indexdb;
822 for(i = 0; i < num; i++){
823 if(odotcb){
824 sprintf(otmsg, "merging the inverted index (%d/%d)", i + 1, num);
825 odotcb("odmerge", odeum, otmsg);
826 }
827 ecuria = elems[i]->indexdb;
828 tnum = 0;
829 uend = FALSE;
830 if(!criterinit(ecuria) && dpecode != DP_ENOITEM) err = TRUE;
831 while(!uend){
832 for(unum = 0; unum < OD_MIWUNIT; unum++){
833 if(!(word = criternext(ecuria, &wsiz))){
834 uend = TRUE;
835 break;
836 }
837 if(!(vbuf = crget(ecuria, word, wsiz, 0, -1, &vsiz))){
838 err = TRUE;
839 free(word);
840 break;
841 }
842 wpunit[unum] = word;
843 wsunit[unum] = wsiz;
844 vpunit[unum] = vbuf;
845 vsunit[unum] = vsiz;
846 }
847 for(j = 0; j < unum; j++){
848 word = wpunit[j];
849 wsiz = wsunit[j];
850 vbuf = vpunit[j];
851 vsiz = vsunit[j];
852 pairs = (ODPAIR *)vbuf;
853 pnum = vsiz / sizeof(ODPAIR);
854 for(k = 0; k < pnum; k++){
855 pairs[k].id += bases[i];
856 }
857 align = (int)(i < num - 1 ? vsiz * (num - i) * OD_MIARATIO : OD_INDEXALIGN);
858 if(!crsetalign(curia, align)) err = TRUE;
859 if(!crput(curia, word, wsiz, vbuf, vsiz, CR_DCAT)) err = TRUE;
860 free(vbuf);
861 free(word);
862 if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){
863 sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(ecuria));
864 odotcb("odmerge", odeum, otmsg);
865 }
866 tnum++;
867 }
868 }
869 }
870 if(odotcb) odotcb("odmerge", odeum, "sorting the inverted index");
871 tnum = 0;
872 if(!criterinit(curia) && dpecode != DP_ENOITEM) err = TRUE;
873 while((word = criternext(curia, &wsiz)) != NULL){
874 if((vbuf = crget(curia, word, wsiz, 0, -1, &vsiz)) != NULL){
875 if(vsiz > sizeof(ODPAIR)){
876 pairs = (ODPAIR *)vbuf;
877 pnum = vsiz / sizeof(ODPAIR);
878 qsort(pairs, pnum, sizeof(ODPAIR), odsortcompare);
879 if(!crput(curia, word, wsiz, vbuf, vsiz, CR_DOVER)) err = TRUE;
880 }
881 free(vbuf);
882 }
883 free(word);
884 if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){
885 sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(curia));
886 odotcb("odmerge", odeum, otmsg);
887 }
888 tnum++;
889 }
890 if(odotcb) odotcb("odmerge", odeum, "synchronizing the inverted index");
891 if(!crsync(curia)) err = TRUE;
892 dnum = 0;
893 curia = odeum->docsdb;
894 villa = odeum->rdocsdb;
895 for(i = 0; i < num; i++){
896 if(odotcb){
897 sprintf(otmsg, "merging the document database (%d/%d)", i + 1, num);
898 odotcb("odmerge", odeum, otmsg);
899 }
900 evilla = elems[i]->rdocsdb;
901 ecuria = elems[i]->docsdb;
902 tnum = 0;
903 if(!vlcurfirst(evilla) && dpecode != DP_ENOITEM) err = TRUE;
904 while(TRUE){
905 if(!(kbuf = vlcurkey(evilla, &ksiz))) break;
906 if((ksiz == sizeof(OD_DMAXEXPR) && !memcmp(kbuf, OD_DMAXEXPR, ksiz)) ||
907 (ksiz == sizeof(OD_DNUMEXPR) && !memcmp(kbuf, OD_DNUMEXPR, ksiz))){
908 free(kbuf);
909 if(!vlcurnext(evilla)) break;
910 continue;
911 }
912 if(!(vbuf = vlcurval(evilla, &vsiz))){
913 free(kbuf);
914 if(!vlcurnext(evilla)) break;
915 continue;
916 }
917 if(vsiz != sizeof(int)){
918 free(vbuf);
919 free(kbuf);
920 if(!vlcurnext(evilla)) break;
921 continue;
922 }
923 id = *(int *)vbuf;
924 nid = id + bases[i];
925 if(vlput(villa, kbuf, ksiz, (char *)&nid, sizeof(int), VL_DKEEP)){
926 if((dbuf = crget(ecuria, (char *)&id, sizeof(int), 0, -1, &dsiz)) != NULL){
927 if(crput(curia, (char *)&nid, sizeof(int), dbuf, dsiz, CR_DKEEP)){
928 dnum++;
929 } else {
930 err = TRUE;
931 }
932 free(dbuf);
933 } else {
934 err = TRUE;
935 }
936 } else if(dpecode != DP_EKEEP){
937 err = TRUE;
938 }
939 free(vbuf);
940 free(kbuf);
941 odeum->dnum++;
942 if(odotcb && (tnum + 1) % OD_OTPERDOCS == 0){
943 sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(ecuria));
944 odotcb("odmerge", odeum, otmsg);
945 }
946 tnum++;
947 if(!vlcurnext(evilla)) break;
948 }
949 }
950 odeum->dnum = dnum;
951 odeum->dmax = dnum;
952 free(bases);
953 if(odotcb) odotcb("odmerge", odeum, "synchronizing the document index");
954 if(!crsync(curia)) err = TRUE;
955 if(!odclose(odeum)) err = TRUE;
956 for(i = 0; i < num; i++){
957 if(!odclose(elems[i])) err = TRUE;
958 }
959 free(elems);
960 return err ? FALSE : TRUE;
961 }
962
963
964 /* Remove a database directory. */
odremove(const char * name)965 int odremove(const char *name){
966 char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ];
967 char path[OD_PATHBUFSIZ];
968 const char *file;
969 struct stat sbuf;
970 CBLIST *list;
971 int i;
972 assert(name);
973 sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME);
974 sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME);
975 sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME);
976 if(lstat(name, &sbuf) == -1){
977 dpecodeset(DP_ESTAT, __FILE__, __LINE__);
978 return FALSE;
979 }
980 if(lstat(docsname, &sbuf) != -1 && !crremove(docsname)) return FALSE;
981 if(lstat(indexname, &sbuf) != -1 && !crremove(indexname)) return FALSE;
982 if(lstat(rdocsname, &sbuf) != -1 && !vlremove(rdocsname)) return FALSE;
983 if((list = cbdirlist(name)) != NULL){
984 for(i = 0; i < cblistnum(list); i++){
985 file = cblistval(list, i, NULL);
986 if(!strcmp(file, MYCDIRSTR) || !strcmp(file, MYPDIRSTR)) continue;
987 sprintf(path, "%s%c%s", name, MYPATHCHR, file);
988 if(lstat(path, &sbuf) == -1) continue;
989 if(S_ISDIR(sbuf.st_mode)){
990 if(!crremove(path)) return FALSE;
991 } else {
992 if(!dpremove(path)) return FALSE;
993 }
994 }
995 cblistclose(list);
996 }
997 if(rmdir(name) == -1){
998 dpecodeset(DP_ERMDIR, __FILE__, __LINE__);
999 return FALSE;
1000 }
1001 return TRUE;
1002 }
1003
1004
1005 /* Get a document handle. */
oddocopen(const char * uri)1006 ODDOC *oddocopen(const char *uri){
1007 ODDOC *doc;
1008 assert(uri);
1009 doc = cbmalloc(sizeof(ODDOC));
1010 doc->id = -1;
1011 doc->uri = cbmemdup(uri, -1);
1012 doc->attrs = cbmapopenex(OD_MAPPBNUM);
1013 doc->nwords = cblistopen();
1014 doc->awords = cblistopen();
1015 return doc;
1016 }
1017
1018
1019 /* Close a document handle. */
oddocclose(ODDOC * doc)1020 void oddocclose(ODDOC *doc){
1021 assert(doc);
1022 cblistclose(doc->awords);
1023 cblistclose(doc->nwords);
1024 cbmapclose(doc->attrs);
1025 free(doc->uri);
1026 free(doc);
1027 }
1028
1029
1030 /* Add an attribute to a document. */
oddocaddattr(ODDOC * doc,const char * name,const char * value)1031 void oddocaddattr(ODDOC *doc, const char *name, const char *value){
1032 assert(doc && name && value);
1033 cbmapput(doc->attrs, name, -1, value, -1, TRUE);
1034 }
1035
1036
1037 /* Add a word to a document. */
oddocaddword(ODDOC * doc,const char * normal,const char * asis)1038 void oddocaddword(ODDOC *doc, const char *normal, const char *asis){
1039 assert(doc && normal && asis);
1040 cblistpush(doc->nwords, normal, -1);
1041 cblistpush(doc->awords, asis, -1);
1042 }
1043
1044
1045 /* Get the ID number of a document. */
oddocid(const ODDOC * doc)1046 int oddocid(const ODDOC *doc){
1047 assert(doc);
1048 return doc->id;
1049 }
1050
1051
1052 /* Get the URI of a document. */
oddocuri(const ODDOC * doc)1053 const char *oddocuri(const ODDOC *doc){
1054 assert(doc);
1055 return doc->uri;
1056 }
1057
1058
1059 /* Get the value of an attribute of a document. */
oddocgetattr(const ODDOC * doc,const char * name)1060 const char *oddocgetattr(const ODDOC *doc, const char *name){
1061 assert(doc && name);
1062 return cbmapget(doc->attrs, name, -1, NULL);
1063 }
1064
1065
1066 /* Get the list handle contains words in normalized form of a document. */
oddocnwords(const ODDOC * doc)1067 const CBLIST *oddocnwords(const ODDOC *doc){
1068 assert(doc);
1069 return doc->nwords;
1070 }
1071
1072
1073 /* Get the list handle contains words in appearance form of a document. */
oddocawords(const ODDOC * doc)1074 const CBLIST *oddocawords(const ODDOC *doc){
1075 assert(doc);
1076 return doc->awords;
1077 }
1078
1079
1080 /* Get the map handle contains keywords in normalized form and their scores. */
oddocscores(const ODDOC * doc,int max,ODEUM * odeum)1081 CBMAP *oddocscores(const ODDOC *doc, int max, ODEUM *odeum){
1082 const CBLIST *nwords;
1083 CBMAP *map, *kwmap;
1084 const char *word, *ctmp;
1085 char numbuf[OD_NUMBUFSIZ];
1086 ODWORD *owords;
1087 int i, wsiz, wnum, hnum, mnum, nbsiz;
1088 double ival;
1089 assert(doc && max >= 0);
1090 map = cbmapopen();
1091 nwords = oddocnwords(doc);
1092 for(i = 0; i < cblistnum(nwords); i++){
1093 word = cblistval(nwords, i, &wsiz);
1094 if(wsiz < 1) continue;
1095 if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){
1096 wnum = *(int *)ctmp + OD_WOCCRPOINT;
1097 } else {
1098 wnum = OD_WOCCRPOINT;
1099 }
1100 cbmapput(map, word, wsiz, (char *)&wnum, sizeof(int), TRUE);
1101 }
1102 mnum = cbmaprnum(map);
1103 owords = cbmalloc(mnum * sizeof(ODWORD) + 1);
1104 cbmapiterinit(map);
1105 for(i = 0; (word = cbmapiternext(map, &wsiz)) != NULL; i++){
1106 owords[i].word = word;
1107 owords[i].num = *(int *)cbmapget(map, word, wsiz, NULL);
1108 }
1109 qsort(owords, mnum, sizeof(ODWORD), odwordcompare);
1110 if(odeum){
1111 if(mnum > max * OD_KEYCRATIO) mnum = (int)(max * OD_KEYCRATIO);
1112 for(i = 0; i < mnum; i++){
1113 if((hnum = odsearchdnum(odeum, owords[i].word)) < 0) hnum = 0;
1114 ival = odlogarithm(hnum);
1115 ival = (ival * ival * ival) / 8.0;
1116 if(ival < 8.0) ival = 8.0;
1117 owords[i].num = (int)(owords[i].num / ival);
1118 }
1119 qsort(owords, mnum, sizeof(ODWORD), odwordcompare);
1120 }
1121 if(mnum > max) mnum = max;
1122 kwmap = cbmapopenex(OD_MAPPBNUM);
1123 for(i = 0; i < mnum; i++){
1124 nbsiz = sprintf(numbuf, "%d", owords[i].num);
1125 cbmapput(kwmap, owords[i].word, -1, numbuf, nbsiz, TRUE);
1126 }
1127 free(owords);
1128 cbmapclose(map);
1129 return kwmap;
1130 }
1131
1132
1133 /* Break a text into words in appearance form. */
odbreaktext(const char * text)1134 CBLIST *odbreaktext(const char *text){
1135 const char *word;
1136 CBLIST *elems, *words;
1137 int i, j, dif, wsiz, pv, delim;
1138 assert(text);
1139 words = cblistopen();
1140 elems = cbsplit(text, -1, OD_SPACECHARS);
1141 for(i = 0; i < cblistnum(elems); i++){
1142 word = cblistval(elems, i, &wsiz);
1143 delim = FALSE;
1144 j = 0;
1145 pv = 0;
1146 while(TRUE){
1147 dif = j - pv;
1148 if(j >= wsiz){
1149 if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv);
1150 break;
1151 }
1152 if(delim){
1153 if(!strchr(OD_DELIMCHARS, word[j])){
1154 if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv);
1155 pv = j;
1156 delim = FALSE;
1157 }
1158 } else {
1159 if(strchr(OD_DELIMCHARS, word[j])){
1160 if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv);
1161 pv = j;
1162 delim = TRUE;
1163 }
1164 }
1165 j++;
1166 }
1167 }
1168 cblistclose(elems);
1169 return words;
1170 }
1171
1172
1173 /* Make the normalized form of a word. */
odnormalizeword(const char * asis)1174 char *odnormalizeword(const char *asis){
1175 char *nword;
1176 int i;
1177 assert(asis);
1178 for(i = 0; asis[i] != '\0'; i++){
1179 if(!strchr(OD_DELIMCHARS, asis[i])) break;
1180 }
1181 if(asis[i] == '\0') return cbmemdup("", 0);
1182 nword = cbmemdup(asis, -1);
1183 for(i = 0; nword[i] != '\0'; i++){
1184 if(nword[i] >= 'A' && nword[i] <= 'Z') nword[i] += 'a' - 'A';
1185 }
1186 while(i >= 0){
1187 if(strchr(OD_GLUECHARS, nword[i])){
1188 nword[i] = '\0';
1189 } else {
1190 break;
1191 }
1192 i--;
1193 }
1194 return nword;
1195 }
1196
1197
1198 /* Get the common elements of two sets of documents. */
odpairsand(ODPAIR * apairs,int anum,ODPAIR * bpairs,int bnum,int * np)1199 ODPAIR *odpairsand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){
1200 CBMAP *map;
1201 ODPAIR *result;
1202 const char *tmp;
1203 int i, rnum;
1204 assert(apairs && anum >= 0 && bpairs && bnum >= 0);
1205 map = odpairsmap(bpairs, bnum);
1206 result = cbmalloc(sizeof(ODPAIR) * anum + 1);
1207 rnum = 0;
1208 for(i = 0; i < anum; i++){
1209 if(!(tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL))) continue;
1210 result[rnum].id = apairs[i].id;
1211 result[rnum].score = apairs[i].score + *(int *)tmp;
1212 rnum++;
1213 }
1214 cbmapclose(map);
1215 qsort(result, rnum, sizeof(ODPAIR), odsortcompare);
1216 *np = rnum;
1217 return result;
1218 }
1219
1220
1221 /* Get the sum of elements of two sets of documents. */
odpairsor(ODPAIR * apairs,int anum,ODPAIR * bpairs,int bnum,int * np)1222 ODPAIR *odpairsor(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){
1223 CBMAP *map;
1224 ODPAIR *result;
1225 const char *tmp;
1226 int i, score, rnum;
1227 assert(apairs && anum >= 0 && bpairs && bnum >= 0);
1228 map = odpairsmap(bpairs, bnum);
1229 for(i = 0; i < anum; i++){
1230 score = 0;
1231 if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL)
1232 score = *(int *)tmp;
1233 score += apairs[i].score;
1234 cbmapput(map, (char *)&(apairs[i].id), sizeof(int),
1235 (char *)&score, sizeof(int), TRUE);
1236 }
1237 rnum = cbmaprnum(map);
1238 result = cbmalloc(rnum * sizeof(ODPAIR) + 1);
1239 cbmapiterinit(map);
1240 for(i = 0; (tmp = cbmapiternext(map, NULL)) != NULL; i++){
1241 result[i].id = *(int *)tmp;
1242 result[i].score = *(int *)cbmapget(map, tmp, sizeof(int), NULL);
1243 }
1244 cbmapclose(map);
1245 qsort(result, rnum, sizeof(ODPAIR), odsortcompare);
1246 *np = rnum;
1247 return result;
1248 }
1249
1250
1251 /* Get the difference set of documents. */
odpairsnotand(ODPAIR * apairs,int anum,ODPAIR * bpairs,int bnum,int * np)1252 ODPAIR *odpairsnotand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){
1253 CBMAP *map;
1254 ODPAIR *result;
1255 const char *tmp;
1256 int i, rnum;
1257 assert(apairs && anum >= 0 && bpairs && bnum >= 0);
1258 map = odpairsmap(bpairs, bnum);
1259 result = cbmalloc(sizeof(ODPAIR) * anum + 1);
1260 rnum = 0;
1261 for(i = 0; i < anum; i++){
1262 if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) continue;
1263 result[rnum].id = apairs[i].id;
1264 result[rnum].score = apairs[i].score;
1265 rnum++;
1266 }
1267 cbmapclose(map);
1268 qsort(result, rnum, sizeof(ODPAIR), odsortcompare);
1269 *np = rnum;
1270 return result;
1271 }
1272
1273
1274 /* Sort a set of documents in descending order of scores. */
odpairssort(ODPAIR * pairs,int pnum)1275 void odpairssort(ODPAIR *pairs, int pnum){
1276 assert(pairs && pnum >= 0);
1277 qsort(pairs, pnum, sizeof(ODPAIR), odsortcompare);
1278 }
1279
1280
1281 /* Get the natural logarithm of a number. */
odlogarithm(double x)1282 double odlogarithm(double x){
1283 int i;
1284 if(x <= 1.0) return 0.0;
1285 x = x * x * x * x * x * x * x * x * x * x;
1286 for(i = 0; x > 1.0; i++){
1287 x /= 2.718281828459;
1288 }
1289 return (double)i / 10.0;
1290 }
1291
1292
1293 /* Get the cosine of the angle of two vectors. */
odvectorcosine(const int * avec,const int * bvec,int vnum)1294 double odvectorcosine(const int *avec, const int *bvec, int vnum){
1295 double rv;
1296 assert(avec && bvec && vnum >= 0);
1297 rv = odvecinnerproduct(avec, bvec, vnum) /
1298 ((odvecabsolute(avec, vnum) * odvecabsolute(bvec, vnum)));
1299 return rv > 0.0 ? rv : 0.0;
1300 }
1301
1302
1303 /* Set the global tuning parameters. */
odsettuning(int ibnum,int idnum,int cbnum,int csiz)1304 void odsettuning(int ibnum, int idnum, int cbnum, int csiz){
1305 if(ibnum > 0) odindexbnum = ibnum;
1306 if(idnum > 0) odindexdnum = idnum;
1307 if(cbnum > 0) odcachebnum = dpprimenum(cbnum);
1308 if(csiz > 0) odcachesiz = csiz;
1309 }
1310
1311
1312 /* Break a text into words and store appearance forms and normalized form into lists. */
odanalyzetext(ODEUM * odeum,const char * text,CBLIST * awords,CBLIST * nwords)1313 void odanalyzetext(ODEUM *odeum, const char *text, CBLIST *awords, CBLIST *nwords){
1314 char aword[OD_MAXWORDLEN+1], *wp;
1315 int lev, wsiz;
1316 assert(odeum && text && awords);
1317 lev = OD_EVSPACE;
1318 wsiz = 0;
1319 for(; *text != '\0'; text++){
1320 switch(odeum->statechars[*(unsigned char *)text]){
1321 case OD_EVWORD:
1322 if(wsiz > 0 && lev == OD_EVDELIM){
1323 cblistpush(awords, aword, wsiz);
1324 if(nwords) cblistpush(nwords, "", 0);
1325 wsiz = 0;
1326 }
1327 if(wsiz <= OD_MAXWORDLEN){
1328 aword[wsiz++] = *text;
1329 }
1330 lev = OD_EVWORD;
1331 break;
1332 case OD_EVGLUE:
1333 if(wsiz > 0 && lev == OD_EVDELIM){
1334 cblistpush(awords, aword, wsiz);
1335 if(nwords) cblistpush(nwords, "", 0);
1336 wsiz = 0;
1337 }
1338 if(wsiz <= OD_MAXWORDLEN){
1339 aword[wsiz++] = *text;
1340 }
1341 lev = OD_EVGLUE;
1342 break;
1343 case OD_EVDELIM:
1344 if(wsiz > 0 && lev != OD_EVDELIM){
1345 cblistpush(awords, aword, wsiz);
1346 if(nwords){
1347 wp = aword;
1348 aword[wsiz] = '\0';
1349 while(*wp != '\0'){
1350 if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A';
1351 wp++;
1352 }
1353 wp--;
1354 while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){
1355 wsiz--;
1356 wp--;
1357 }
1358 cblistpush(nwords, aword, wsiz);
1359 }
1360 wsiz = 0;
1361 }
1362 if(wsiz <= OD_MAXWORDLEN){
1363 aword[wsiz++] = *text;
1364 }
1365 lev = OD_EVDELIM;
1366 break;
1367 default:
1368 if(wsiz > 0){
1369 cblistpush(awords, aword, wsiz);
1370 if(nwords){
1371 if(lev == OD_EVDELIM){
1372 cblistpush(nwords, "", 0);
1373 } else {
1374 wp = aword;
1375 aword[wsiz] = '\0';
1376 while(*wp != '\0'){
1377 if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A';
1378 wp++;
1379 }
1380 wp--;
1381 while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){
1382 wsiz--;
1383 wp--;
1384 }
1385 cblistpush(nwords, aword, wsiz);
1386 }
1387 }
1388 wsiz = 0;
1389 }
1390 lev = OD_EVSPACE;
1391 break;
1392 }
1393 }
1394 if(wsiz > 0){
1395 cblistpush(awords, aword, wsiz);
1396 if(nwords){
1397 if(lev == OD_EVDELIM){
1398 cblistpush(nwords, "", 0);
1399 } else {
1400 wp = aword;
1401 aword[wsiz] = '\0';
1402 while(*wp != '\0'){
1403 if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A';
1404 wp++;
1405 }
1406 wp--;
1407 while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){
1408 wsiz--;
1409 wp--;
1410 }
1411 cblistpush(nwords, aword, wsiz);
1412 }
1413 }
1414 wsiz = 0;
1415 }
1416 }
1417
1418
1419 /* Set the classes of characters used by `odanalyzetext'. */
odsetcharclass(ODEUM * odeum,const char * spacechars,const char * delimchars,const char * gluechars)1420 void odsetcharclass(ODEUM *odeum, const char *spacechars, const char *delimchars,
1421 const char *gluechars){
1422 assert(odeum && spacechars && delimchars && gluechars);
1423 memset(odeum->statechars, OD_EVWORD, sizeof(odeum->statechars));
1424 for(; *spacechars != '\0'; spacechars++){
1425 odeum->statechars[*(unsigned char *)spacechars] = OD_EVSPACE;
1426 }
1427 for(; *delimchars != '\0'; delimchars++){
1428 odeum->statechars[*(unsigned char *)delimchars] = OD_EVDELIM;
1429 }
1430 for(; *gluechars != '\0'; gluechars++){
1431 odeum->statechars[*(unsigned char *)gluechars] = OD_EVGLUE;
1432 }
1433 }
1434
1435
1436 /* Query a database using a small boolean query language. */
odquery(ODEUM * odeum,const char * query,int * np,CBLIST * errors)1437 ODPAIR *odquery(ODEUM *odeum, const char *query, int *np, CBLIST *errors){
1438 CBLIST *tokens = cblistopen();
1439 CBLIST *nwords = cblistopen();
1440 ODPAIR *results = NULL;
1441 assert(odeum && query && np);
1442 odanalyzetext(odeum, query, tokens, nwords);
1443 odcleannormalized(odeum, nwords);
1444 odfixtokens(odeum, tokens);
1445 results = odparseexpr(odeum, tokens, nwords, np, errors);
1446 cblistclose(tokens);
1447 cblistclose(nwords);
1448 return results;
1449 }
1450
1451
1452
1453 /*************************************************************************************************
1454 * features for experts
1455 *************************************************************************************************/
1456
1457
1458 /* Get the internal database handle for documents. */
odidbdocs(ODEUM * odeum)1459 CURIA *odidbdocs(ODEUM *odeum){
1460 assert(odeum);
1461 return odeum->docsdb;
1462 }
1463
1464
1465 /* Get the internal database handle for the inverted index. */
odidbindex(ODEUM * odeum)1466 CURIA *odidbindex(ODEUM *odeum){
1467 assert(odeum);
1468 return odeum->indexdb;
1469 }
1470
1471
1472 /* Get the internal database handle for the reverse dictionary. */
odidbrdocs(ODEUM * odeum)1473 VILLA *odidbrdocs(ODEUM *odeum){
1474 assert(odeum);
1475 return odeum->rdocsdb;
1476 }
1477
1478
1479 /* Set the call back function called in merging. */
odsetotcb(void (* otcb)(const char *,ODEUM *,const char *))1480 void odsetotcb(void (*otcb)(const char *, ODEUM *, const char *)){
1481 odotcb = otcb;
1482 }
1483
1484
1485 /* Get the positive one of square roots of a number. */
odsquareroot(double x)1486 double odsquareroot(double x){
1487 double c, rv;
1488 if(x <= 0.0) return 0.0;
1489 c = x > 1.0 ? x : 1;
1490 do {
1491 rv = c;
1492 c = (x / c + c) * 0.5;
1493 } while(c < rv);
1494 return rv;
1495 }
1496
1497
1498 /* Get the absolute of a vector. */
odvecabsolute(const int * vec,int vnum)1499 double odvecabsolute(const int *vec, int vnum){
1500 double rv;
1501 int i;
1502 assert(vec && vnum >= 0);
1503 rv = 0;
1504 for(i = 0; i < vnum; i++){
1505 rv += (double)vec[i] * (double)vec[i];
1506 }
1507 return odsquareroot(rv);
1508 }
1509
1510
1511 /* Get the inner product of two vectors. */
odvecinnerproduct(const int * avec,const int * bvec,int vnum)1512 double odvecinnerproduct(const int *avec, const int *bvec, int vnum){
1513 double rv;
1514 int i;
1515 assert(avec && bvec && vnum >= 0);
1516 rv = 0;
1517 for(i = 0; i < vnum; i++){
1518 rv += (double)avec[i] * (double)bvec[i];
1519 }
1520 return rv;
1521 }
1522
1523
1524
1525 /*************************************************************************************************
1526 * private objects
1527 *************************************************************************************************/
1528
1529
1530 /* Get a database handle.
1531 `name' specifies the name of a database directory.
1532 `omode' specifies the connection mode.
1533 `docsbnum` specifies the number of buckets of the document database.
1534 `indexbnum` specifies the number of buckets of the index database.
1535 `fname' specifies the name of caller function.
1536 The return value is the database handle or `NULL' if it is not successful. */
odopendb(const char * name,int omode,int docsbnum,int indexbnum,const char * fname)1537 static ODEUM *odopendb(const char *name, int omode, int docsbnum, int indexbnum,
1538 const char *fname){
1539 int cromode, vlomode, inode, dmax, dnum;
1540 char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ], *tmp;
1541 struct stat sbuf;
1542 CURIA *docsdb, *indexdb;
1543 VILLA *rdocsdb;
1544 CBMAP *cachemap;
1545 CBMAP *sortmap;
1546 ODEUM *odeum;
1547 assert(name);
1548 if(strlen(name) > OD_NAMEMAX){
1549 dpecodeset(DP_EMISC, __FILE__, __LINE__);
1550 return NULL;
1551 }
1552 cromode = CR_OREADER;
1553 vlomode = VL_OREADER;
1554 if(omode & OD_OWRITER){
1555 cromode = CR_OWRITER;
1556 vlomode = VL_OWRITER | VL_OZCOMP | VL_OYCOMP;
1557 if(omode & OD_OCREAT){
1558 cromode |= CR_OCREAT;
1559 vlomode |= VL_OCREAT;
1560 }
1561 if(omode & OD_OTRUNC){
1562 cromode |= CR_OTRUNC;
1563 vlomode |= VL_OTRUNC;
1564 }
1565 }
1566 if(omode & OD_ONOLCK){
1567 cromode |= CR_ONOLCK;
1568 vlomode |= VL_ONOLCK;
1569 }
1570 if(omode & OD_OLCKNB){
1571 cromode |= CR_OLCKNB;
1572 vlomode |= VL_OLCKNB;
1573 }
1574 sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME);
1575 sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME);
1576 sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME);
1577 docsdb = NULL;
1578 indexdb = NULL;
1579 rdocsdb = NULL;
1580 if((omode & OD_OWRITER) && (omode & OD_OCREAT)){
1581 if(mkdir(name, OD_DIRMODE) == -1 && errno != EEXIST){
1582 dpecodeset(DP_EMKDIR, __FILE__, __LINE__);
1583 return NULL;
1584 }
1585 }
1586 if(lstat(name, &sbuf) == -1){
1587 dpecodeset(DP_ESTAT, __FILE__, __LINE__);
1588 return NULL;
1589 }
1590 inode = sbuf.st_ino;
1591 if(!(docsdb = cropen(docsname, cromode, docsbnum, OD_DOCSDNUM))) return NULL;
1592 if(!(indexdb = cropen(indexname, cromode, indexbnum, odindexdnum))){
1593 crclose(docsdb);
1594 return NULL;
1595 }
1596 if(omode & OD_OWRITER){
1597 if(!crsetalign(docsdb, OD_DOCSALIGN) || !crsetfbpsiz(docsdb, OD_DOCSFBP) ||
1598 !crsetalign(indexdb, OD_INDEXALIGN) || !crsetfbpsiz(indexdb, OD_INDEXFBP)){
1599 crclose(indexdb);
1600 crclose(docsdb);
1601 return NULL;
1602 }
1603 }
1604 if(!(rdocsdb = vlopen(rdocsname, vlomode, VL_CMPLEX))){
1605 crclose(indexdb);
1606 crclose(docsdb);
1607 return NULL;
1608 }
1609 vlsettuning(rdocsdb, OD_RDOCSLRM, OD_RDOCSNIM, OD_RDOCSLCN, OD_RDOCSNCN);
1610 if(omode & OD_OWRITER){
1611 cachemap = cbmapopenex(odcachebnum);
1612 sortmap = cbmapopenex(odcachebnum);
1613 } else {
1614 cachemap = NULL;
1615 sortmap = NULL;
1616 }
1617 if(vlrnum(rdocsdb) > 0){
1618 dmax = -1;
1619 dnum = -1;
1620 if((tmp = vlget(rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), NULL)) != NULL){
1621 dmax = atoi(tmp);
1622 free(tmp);
1623 }
1624 if((tmp = vlget(rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), NULL)) != NULL){
1625 dnum = atoi(tmp);
1626 free(tmp);
1627 }
1628 if(dmax < 0 || dnum < 0){
1629 if(sortmap) cbmapclose(sortmap);
1630 if(cachemap) cbmapclose(cachemap);
1631 vlclose(rdocsdb);
1632 crclose(indexdb);
1633 crclose(docsdb);
1634 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
1635 return NULL;
1636 }
1637 } else {
1638 dmax = 0;
1639 dnum = 0;
1640 }
1641 odeum = cbmalloc(sizeof(ODEUM));
1642 odeum->name = cbmemdup(name, -1);
1643 odeum->wmode = omode & OD_OWRITER;
1644 odeum->fatal = FALSE;
1645 odeum->inode = inode;
1646 odeum->docsdb = docsdb;
1647 odeum->indexdb = indexdb;
1648 odeum->rdocsdb = rdocsdb;
1649 odeum->cachemap = cachemap;
1650 odeum->cacheasiz = 0;
1651 odeum->sortmap = sortmap;
1652 odeum->dmax = dmax;
1653 odeum->dnum = dnum;
1654 odeum->ldid = -1;
1655 odsetcharclass(odeum, OD_SPACECHARS, OD_DELIMCHARS, OD_GLUECHARS);
1656 if(odotcb) odotcb(fname, odeum, "the connection was established");
1657 return odeum;
1658 }
1659
1660
1661 /* Flush the cache for dirty buffer of words.
1662 `odeum' specifies a database handle.
1663 `fname' specifies the name of caller function.
1664 If successful, the return value is true, else, it is false. */
odcacheflush(ODEUM * odeum,const char * fname)1665 static int odcacheflush(ODEUM *odeum, const char *fname){
1666 const char *kbuf, *vbuf;
1667 char otmsg[OD_OTCBBUFSIZ];
1668 int i, rnum, ksiz, vsiz;
1669 assert(odeum);
1670 if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE;
1671 if(odotcb) odotcb(fname, odeum, "flushing caches");
1672 cbmapiterinit(odeum->cachemap);
1673 for(i = 0; (kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL; i++){
1674 vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz);
1675 if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){
1676 odeum->fatal = TRUE;
1677 return FALSE;
1678 }
1679 if(odotcb && (i + 1) % OD_OTPERWORDS == 0){
1680 sprintf(otmsg, "... (%d/%d)", i + 1, rnum);
1681 odotcb(fname, odeum, otmsg);
1682 }
1683 }
1684 cbmapclose(odeum->cachemap);
1685 odeum->cachemap = cbmapopenex(odcachebnum);
1686 odeum->cacheasiz = 0;
1687 return TRUE;
1688 }
1689
1690
1691 /* Flush all frequent words in the cache for dirty buffer of words.
1692 `odeum' specifies a database handle.
1693 `fname' specifies the name of caller function.
1694 `min' specifies the minimum size of frequent words.
1695 If successful, the return value is true, else, it is false. */
odcacheflushfreq(ODEUM * odeum,const char * fname,int min)1696 static int odcacheflushfreq(ODEUM *odeum, const char *fname, int min){
1697 const char *kbuf, *vbuf;
1698 char otmsg[OD_OTCBBUFSIZ];
1699 int rnum, ksiz, vsiz;
1700 assert(odeum);
1701 if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE;
1702 if(odotcb){
1703 sprintf(otmsg, "flushing frequent words: min=%d asiz=%d rnum=%d)",
1704 min, odeum->cacheasiz, rnum);
1705 odotcb(fname, odeum, otmsg);
1706 }
1707 cbmapiterinit(odeum->cachemap);
1708 while((kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL){
1709 vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz);
1710 if(vsiz >= sizeof(ODPAIR) * min){
1711 if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){
1712 odeum->fatal = TRUE;
1713 return FALSE;
1714 }
1715 cbmapout(odeum->cachemap, kbuf, ksiz);
1716 odeum->cacheasiz -= vsiz;
1717 }
1718 }
1719 if(odotcb){
1720 sprintf(otmsg, "... (done): min=%d asiz=%d rnum=%d)",
1721 min, odeum->cacheasiz, cbmaprnum(odeum->cachemap));
1722 odotcb(fname, odeum, otmsg);
1723 }
1724 return TRUE;
1725 }
1726
1727
1728 /* Flush the half of rare words in the cache for dirty buffer of words.
1729 `odeum' specifies a database handle.
1730 `fname' specifies the name of caller function.
1731 `ratio' specifies the ratio of rare words.
1732 If successful, the return value is true, else, it is false. */
odcacheflushrare(ODEUM * odeum,const char * fname,double ratio)1733 static int odcacheflushrare(ODEUM *odeum, const char *fname, double ratio){
1734 const char *kbuf, *vbuf;
1735 char otmsg[OD_OTCBBUFSIZ];
1736 int i, rnum, limit, ksiz, vsiz;
1737 assert(odeum);
1738 if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE;
1739 if(odotcb){
1740 sprintf(otmsg, "flushing rare words: ratio=%.2f asiz=%d rnum=%d)",
1741 ratio, odeum->cacheasiz, rnum);
1742 odotcb(fname, odeum, otmsg);
1743 }
1744 cbmapiterinit(odeum->cachemap);
1745 limit = (int)(rnum * ratio);
1746 for(i = 0; i < limit && (kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL; i++){
1747 vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz);
1748 if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){
1749 odeum->fatal = TRUE;
1750 return FALSE;
1751 }
1752 cbmapout(odeum->cachemap, kbuf, ksiz);
1753 odeum->cacheasiz -= vsiz;
1754 }
1755 if(odotcb){
1756 sprintf(otmsg, "... (done): ratio=%.2f asiz=%d rnum=%d)",
1757 ratio, odeum->cacheasiz, cbmaprnum(odeum->cachemap));
1758 odotcb(fname, odeum, otmsg);
1759 }
1760 return TRUE;
1761 }
1762
1763
1764 /* Sort the records of inverted index.
1765 `odeum' specifies a database handle.
1766 `fname' specifies the name of caller function.
1767 If successful, the return value is true, else, it is false. */
odsortindex(ODEUM * odeum,const char * fname)1768 static int odsortindex(ODEUM *odeum, const char *fname){
1769 const char *word;
1770 char *tmp, otmsg[OD_OTCBBUFSIZ];
1771 int i, rnum, wsiz, tsiz;
1772 ODPAIR *pairs;
1773 assert(odeum);
1774 if((rnum = cbmaprnum(odeum->sortmap)) < 1) return TRUE;
1775 if(odotcb) odotcb(fname, odeum, "sorting the inverted index");
1776 cbmapiterinit(odeum->sortmap);
1777 for(i = 0; (word = cbmapiternext(odeum->sortmap, &wsiz)) != NULL; i++){
1778 if((tmp = crget(odeum->indexdb, word, wsiz, 0, -1, &tsiz)) != NULL){
1779 if(tsiz > sizeof(ODPAIR)){
1780 pairs = (ODPAIR *)tmp;
1781 qsort(pairs, tsiz / sizeof(ODPAIR), sizeof(ODPAIR), odsortcompare);
1782 if(!crput(odeum->indexdb, word, wsiz, tmp, tsiz, CR_DOVER)){
1783 free(tmp);
1784 return FALSE;
1785 }
1786 }
1787 free(tmp);
1788 } else if(dpecode != DP_ENOITEM){
1789 return FALSE;
1790 }
1791 if(odotcb && (i + 1) % OD_OTPERWORDS == 0){
1792 sprintf(otmsg, "... (%d/%d)", i + 1, rnum);
1793 odotcb(fname, odeum, otmsg);
1794 }
1795 }
1796 cbmapclose(odeum->sortmap);
1797 odeum->sortmap = cbmapopenex(odcachebnum);
1798 return TRUE;
1799 }
1800
1801
1802 /* Compare two pairs of structures of a search result.
1803 `a' specifies the pointer to the region of one pair.
1804 `b' specifies the pointer to the region of the other pair.
1805 The return value is positive if the former is big, negative if the latter is big, 0 if both
1806 are equivalent. */
odsortcompare(const void * a,const void * b)1807 static int odsortcompare(const void *a, const void *b){
1808 ODPAIR *ap, *bp;
1809 int rv;
1810 assert(a && b);
1811 ap = (ODPAIR *)a;
1812 bp = (ODPAIR *)b;
1813 rv = bp->score - ap->score;
1814 if(rv != 0) return rv;
1815 return ap->id - bp->id;
1816 }
1817
1818
1819 /* Purge the elements of the deleted documents from the inverted index.
1820 `odeum' specifies a database handle.
1821 `fname' specifies the name of caller function.
1822 If successful, the return value is true, else, it is false. */
odpurgeindex(ODEUM * odeum,const char * fname)1823 static int odpurgeindex(ODEUM *odeum, const char *fname){
1824 ODPAIR *pairs;
1825 char *kbuf, *vbuf, otmsg[OD_OTCBBUFSIZ];
1826 int i, rnum, tnum, ksiz, vsiz, pnum, wi;
1827 assert(odeum);
1828 if((rnum = crrnum(odeum->indexdb)) < 1) return TRUE;
1829 if(odotcb) odotcb(fname, odeum, "purging dispensable regions");
1830 if(!criterinit(odeum->indexdb)) return FALSE;
1831 tnum = 0;
1832 while(TRUE){
1833 if(!(kbuf = criternext(odeum->indexdb, &ksiz))){
1834 if(dpecode != DP_ENOITEM) return FALSE;
1835 break;
1836 }
1837 if(!(vbuf = crget(odeum->indexdb, kbuf, ksiz, 0, -1, &vsiz))){
1838 dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
1839 free(kbuf);
1840 return FALSE;
1841 }
1842 pairs = (ODPAIR *)vbuf;
1843 pnum = vsiz / sizeof(ODPAIR);
1844 wi = 0;
1845 for(i = 0; i < pnum; i++){
1846 if(crvsiz(odeum->docsdb, (char *)&(pairs[i].id), sizeof(int)) != -1){
1847 pairs[wi++] = pairs[i];
1848 }
1849 }
1850 if(wi > 0){
1851 if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, wi * sizeof(ODPAIR), CR_DOVER)){
1852 free(vbuf);
1853 free(kbuf);
1854 return FALSE;
1855 }
1856 } else {
1857 if(!crout(odeum->indexdb, kbuf, ksiz)){
1858 free(vbuf);
1859 free(kbuf);
1860 return FALSE;
1861 }
1862 }
1863 free(vbuf);
1864 free(kbuf);
1865 if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){
1866 sprintf(otmsg, "... (%d/%d)", tnum + 1, rnum);
1867 odotcb(fname, odeum, otmsg);
1868 }
1869 tnum++;
1870 }
1871 return TRUE;
1872 }
1873
1874
1875 /* Create a map of a document array.
1876 `pairs' specifies the pointer to a document array.
1877 `num' specifies the number of elements of the array.
1878 The return value is a map of the document array. */
odpairsmap(const ODPAIR * pairs,int num)1879 static CBMAP *odpairsmap(const ODPAIR *pairs, int num){
1880 CBMAP *map;
1881 int i;
1882 assert(pairs && num >= 0);
1883 map = cbmapopen();
1884 for(i = 0; i < num; i++){
1885 cbmapput(map, (char *)&(pairs[i].id), sizeof(int),
1886 (char *)&(pairs[i].score), sizeof(int), TRUE);
1887 }
1888 return map;
1889 }
1890
1891
1892 /* compare two pairs of structures of words in a document.
1893 `a' specifies the pointer to the region of one word.
1894 `b' specifies the pointer to the region of the other word.
1895 The return value is positive if the former is big, negative if the latter is big, 0 if both
1896 are equivalent. */
odwordcompare(const void * a,const void * b)1897 static int odwordcompare(const void *a, const void *b){
1898 ODWORD *ap, *bp;
1899 int rv;
1900 assert(a && b);
1901 ap = (ODWORD *)a;
1902 bp = (ODWORD *)b;
1903 if((rv = bp->num - ap->num) != 0) return rv;
1904 if((rv = strlen(bp->word) - strlen(ap->word)) != 0) return rv;
1905 return strcmp(ap->word, bp->word);
1906 }
1907
1908
1909 /* Match an operator without taking it off the token list.
1910 `odeum' specifies a database handle.
1911 `tokens' specifies a list handle of tokens.
1912 The return value is whether the next token is an operator. */
odmatchoperator(ODEUM * odeum,CBLIST * tokens)1913 static int odmatchoperator(ODEUM *odeum, CBLIST *tokens){
1914 const char *tk = NULL;
1915 int tk_len = 0;
1916 tk = cblistval(tokens, 0, &tk_len);
1917 if(tk && (tk[0] == '&' || tk[0] == '|' || tk[0] == '!')) return 1;
1918 return 0;
1919 }
1920
1921
1922 /* Implements the subexpr part of the grammar.
1923 `odeum' specifies a database handle.
1924 `tokens' specifies a list handle of tokens.
1925 `nwords' specifies a list handle of normalized words.
1926 `np' specifies the pointer to a variable to which the number of the elements of the return
1927 value is assigned.
1928 `errors' specifies a list handle into which error messages are stored.
1929 The return value is the pointer to an array of document IDs. */
odparsesubexpr(ODEUM * odeum,CBLIST * tokens,CBLIST * nwords,int * np,CBLIST * errors)1930 static ODPAIR *odparsesubexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np,
1931 CBLIST *errors){
1932 char *tk = NULL;
1933 int tk_len = 0;
1934 char *nword = NULL; /* used to do the actual search, should match with tokens */
1935 ODPAIR *result = NULL;
1936 int result_num = 0;
1937 int i;
1938 double ival;
1939 if((tk = cblistshift(tokens, &tk_len)) != NULL){
1940 assert(tk != NULL);
1941 if(tk[0] == '('){
1942 free(tk);
1943 /* recurse into expr */
1944 result = odparseexpr(odeum, tokens, nwords, &result_num, errors);
1945 /* match right token RPAREN */
1946 tk = cblistshift(tokens, &tk_len);
1947 /* print an error if either we didn't get anything or we didn't get a ) */
1948 if(tk == NULL){
1949 if(errors) cblistpush(errors, "Expression ended without closing ')'", -1);
1950 } else if(tk[0] != ')'){
1951 if(errors) cblistpush(errors, "Un-balanced parenthesis.", -1);
1952 }
1953 } else if(odeum->statechars[*(unsigned char *)tk] == 0){
1954 /* Perform odsearch with the next norm word that isn't an operator. */
1955 nword = cblistshift(nwords, NULL);
1956 assert(nword != NULL);
1957 if((result = odsearch(odeum, nword, -1, &result_num)) != NULL){
1958 /* TF-IDF tuning */
1959 ival = odlogarithm(result_num);
1960 ival = (ival * ival) / 4.0;
1961 if(ival < 4.0) ival = 4.0;
1962 for(i = 0; i < result_num; i++){
1963 result[i].score = (int)(result[i].score / ival);
1964 }
1965 }
1966 free(nword);
1967 } else {
1968 if(errors) cblistpush(errors, "Invalid sub-expression. Expected '(' or WORD.", -1);
1969 result = cbmalloc(1);
1970 result_num = 0;
1971 }
1972 /* done with the token */
1973 free(tk);
1974 }
1975 *np = result_num;
1976 return result;
1977 }
1978
1979
1980 /* Implements the actual recursive decent parser for the mini query language.
1981 `odeum' specifies a database handle.
1982 `tokens' specifies a list handle of tokens.
1983 `nwords' specifies a list handle of normalized words.
1984 `np' specifies the pointer to a variable to which the number of the elements of the return
1985 value is assigned.
1986 `errors' specifies a list handle into which error messages are stored.
1987 The return value is the pointer to an array of document IDs.
1988 It simply parses an initial subexpr, and then loops over as many (operator subexpr)
1989 sequences as it can find. The odmatchoperator function handles injecting a default &
1990 between consecutive words. */
odparseexpr(ODEUM * odeum,CBLIST * tokens,CBLIST * nwords,int * np,CBLIST * errors)1991 static ODPAIR *odparseexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np,
1992 CBLIST *errors){
1993 ODPAIR *left = NULL;
1994 ODPAIR *right = NULL;
1995 ODPAIR *temp = NULL;
1996 int left_num = 0;
1997 int right_num = 0;
1998 int temp_num = 0;
1999 char *op = NULL;
2000 int op_len = 0;
2001 if(!(left = odparsesubexpr(odeum, tokens, nwords, &left_num, errors))) return NULL;
2002 /* expr ::= subexpr ( op subexpr )* */
2003 while(odmatchoperator(odeum, tokens)){
2004 op = cblistshift(tokens, &op_len);
2005 if(!(right = odparsesubexpr(odeum, tokens, nwords, &right_num, errors))){
2006 free(op);
2007 free(left);
2008 return NULL;
2009 }
2010 switch(op[0]){
2011 case '&':
2012 temp = odpairsand(left, left_num, right, right_num, &temp_num);
2013 break;
2014 case '|':
2015 temp = odpairsor(left, left_num, right, right_num, &temp_num);
2016 break;
2017 case '!':
2018 temp = odpairsnotand(left, left_num, right, right_num, &temp_num);
2019 break;
2020 default:
2021 if(errors) cblistpush(errors, "Invalid operator. Expected '&', '|', or '!'.", -1);
2022 break;
2023 }
2024 if(temp){
2025 /* an operator was done so we must swap it with the left */
2026 free(left); left = NULL;
2027 left = temp;
2028 left_num = temp_num;
2029 }
2030 free(op);
2031 if(right) free(right);
2032 }
2033 *np = left_num;
2034 return left;
2035 }
2036
2037
2038 /* Processes the tokens in order to break them up further.
2039 `odeum' specifies a database handle.
2040 `tokens' specifies a list handle of tokens. */
odfixtokens(ODEUM * odeum,CBLIST * tokens)2041 static void odfixtokens(ODEUM *odeum, CBLIST *tokens){
2042 const char *tk = NULL;
2043 int tk_len = 0;
2044 int i = 0;
2045 int lastword = 0;
2046 for(i = 0; i < cblistnum(tokens); i++){
2047 tk = cblistval(tokens, i, &tk_len);
2048 assert(tk);
2049 if(tk[0] == '&' || tk[0] == '|' || tk[0] == '!' || tk[0] == '(' || tk[0] == ')'){
2050 lastword = 0;
2051 if(tk_len > 1){
2052 /* need to break it up for the next loop around */
2053 tk = cblistremove(tokens, i, &tk_len);
2054 cblistinsert(tokens, i, tk, 1);
2055 cblistinsert(tokens, i+1, tk+1, tk_len-1);
2056 free((char *)tk);
2057 }
2058 } else if(odeum->statechars[*(unsigned char *)tk] == 0){
2059 /* if the last one was a word and this is a word then we need a default & between them */
2060 if(lastword){
2061 cblistinsert(tokens, i, "&", 1);
2062 i++;
2063 }
2064 lastword = 1;
2065 }
2066 }
2067 }
2068
2069
2070 /* Cleans out the parts of the normalized word list that are not considered words.
2071 `odeum' specifies a database handle.
2072 `tokens' specifies a list handle of tokens. */
odcleannormalized(ODEUM * odeum,CBLIST * nwords)2073 static void odcleannormalized(ODEUM *odeum, CBLIST *nwords){
2074 char *tk = NULL;
2075 int tk_len = 0;
2076 int i = 0;
2077 for(i = 0; i < cblistnum(nwords); i++){
2078 tk = (char *)cblistval(nwords, i, &tk_len);
2079 if(tk_len == 0 || (!odeum->statechars[*(unsigned char *)tk] == 0)){
2080 /* not a word so delete it */
2081 tk = cblistremove(nwords, i, &tk_len);
2082 free(tk);
2083 i--;
2084 }
2085 }
2086 }
2087
2088
2089
2090 /* END OF FILE */
2091