1 /*************************************************************************************************
2  * Implementation of Odeum
3  *                                                      Copyright (C) 2000-2007 Mikio Hirabayashi
4  * This file is part of QDBM, Quick Database Manager.
5  * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU
6  * Lesser General Public License as published by the Free Software Foundation; either version
7  * 2.1 of the License or any later version.  QDBM is distributed in the hope that it will be
8  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
9  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
10  * details.
11  * You should have received a copy of the GNU Lesser General Public License along with QDBM; if
12  * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
13  * 02111-1307 USA.
14  *************************************************************************************************/
15 
16 
17 #define QDBM_INTERNAL  1
18 
19 #include "odeum.h"
20 #include "myconf.h"
21 
22 #define OD_NAMEMAX     256               /* max size of a database name */
23 #define OD_DIRMODE     00755             /* permission of a creating directory */
24 #define OD_PATHBUFSIZ  1024              /* size of a path buffer */
25 #define OD_NUMBUFSIZ   32                /* size of a buffer for a number */
26 #define OD_MAPPBNUM    127               /* bucket size of a petit map handle */
27 #define OD_DOCSNAME    "docs"            /* name of the database for documents */
28 #define OD_INDEXNAME   "index"           /* name of the database for inverted index */
29 #define OD_RDOCSNAME   "rdocs"           /* name of the database for reverse dictionary */
30 #define OD_DOCSBNUM    2039              /* initial bucket number of document database */
31 #define OD_DOCSDNUM    17                /* division number of document database */
32 #define OD_DOCSALIGN   -4                /* alignment of document database */
33 #define OD_DOCSFBP     32                /* size of free block pool of document database */
34 #define OD_INDEXBNUM   32749             /* initial bucket number of inverted index */
35 #define OD_INDEXDNUM   7                 /* division number of inverted index */
36 #define OD_INDEXALIGN  -2                /* alignment of inverted index */
37 #define OD_INDEXFBP    32                /* size of free block pool of inverted index */
38 #define OD_RDOCSLRM    81                /* records in a leaf node of reverse dictionary */
39 #define OD_RDOCSNIM    192               /* records in a non-leaf node of reverse dictionary */
40 #define OD_RDOCSLCN    128               /* number of leaf cache of reverse dictionary */
41 #define OD_RDOCSNCN    32                /* number of non-leaf cache of reverse dictionary */
42 #define OD_CACHEBNUM   262139            /* number of buckets for dirty buffers */
43 #define OD_CACHESIZ    8388608           /* max bytes to use memory for dirty buffers */
44 #define OD_CFLIVERAT   0.8               /* ratio of usable cache region */
45 #define OD_CFBEGSIZ    2048              /* beginning size of flushing frequent words */
46 #define OD_CFENDSIZ    64                /* lower limit of flushing frequent words */
47 #define OD_CFRFRAT     0.2               /* ratio of flushing rare words a time */
48 #define OD_OTCBBUFSIZ  1024              /* size of a buffer for call back functions */
49 #define OD_OTPERWORDS  10000             /* frequency of call back in merging index */
50 #define OD_OTPERDOCS   1000              /* frequency of call back in merging docs */
51 #define OD_MDBRATIO    2.5               /* ratio of bucket number and document number */
52 #define OD_MIBRATIO    1.5               /* ratio of bucket number and word number */
53 #define OD_MIARATIO    0.75              /* ratio of alignment to the first words */
54 #define OD_MIWUNIT     32                /* writing unit of merging inverted index */
55 #define OD_DMAXEXPR    "dmax"            /* key of max number of the document ID */
56 #define OD_DNUMEXPR    "dnum"            /* key of number of the documents */
57 #define OD_URIEXPR     "1"               /* map key of URI */
58 #define OD_ATTRSEXPR   "2"               /* map key of attributes */
59 #define OD_NWORDSEXPR  "3"               /* map key of normal words */
60 #define OD_AWORDSEXPR  "4"               /* map key of as-is words */
61 #define OD_WTOPRATE    0.1               /* ratio of top words */
62 #define OD_WTOPBONUS   5000              /* bonus points of top words */
63 #define OD_KEYCRATIO   1.75              /* ratio of number to max of keyword candidates */
64 #define OD_WOCCRPOINT  10000             /* points per occurence */
65 #define OD_SPACECHARS  "\t\n\v\f\r "     /* space characters */
66 #define OD_DELIMCHARS  "!\"#$%&'()*/<=>?[\\]^`{|}~"  /* delimiter characters */
67 #define OD_GLUECHARS   "+,-.:;@"         /* glueing characters */
68 #define OD_MAXWORDLEN  48                /* max length of a word */
69 
70 typedef struct {                         /* type of structure for word counting */
71   const char *word;                      /* pointer to the word */
72   int num;                               /* frequency of the word */
73 } ODWORD;
74 
75 enum {                                   /* enumeration for events binded to each character */
76   OD_EVWORD,                             /* word */
77   OD_EVSPACE,                            /* space */
78   OD_EVDELIM,                            /* delimiter */
79   OD_EVGLUE                              /* glue */
80 };
81 
82 
83 /* private global variables */
84 int odindexbnum = OD_INDEXBNUM;
85 int odindexdnum = OD_INDEXDNUM;
86 int odcachebnum = OD_CACHEBNUM;
87 int odcachesiz = OD_CACHESIZ;
88 void (*odotcb)(const char *, ODEUM *, const char *) = NULL;
89 
90 
91 /* private function prototypes */
92 static ODEUM *odopendb(const char *name, int omode, int docsbnum, int indexbnum,
93                        const char *fname);
94 static int odcacheflush(ODEUM *odeum, const char *fname);
95 static int odcacheflushfreq(ODEUM *odeum, const char *fname, int min);
96 static int odcacheflushrare(ODEUM *odeum, const char *fname, double ratio);
97 static int odsortindex(ODEUM *odeum, const char *fname);
98 static int odsortcompare(const void *a, const void *b);
99 static int odpurgeindex(ODEUM *odeum, const char *fname);
100 static CBMAP *odpairsmap(const ODPAIR *pairs, int num);
101 static int odwordcompare(const void *a, const void *b);
102 static int odmatchoperator(ODEUM *odeum, CBLIST *tokens);
103 static ODPAIR *odparsesubexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np,
104                               CBLIST *errors);
105 static ODPAIR *odparseexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np,
106                            CBLIST *errors);
107 static void odfixtokens(ODEUM *odeum, CBLIST *tokens);
108 static void odcleannormalized(ODEUM *odeum, CBLIST *nwords);
109 
110 
111 
112 /*************************************************************************************************
113  * public objects
114  *************************************************************************************************/
115 
116 
117 /* Get a database handle. */
odopen(const char * name,int omode)118 ODEUM *odopen(const char *name, int omode){
119   assert(name);
120   return odopendb(name, omode, OD_DOCSBNUM, odindexbnum, "odopen");
121 }
122 
123 
124 /* Close a database handle. */
odclose(ODEUM * odeum)125 int odclose(ODEUM *odeum){
126   char numbuf[OD_NUMBUFSIZ];
127   int err;
128   assert(odeum);
129   err = FALSE;
130   if(odotcb) odotcb("odclose", odeum, "closing the connection");
131   if(odeum->wmode){
132     if(odotcb) odotcb("odclose", odeum, "writing meta information");
133     sprintf(numbuf, "%d", odeum->dmax);
134     if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)) err = TRUE;
135     sprintf(numbuf, "%d", odeum->dnum);
136     if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)) err = TRUE;
137     if(!odcacheflushfreq(odeum, "odclose", OD_CFENDSIZ)) err = TRUE;
138     if(!odcacheflushrare(odeum, "odclose", OD_CFRFRAT)) err = TRUE;
139     if(!odcacheflush(odeum, "odclose")) err = TRUE;
140     if(!odsortindex(odeum, "odclose")) err = TRUE;
141     cbmapclose(odeum->cachemap);
142     cbmapclose(odeum->sortmap);
143   }
144   if(!vlclose(odeum->rdocsdb)) err = TRUE;
145   if(!crclose(odeum->indexdb)) err = TRUE;
146   if(!crclose(odeum->docsdb)) err = TRUE;
147   free(odeum->name);
148   free(odeum);
149   return err ? FALSE : TRUE;
150 }
151 
152 
153 /* Store a document. */
odput(ODEUM * odeum,ODDOC * doc,int wmax,int over)154 int odput(ODEUM *odeum, ODDOC *doc, int wmax, int over){
155   char *tmp, *zbuf;
156   const char *word, *ctmp;
157   int i, docid, tsiz, wsiz, wnum, tmax, num, zsiz;
158   double ival;
159   ODPAIR pair;
160   CBMAP *map;
161   CBLIST *tlist;
162   assert(odeum);
163   if(odeum->fatal){
164     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
165     return FALSE;
166   }
167   if(!odeum->wmode){
168     dpecodeset(DP_EMODE, __FILE__, __LINE__);
169     return FALSE;
170   }
171   if((tmp = vlget(odeum->rdocsdb, doc->uri, -1, &tsiz)) != NULL){
172     if(!over){
173       free(tmp);
174       dpecodeset(DP_EKEEP, __FILE__, __LINE__);
175       return FALSE;
176     }
177     if(tsiz != sizeof(int) || !odoutbyid(odeum, *(int *)tmp)){
178       free(tmp);
179       dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
180       odeum->fatal = TRUE;
181       return FALSE;
182     }
183     free(tmp);
184   }
185   odeum->dmax++;
186   odeum->dnum++;
187   docid = odeum->dmax;
188   map = cbmapopen();
189   cbmapput(map, OD_URIEXPR, sizeof(OD_URIEXPR), doc->uri, -1, TRUE);
190   tmp = cbmapdump(doc->attrs, &tsiz);
191   cbmapput(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), tmp, tsiz, TRUE);
192   free(tmp);
193   if(wmax < 0 || wmax > cblistnum(doc->nwords)) wmax = cblistnum(doc->nwords);
194   tlist = cblistopen();
195   for(i = 0; i < wmax; i++){
196     ctmp = cblistval(doc->nwords, i, &wsiz);
197     cblistpush(tlist, ctmp, wsiz);
198   }
199   tmp = cblistdump(tlist, &tsiz);
200   cbmapput(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), tmp, tsiz, TRUE);
201   free(tmp);
202   cblistclose(tlist);
203   tlist = cblistopen();
204   for(i = 0; i < wmax; i++){
205     ctmp = cblistval(doc->awords, i, &wsiz);
206     if(strcmp(ctmp, cblistval(doc->nwords, i, NULL))){
207       cblistpush(tlist, ctmp, wsiz);
208     } else {
209       cblistpush(tlist, "\0", 1);
210     }
211   }
212   tmp = cblistdump(tlist, &tsiz);
213   cbmapput(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), tmp, tsiz, TRUE);
214   free(tmp);
215   cblistclose(tlist);
216   tmp = cbmapdump(map, &tsiz);
217   cbmapclose(map);
218   if(_qdbm_deflate){
219     if(!(zbuf = _qdbm_deflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){
220       free(tmp);
221       dpecodeset(DP_EMISC, __FILE__, __LINE__);
222       odeum->fatal = TRUE;
223       return FALSE;
224     }
225     free(tmp);
226     tmp = zbuf;
227     tsiz = zsiz;
228   }
229   if(!crput(odeum->docsdb, (char *)&docid, sizeof(int), tmp, tsiz, CR_DKEEP)){
230     free(tmp);
231     if(dpecode == DP_EKEEP) dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
232     odeum->fatal = TRUE;
233     return FALSE;
234   }
235   free(tmp);
236   if(!vlput(odeum->rdocsdb, doc->uri, -1, (char *)&docid, sizeof(int), VL_DOVER)){
237     odeum->fatal = TRUE;
238     return FALSE;
239   }
240   map = cbmapopen();
241   wnum = cblistnum(doc->nwords);
242   tmax = (int)(wnum * OD_WTOPRATE);
243   for(i = 0; i < wnum; i++){
244     word = cblistval(doc->nwords, i, &wsiz);
245     if(wsiz < 1) continue;
246     if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){
247       num = *(int *)ctmp + OD_WOCCRPOINT;
248     } else {
249       num = i <= tmax ? OD_WTOPBONUS + OD_WOCCRPOINT : OD_WOCCRPOINT;
250     }
251     cbmapput(map, word, wsiz, (char *)&num, sizeof(int), TRUE);
252   }
253   ival = odlogarithm(wnum);
254   ival = (ival * ival * ival) / 8.0;
255   if(ival < 8.0) ival = 8.0;
256   cbmapiterinit(map);
257   while((word = cbmapiternext(map, &wsiz)) != NULL){
258     pair.id = docid;
259     pair.score = (int)(*(int *)cbmapget(map, word, wsiz, NULL) / ival);
260     cbmapputcat(odeum->cachemap, word, wsiz, (char *)&pair, sizeof(pair));
261     cbmapmove(odeum->cachemap, word, wsiz, FALSE);
262     odeum->cacheasiz += sizeof(pair);
263     cbmapput(odeum->sortmap, word, wsiz, "", 0, FALSE);
264   }
265   cbmapclose(map);
266   if(odeum->cacheasiz > odcachesiz){
267     for(i = OD_CFBEGSIZ; odeum->cacheasiz > odcachesiz * OD_CFLIVERAT && i >= OD_CFENDSIZ;
268         i /= 2){
269       if(!odcacheflushfreq(odeum, "odput", i)) return FALSE;
270     }
271     while(odeum->cacheasiz > odcachesiz * OD_CFLIVERAT){
272       if(!odcacheflushrare(odeum, "odput", OD_CFRFRAT)) return FALSE;
273     }
274   }
275   doc->id = docid;
276   odeum->ldid = docid;
277   return TRUE;
278 }
279 
280 
281 /* Delete a document by a URL. */
odout(ODEUM * odeum,const char * uri)282 int odout(ODEUM *odeum, const char *uri){
283   char *tmp;
284   int tsiz, docid;
285   assert(odeum && uri);
286   if(odeum->fatal){
287     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
288     return FALSE;
289   }
290   if(!odeum->wmode){
291     dpecodeset(DP_EMODE, __FILE__, __LINE__);
292     return FALSE;
293   }
294   if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){
295     if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
296     return FALSE;
297   }
298   if(tsiz != sizeof(int)){
299     free(tmp);
300     dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
301     odeum->fatal = TRUE;
302     return FALSE;
303   }
304   docid = *(int *)tmp;
305   free(tmp);
306   return odoutbyid(odeum, docid);
307 }
308 
309 
310 /* Delete a document specified by an ID number. */
odoutbyid(ODEUM * odeum,int id)311 int odoutbyid(ODEUM *odeum, int id){
312   char *tmp, *zbuf;
313   const char *uritmp;
314   int tsiz, uritsiz, zsiz;
315   CBMAP *map;
316   assert(odeum && id > 0);
317   if(odeum->fatal){
318     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
319     return FALSE;
320   }
321   if(!odeum->wmode){
322     dpecodeset(DP_EMODE, __FILE__, __LINE__);
323     return FALSE;
324   }
325   if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){
326     if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
327     return FALSE;
328   }
329   if(_qdbm_inflate){
330     if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){
331       free(tmp);
332       dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
333       odeum->fatal = TRUE;
334       return FALSE;
335     }
336     free(tmp);
337     tmp = zbuf;
338     tsiz = zsiz;
339   }
340   map = cbmapload(tmp, tsiz);
341   free(tmp);
342   uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz);
343   if(!uritmp || !vlout(odeum->rdocsdb, uritmp, uritsiz)){
344     cbmapclose(map);
345     dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
346     odeum->fatal = TRUE;
347     return FALSE;
348   }
349   cbmapclose(map);
350   if(!crout(odeum->docsdb, (char *)&id, sizeof(int))){
351     odeum->fatal = TRUE;
352     return FALSE;
353   }
354   odeum->dnum--;
355   return TRUE;
356 }
357 
358 
359 /* Retrieve a document by a URI. */
odget(ODEUM * odeum,const char * uri)360 ODDOC *odget(ODEUM *odeum, const char *uri){
361   char *tmp;
362   int tsiz, docid;
363   assert(odeum && uri);
364   if(odeum->fatal){
365     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
366     return NULL;
367   }
368   if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){
369     if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
370     return NULL;
371   }
372   if(tsiz != sizeof(int)){
373     free(tmp);
374     dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
375     odeum->fatal = TRUE;
376     return NULL;
377   }
378   docid = *(int *)tmp;
379   free(tmp);
380   return odgetbyid(odeum, docid);
381 }
382 
383 
384 /* Retrieve a document by an ID number. */
odgetbyid(ODEUM * odeum,int id)385 ODDOC *odgetbyid(ODEUM *odeum, int id){
386   char *tmp, *zbuf;
387   const char *uritmp, *attrstmp, *nwordstmp, *awordstmp, *asis, *normal;
388   int i, tsiz, uritsiz, attrstsiz, nwordstsiz, awordstsiz, zsiz, asiz, nsiz;
389   ODDOC *doc;
390   CBMAP *map;
391   assert(odeum);
392   if(odeum->fatal){
393     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
394     return NULL;
395   }
396   if(id < 1){
397     dpecodeset(DP_ENOITEM, __FILE__, __LINE__);
398     return NULL;
399   }
400   if(!(tmp = crget(odeum->docsdb, (char *)&id, sizeof(int), 0, -1, &tsiz))){
401     if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
402     return NULL;
403   }
404   if(_qdbm_inflate){
405     if(!(zbuf = _qdbm_inflate(tmp, tsiz, &zsiz, _QDBM_ZMRAW))){
406       free(tmp);
407       dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
408       odeum->fatal = TRUE;
409       return NULL;
410     }
411     free(tmp);
412     tmp = zbuf;
413     tsiz = zsiz;
414   }
415   map = cbmapload(tmp, tsiz);
416   free(tmp);
417   uritmp = cbmapget(map, OD_URIEXPR, sizeof(OD_URIEXPR), &uritsiz);
418   attrstmp = cbmapget(map, OD_ATTRSEXPR, sizeof(OD_ATTRSEXPR), &attrstsiz);
419   nwordstmp = cbmapget(map, OD_NWORDSEXPR, sizeof(OD_NWORDSEXPR), &nwordstsiz);
420   awordstmp = cbmapget(map, OD_AWORDSEXPR, sizeof(OD_AWORDSEXPR), &awordstsiz);
421   if(!uritmp || !attrstmp || !nwordstmp || !awordstmp){
422     cbmapclose(map);
423     dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
424     odeum->fatal = TRUE;
425     return NULL;
426   }
427   doc = cbmalloc(sizeof(ODDOC));
428   doc->id = id;
429   doc->uri = cbmemdup(uritmp, uritsiz);
430   doc->attrs = cbmapload(attrstmp, attrstsiz);
431   doc->nwords = cblistload(nwordstmp, nwordstsiz);
432   doc->awords = cblistload(awordstmp, awordstsiz);
433   cbmapclose(map);
434   for(i = 0; i < cblistnum(doc->awords); i++){
435     asis = cblistval(doc->awords, i, &asiz);
436     if(asiz == 1 && asis[0] == '\0'){
437       normal = cblistval(doc->nwords, i, &nsiz);
438       cblistover(doc->awords, i, normal, nsiz);
439     }
440   }
441   return doc;
442 }
443 
444 
445 /* Retrieve the ID of the document specified by a URI. */
odgetidbyuri(ODEUM * odeum,const char * uri)446 int odgetidbyuri(ODEUM *odeum, const char *uri){
447   char *tmp;
448   int tsiz, docid;
449   assert(odeum && uri);
450   if(odeum->fatal){
451     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
452     return -1;
453   }
454   if(!(tmp = vlget(odeum->rdocsdb, uri, -1, &tsiz))){
455     if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
456     return -1;
457   }
458   if(tsiz != sizeof(int)){
459     free(tmp);
460     dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
461     odeum->fatal = TRUE;
462     return -1;
463   }
464   docid = *(int *)tmp;
465   free(tmp);
466   return docid;
467 }
468 
469 
470 /* Check whether the document specified by an ID number exists. */
odcheck(ODEUM * odeum,int id)471 int odcheck(ODEUM *odeum, int id){
472   assert(odeum);
473   if(odeum->fatal){
474     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
475     return FALSE;
476   }
477   if(id < 1){
478     dpecodeset(DP_ENOITEM, __FILE__, __LINE__);
479     return FALSE;
480   }
481   return crvsiz(odeum->docsdb, (char *)&id, sizeof(int)) != -1;
482 }
483 
484 
485 /* Search the inverted index for documents including a word. */
odsearch(ODEUM * odeum,const char * word,int max,int * np)486 ODPAIR *odsearch(ODEUM *odeum, const char *word, int max, int *np){
487   char *tmp;
488   int tsiz;
489   assert(odeum && word && np);
490   if(odeum->fatal){
491     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
492     return NULL;
493   }
494   if(odeum->wmode && cbmaprnum(odeum->sortmap) > 0 &&
495      (!odcacheflush(odeum, "odsearch") || !odsortindex(odeum, "odsearch"))){
496     odeum->fatal = TRUE;
497     return NULL;
498   }
499   max = max < 0 ? -1 : max * sizeof(ODPAIR);
500   if(!(tmp = crget(odeum->indexdb, word, -1, 0, max, &tsiz))){
501     if(dpecode != DP_ENOITEM){
502       odeum->fatal = TRUE;
503       return NULL;
504     }
505     *np = 0;
506     return cbmalloc(1);
507   }
508   *np = tsiz / sizeof(ODPAIR);
509   return (ODPAIR *)tmp;
510 }
511 
512 
513 /* Get the number of documents including a word. */
odsearchdnum(ODEUM * odeum,const char * word)514 int odsearchdnum(ODEUM *odeum, const char *word){
515   int rv;
516   assert(odeum && word);
517   if(odeum->fatal){
518     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
519     return -1;
520   }
521   rv = crvsiz(odeum->indexdb, word, -1);
522   return rv < 0 ? -1 : rv / sizeof(ODPAIR);
523 }
524 
525 
526 /* Initialize the iterator of a database handle. */
oditerinit(ODEUM * odeum)527 int oditerinit(ODEUM *odeum){
528   assert(odeum);
529   if(odeum->fatal){
530     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
531     return FALSE;
532   }
533   return criterinit(odeum->docsdb);
534 }
535 
536 
537 /* Get the next key of the iterator. */
oditernext(ODEUM * odeum)538 ODDOC *oditernext(ODEUM *odeum){
539   char *tmp;
540   int tsiz, docsid;
541   ODDOC *doc;
542   assert(odeum);
543   if(odeum->fatal){
544     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
545     return NULL;
546   }
547   doc = NULL;
548   while(TRUE){
549     if(!(tmp = criternext(odeum->docsdb, &tsiz))){
550       if(dpecode != DP_ENOITEM) odeum->fatal = TRUE;
551       return NULL;
552     }
553     if(tsiz != sizeof(int)){
554       free(tmp);
555       dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
556       odeum->fatal = TRUE;
557       return NULL;
558     }
559     docsid = *(int *)tmp;
560     free(tmp);
561     if((doc = odgetbyid(odeum, docsid)) != NULL) break;
562     if(dpecode != DP_ENOITEM){
563       odeum->fatal = TRUE;
564       return NULL;
565     }
566   }
567   return doc;
568 }
569 
570 
571 /* Synchronize updating contents with the files and the devices. */
odsync(ODEUM * odeum)572 int odsync(ODEUM *odeum){
573   char numbuf[OD_NUMBUFSIZ];
574   assert(odeum);
575   if(odeum->fatal){
576     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
577     return FALSE;
578   }
579   if(!odeum->wmode){
580     dpecodeset(DP_EMODE, __FILE__, __LINE__);
581     return FALSE;
582   }
583   if(odotcb) odotcb("odsync", odeum, "writing meta information");
584   sprintf(numbuf, "%d", odeum->dmax);
585   if(!vlput(odeum->rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), numbuf, -1, VL_DOVER)){
586     odeum->fatal = TRUE;
587     return FALSE;
588   }
589   sprintf(numbuf, "%d", odeum->dnum);
590   if(!vlput(odeum->rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), numbuf, -1, VL_DOVER)){
591     odeum->fatal = TRUE;
592     return FALSE;
593   }
594   if(!odcacheflush(odeum, "odsync")){
595     odeum->fatal = TRUE;
596     return FALSE;
597   }
598   if(!odsortindex(odeum, "odsync")){
599     odeum->fatal = TRUE;
600     return FALSE;
601   }
602   if(odotcb) odotcb("odsync", odeum, "synchronizing the document database");
603   if(!crsync(odeum->docsdb)){
604     odeum->fatal = TRUE;
605     return FALSE;
606   }
607   if(odotcb) odotcb("odsync", odeum, "synchronizing the inverted index");
608   if(!crsync(odeum->indexdb)){
609     odeum->fatal = TRUE;
610     return FALSE;
611   }
612   if(odotcb) odotcb("odsync", odeum, "synchronizing the reverse dictionary");
613   if(!vlsync(odeum->rdocsdb)){
614     odeum->fatal = TRUE;
615     return FALSE;
616   }
617   return TRUE;
618 }
619 
620 
621 /* Optimize a database. */
odoptimize(ODEUM * odeum)622 int odoptimize(ODEUM *odeum){
623   assert(odeum);
624   if(odeum->fatal){
625     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
626     return FALSE;
627   }
628   if(!odeum->wmode){
629     dpecodeset(DP_EMODE, __FILE__, __LINE__);
630     return FALSE;
631   }
632   if(!odcacheflush(odeum, "odoptimize")){
633     odeum->fatal = TRUE;
634     return FALSE;
635   }
636   if(odeum->ldid < 1 || odeum->ldid != odeum->dnum){
637     if(!odpurgeindex(odeum, "odoptimize")){
638       odeum->fatal = TRUE;
639       return FALSE;
640     }
641   }
642   if(odeum->ldid > 0){
643     if(!odsortindex(odeum, "odoptimize")){
644       odeum->fatal = TRUE;
645       return FALSE;
646     }
647   }
648   if(odotcb) odotcb("odoptimize", odeum, "optimizing the document database");
649   if(!croptimize(odeum->docsdb, -1)){
650     odeum->fatal = TRUE;
651     return FALSE;
652   }
653   if(odotcb) odotcb("odoptimize", odeum, "optimizing the inverted index");
654   if(!croptimize(odeum->indexdb, -1)){
655     odeum->fatal = TRUE;
656     return FALSE;
657   }
658   if(odotcb) odotcb("odoptimize", odeum, "optimizing the reverse dictionary");
659   if(!vloptimize(odeum->rdocsdb)){
660     odeum->fatal = TRUE;
661     return FALSE;
662   }
663   return TRUE;
664 }
665 
666 
667 /* Get the name of a database. */
odname(ODEUM * odeum)668 char *odname(ODEUM *odeum){
669   assert(odeum);
670   if(odeum->fatal){
671     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
672     return NULL;
673   }
674   return cbmemdup(odeum->name, -1);
675 }
676 
677 
678 /* Get the total size of database files. */
odfsiz(ODEUM * odeum)679 double odfsiz(ODEUM *odeum){
680   double fsiz, rv;
681   assert(odeum);
682   if(odeum->fatal){
683     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
684     return -1;
685   }
686   fsiz = 0;
687   if((rv = crfsizd(odeum->docsdb)) < 0) return -1.0;
688   fsiz += rv;
689   if((rv = crfsizd(odeum->indexdb)) < 0) return -1.0;
690   fsiz += rv;
691   if((rv = vlfsiz(odeum->rdocsdb)) == -1) return -1.0;
692   fsiz += rv;
693   return fsiz;
694 }
695 
696 
697 /* Get the total number of the elements of the bucket arrays for the inverted index. */
odbnum(ODEUM * odeum)698 int odbnum(ODEUM *odeum){
699   assert(odeum);
700   if(odeum->fatal){
701     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
702     return -1;
703   }
704   return crbnum(odeum->indexdb);
705 }
706 
707 
708 /* Get the total number of the used elements of the bucket arrays in the inverted index. */
odbusenum(ODEUM * odeum)709 int odbusenum(ODEUM *odeum){
710   assert(odeum);
711   if(odeum->fatal){
712     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
713     return -1;
714   }
715   return crbusenum(odeum->indexdb);
716 }
717 
718 
719 /* Get the number of the documents stored in a database. */
oddnum(ODEUM * odeum)720 int oddnum(ODEUM *odeum){
721   assert(odeum);
722   if(odeum->fatal){
723     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
724     return -1;
725   }
726   return odeum->dnum;
727 }
728 
729 
730 /* Get the number of the words stored in a database. */
odwnum(ODEUM * odeum)731 int odwnum(ODEUM *odeum){
732   assert(odeum);
733   if(odeum->fatal){
734     dpecodeset(DP_EFATAL, __FILE__, __LINE__);
735     return -1;
736   }
737   return crrnum(odeum->indexdb);
738 }
739 
740 
741 /* Check whether a database handle is a writer or not. */
odwritable(ODEUM * odeum)742 int odwritable(ODEUM *odeum){
743   assert(odeum);
744   return odeum->wmode;
745 }
746 
747 
748 /* Check whether a database has a fatal error or not. */
odfatalerror(ODEUM * odeum)749 int odfatalerror(ODEUM *odeum){
750   assert(odeum);
751   return odeum->fatal;
752 }
753 
754 
755 /* Get the inode number of a database directory. */
odinode(ODEUM * odeum)756 int odinode(ODEUM *odeum){
757   assert(odeum);
758   return odeum->inode;
759 }
760 
761 
762 /* Get the last modified time of a database. */
odmtime(ODEUM * odeum)763 time_t odmtime(ODEUM *odeum){
764   assert(odeum);
765   return crmtime(odeum->indexdb);
766 }
767 
768 
769 /* Merge plural database directories. */
odmerge(const char * name,const CBLIST * elemnames)770 int odmerge(const char *name, const CBLIST *elemnames){
771   ODEUM *odeum, **elems;
772   CURIA *curia, *ecuria;
773   VILLA *villa, *evilla;
774   ODPAIR *pairs;
775   char *word, *kbuf, *vbuf, *dbuf, otmsg[OD_OTCBBUFSIZ];
776   char *wpunit[OD_MIWUNIT], *vpunit[OD_MIWUNIT];
777   int i, j, k, num, dnum, wnum, dbnum, ibnum, tnum, wsunit[OD_MIWUNIT], vsunit[OD_MIWUNIT];
778   int err, *bases, sum, max, wsiz, ksiz, vsiz, uend, unum, pnum, align, id, nid, dsiz;
779   assert(name && elemnames);
780   num = cblistnum(elemnames);
781   elems = cbmalloc(num * sizeof(ODEUM *) + 1);
782   dnum = 0;
783   wnum = 0;
784   for(i = 0; i < num; i++){
785     if(!(elems[i] = odopen(cblistval(elemnames, i, NULL), OD_OREADER))){
786       for(i -= 1; i >= 0; i--){
787         odclose(elems[i]);
788       }
789       free(elems);
790       return FALSE;
791     }
792     dnum += oddnum(elems[i]);
793     wnum += odwnum(elems[i]);
794   }
795   dbnum = (int)(dnum * OD_MDBRATIO / OD_DOCSDNUM);
796   ibnum = (int)(wnum * OD_MIBRATIO / odindexdnum);
797   if(!(odeum = odopendb(name, OD_OWRITER | OD_OCREAT | OD_OTRUNC, dbnum, ibnum, "odmerge"))){
798     for(i = 0; i < num; i++){
799       odclose(elems[i]);
800     }
801     free(elems);
802     return FALSE;
803   }
804   err = FALSE;
805   if(odotcb) odotcb("odmerge", odeum, "calculating the base ID numbers");
806   bases = cbmalloc(num * sizeof(int) + 1);
807   sum = 0;
808   for(i = 0; i < num; i++){
809     ecuria = elems[i]->docsdb;
810     max = 0;
811     if(!criterinit(ecuria) && dpecode != DP_ENOITEM) err = TRUE;
812     while((kbuf = criternext(ecuria, &ksiz)) != NULL){
813       if(ksiz == sizeof(int)){
814         if(*(int *)kbuf > max) max = *(int *)kbuf;
815       }
816       free(kbuf);
817     }
818     bases[i] = sum;
819     sum += max;
820   }
821   curia = odeum->indexdb;
822   for(i = 0; i < num; i++){
823     if(odotcb){
824       sprintf(otmsg, "merging the inverted index (%d/%d)", i + 1, num);
825       odotcb("odmerge", odeum, otmsg);
826     }
827     ecuria = elems[i]->indexdb;
828     tnum = 0;
829     uend = FALSE;
830     if(!criterinit(ecuria) && dpecode != DP_ENOITEM) err = TRUE;
831     while(!uend){
832       for(unum = 0; unum < OD_MIWUNIT; unum++){
833         if(!(word = criternext(ecuria, &wsiz))){
834           uend = TRUE;
835           break;
836         }
837         if(!(vbuf = crget(ecuria, word, wsiz, 0, -1, &vsiz))){
838           err = TRUE;
839           free(word);
840           break;
841         }
842         wpunit[unum] = word;
843         wsunit[unum] = wsiz;
844         vpunit[unum] = vbuf;
845         vsunit[unum] = vsiz;
846       }
847       for(j = 0; j < unum; j++){
848         word = wpunit[j];
849         wsiz = wsunit[j];
850         vbuf = vpunit[j];
851         vsiz = vsunit[j];
852         pairs = (ODPAIR *)vbuf;
853         pnum = vsiz / sizeof(ODPAIR);
854         for(k = 0; k < pnum; k++){
855           pairs[k].id += bases[i];
856         }
857         align = (int)(i < num - 1 ? vsiz * (num - i) * OD_MIARATIO : OD_INDEXALIGN);
858         if(!crsetalign(curia, align)) err = TRUE;
859         if(!crput(curia, word, wsiz, vbuf, vsiz, CR_DCAT)) err = TRUE;
860         free(vbuf);
861         free(word);
862         if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){
863           sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(ecuria));
864           odotcb("odmerge", odeum, otmsg);
865         }
866         tnum++;
867       }
868     }
869   }
870   if(odotcb) odotcb("odmerge", odeum, "sorting the inverted index");
871   tnum = 0;
872   if(!criterinit(curia) && dpecode != DP_ENOITEM) err = TRUE;
873   while((word = criternext(curia, &wsiz)) != NULL){
874     if((vbuf = crget(curia, word, wsiz, 0, -1, &vsiz)) != NULL){
875       if(vsiz > sizeof(ODPAIR)){
876         pairs = (ODPAIR *)vbuf;
877         pnum = vsiz / sizeof(ODPAIR);
878         qsort(pairs, pnum, sizeof(ODPAIR), odsortcompare);
879         if(!crput(curia, word, wsiz, vbuf, vsiz, CR_DOVER)) err = TRUE;
880       }
881       free(vbuf);
882     }
883     free(word);
884     if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){
885       sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(curia));
886       odotcb("odmerge", odeum, otmsg);
887     }
888     tnum++;
889   }
890   if(odotcb) odotcb("odmerge", odeum, "synchronizing the inverted index");
891   if(!crsync(curia)) err = TRUE;
892   dnum = 0;
893   curia = odeum->docsdb;
894   villa = odeum->rdocsdb;
895   for(i = 0; i < num; i++){
896     if(odotcb){
897       sprintf(otmsg, "merging the document database (%d/%d)", i + 1, num);
898       odotcb("odmerge", odeum, otmsg);
899     }
900     evilla = elems[i]->rdocsdb;
901     ecuria = elems[i]->docsdb;
902     tnum = 0;
903     if(!vlcurfirst(evilla) && dpecode != DP_ENOITEM) err = TRUE;
904     while(TRUE){
905       if(!(kbuf = vlcurkey(evilla, &ksiz))) break;
906       if((ksiz == sizeof(OD_DMAXEXPR) && !memcmp(kbuf, OD_DMAXEXPR, ksiz)) ||
907          (ksiz == sizeof(OD_DNUMEXPR) && !memcmp(kbuf, OD_DNUMEXPR, ksiz))){
908         free(kbuf);
909         if(!vlcurnext(evilla)) break;
910         continue;
911       }
912       if(!(vbuf = vlcurval(evilla, &vsiz))){
913         free(kbuf);
914         if(!vlcurnext(evilla)) break;
915         continue;
916       }
917       if(vsiz != sizeof(int)){
918         free(vbuf);
919         free(kbuf);
920         if(!vlcurnext(evilla)) break;
921         continue;
922       }
923       id =  *(int *)vbuf;
924       nid = id + bases[i];
925       if(vlput(villa, kbuf, ksiz, (char *)&nid, sizeof(int), VL_DKEEP)){
926         if((dbuf = crget(ecuria, (char *)&id, sizeof(int), 0, -1, &dsiz)) != NULL){
927           if(crput(curia, (char *)&nid, sizeof(int), dbuf, dsiz, CR_DKEEP)){
928             dnum++;
929           } else {
930             err = TRUE;
931           }
932           free(dbuf);
933         } else {
934           err = TRUE;
935         }
936       } else if(dpecode != DP_EKEEP){
937         err = TRUE;
938       }
939       free(vbuf);
940       free(kbuf);
941       odeum->dnum++;
942       if(odotcb && (tnum + 1) % OD_OTPERDOCS == 0){
943         sprintf(otmsg, "... (%d/%d)", tnum + 1, crrnum(ecuria));
944         odotcb("odmerge", odeum, otmsg);
945       }
946       tnum++;
947       if(!vlcurnext(evilla)) break;
948     }
949   }
950   odeum->dnum = dnum;
951   odeum->dmax = dnum;
952   free(bases);
953   if(odotcb) odotcb("odmerge", odeum, "synchronizing the document index");
954   if(!crsync(curia)) err = TRUE;
955   if(!odclose(odeum)) err = TRUE;
956   for(i = 0; i < num; i++){
957     if(!odclose(elems[i])) err = TRUE;
958   }
959   free(elems);
960   return err ? FALSE : TRUE;
961 }
962 
963 
964 /* Remove a database directory. */
odremove(const char * name)965 int odremove(const char *name){
966   char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ];
967   char path[OD_PATHBUFSIZ];
968   const char *file;
969   struct stat sbuf;
970   CBLIST *list;
971   int i;
972   assert(name);
973   sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME);
974   sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME);
975   sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME);
976   if(lstat(name, &sbuf) == -1){
977     dpecodeset(DP_ESTAT, __FILE__, __LINE__);
978     return FALSE;
979   }
980   if(lstat(docsname, &sbuf) != -1 && !crremove(docsname)) return FALSE;
981   if(lstat(indexname, &sbuf) != -1 && !crremove(indexname)) return FALSE;
982   if(lstat(rdocsname, &sbuf) != -1 && !vlremove(rdocsname)) return FALSE;
983   if((list = cbdirlist(name)) != NULL){
984     for(i = 0; i < cblistnum(list); i++){
985       file = cblistval(list, i, NULL);
986       if(!strcmp(file, MYCDIRSTR) || !strcmp(file, MYPDIRSTR)) continue;
987       sprintf(path, "%s%c%s", name, MYPATHCHR, file);
988       if(lstat(path, &sbuf) == -1) continue;
989       if(S_ISDIR(sbuf.st_mode)){
990         if(!crremove(path)) return FALSE;
991       } else {
992         if(!dpremove(path)) return FALSE;
993       }
994     }
995     cblistclose(list);
996   }
997   if(rmdir(name) == -1){
998     dpecodeset(DP_ERMDIR, __FILE__, __LINE__);
999     return FALSE;
1000   }
1001   return TRUE;
1002 }
1003 
1004 
1005 /* Get a document handle. */
oddocopen(const char * uri)1006 ODDOC *oddocopen(const char *uri){
1007   ODDOC *doc;
1008   assert(uri);
1009   doc = cbmalloc(sizeof(ODDOC));
1010   doc->id = -1;
1011   doc->uri = cbmemdup(uri, -1);
1012   doc->attrs = cbmapopenex(OD_MAPPBNUM);
1013   doc->nwords = cblistopen();
1014   doc->awords = cblistopen();
1015   return doc;
1016 }
1017 
1018 
1019 /* Close a document handle. */
oddocclose(ODDOC * doc)1020 void oddocclose(ODDOC *doc){
1021   assert(doc);
1022   cblistclose(doc->awords);
1023   cblistclose(doc->nwords);
1024   cbmapclose(doc->attrs);
1025   free(doc->uri);
1026   free(doc);
1027 }
1028 
1029 
1030 /* Add an attribute to a document. */
oddocaddattr(ODDOC * doc,const char * name,const char * value)1031 void oddocaddattr(ODDOC *doc, const char *name, const char *value){
1032   assert(doc && name && value);
1033   cbmapput(doc->attrs, name, -1, value, -1, TRUE);
1034 }
1035 
1036 
1037 /* Add a word to a document. */
oddocaddword(ODDOC * doc,const char * normal,const char * asis)1038 void oddocaddword(ODDOC *doc, const char *normal, const char *asis){
1039   assert(doc && normal && asis);
1040   cblistpush(doc->nwords, normal, -1);
1041   cblistpush(doc->awords, asis, -1);
1042 }
1043 
1044 
1045 /* Get the ID number of a document. */
oddocid(const ODDOC * doc)1046 int oddocid(const ODDOC *doc){
1047   assert(doc);
1048   return doc->id;
1049 }
1050 
1051 
1052 /* Get the URI of a document. */
oddocuri(const ODDOC * doc)1053 const char *oddocuri(const ODDOC *doc){
1054   assert(doc);
1055   return doc->uri;
1056 }
1057 
1058 
1059 /* Get the value of an attribute of a document. */
oddocgetattr(const ODDOC * doc,const char * name)1060 const char *oddocgetattr(const ODDOC *doc, const char *name){
1061   assert(doc && name);
1062   return cbmapget(doc->attrs, name, -1, NULL);
1063 }
1064 
1065 
1066 /* Get the list handle contains words in normalized form of a document. */
oddocnwords(const ODDOC * doc)1067 const CBLIST *oddocnwords(const ODDOC *doc){
1068   assert(doc);
1069   return doc->nwords;
1070 }
1071 
1072 
1073 /* Get the list handle contains words in appearance form of a document. */
oddocawords(const ODDOC * doc)1074 const CBLIST *oddocawords(const ODDOC *doc){
1075   assert(doc);
1076   return doc->awords;
1077 }
1078 
1079 
1080 /* Get the map handle contains keywords in normalized form and their scores. */
oddocscores(const ODDOC * doc,int max,ODEUM * odeum)1081 CBMAP *oddocscores(const ODDOC *doc, int max, ODEUM *odeum){
1082   const CBLIST *nwords;
1083   CBMAP *map, *kwmap;
1084   const char *word, *ctmp;
1085   char numbuf[OD_NUMBUFSIZ];
1086   ODWORD *owords;
1087   int i, wsiz, wnum, hnum, mnum, nbsiz;
1088   double ival;
1089   assert(doc && max >= 0);
1090   map = cbmapopen();
1091   nwords = oddocnwords(doc);
1092   for(i = 0; i < cblistnum(nwords); i++){
1093     word = cblistval(nwords, i, &wsiz);
1094     if(wsiz < 1) continue;
1095     if((ctmp = cbmapget(map, word, wsiz, NULL)) != NULL){
1096       wnum = *(int *)ctmp + OD_WOCCRPOINT;
1097     } else {
1098       wnum = OD_WOCCRPOINT;
1099     }
1100     cbmapput(map, word, wsiz, (char *)&wnum, sizeof(int), TRUE);
1101   }
1102   mnum = cbmaprnum(map);
1103   owords = cbmalloc(mnum * sizeof(ODWORD) + 1);
1104   cbmapiterinit(map);
1105   for(i = 0; (word = cbmapiternext(map, &wsiz)) != NULL; i++){
1106     owords[i].word = word;
1107     owords[i].num = *(int *)cbmapget(map, word, wsiz, NULL);
1108   }
1109   qsort(owords, mnum, sizeof(ODWORD), odwordcompare);
1110   if(odeum){
1111     if(mnum > max * OD_KEYCRATIO) mnum = (int)(max * OD_KEYCRATIO);
1112     for(i = 0; i < mnum; i++){
1113       if((hnum = odsearchdnum(odeum, owords[i].word)) < 0) hnum = 0;
1114       ival = odlogarithm(hnum);
1115       ival = (ival * ival * ival) / 8.0;
1116       if(ival < 8.0) ival = 8.0;
1117       owords[i].num = (int)(owords[i].num / ival);
1118     }
1119     qsort(owords, mnum, sizeof(ODWORD), odwordcompare);
1120   }
1121   if(mnum > max) mnum = max;
1122   kwmap = cbmapopenex(OD_MAPPBNUM);
1123   for(i = 0; i < mnum; i++){
1124     nbsiz = sprintf(numbuf, "%d", owords[i].num);
1125     cbmapput(kwmap, owords[i].word, -1, numbuf, nbsiz, TRUE);
1126   }
1127   free(owords);
1128   cbmapclose(map);
1129   return kwmap;
1130 }
1131 
1132 
1133 /* Break a text into words in appearance form. */
odbreaktext(const char * text)1134 CBLIST *odbreaktext(const char *text){
1135   const char *word;
1136   CBLIST *elems, *words;
1137   int i, j, dif, wsiz, pv, delim;
1138   assert(text);
1139   words = cblistopen();
1140   elems = cbsplit(text, -1, OD_SPACECHARS);
1141   for(i = 0; i < cblistnum(elems); i++){
1142     word = cblistval(elems, i, &wsiz);
1143     delim = FALSE;
1144     j = 0;
1145     pv = 0;
1146     while(TRUE){
1147       dif = j - pv;
1148       if(j >= wsiz){
1149         if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv);
1150         break;
1151       }
1152       if(delim){
1153         if(!strchr(OD_DELIMCHARS, word[j])){
1154           if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv);
1155           pv = j;
1156           delim = FALSE;
1157         }
1158       } else {
1159         if(strchr(OD_DELIMCHARS, word[j])){
1160           if(dif > 0 && dif <= OD_MAXWORDLEN) cblistpush(words, word + pv, j - pv);
1161           pv = j;
1162           delim = TRUE;
1163         }
1164       }
1165       j++;
1166     }
1167   }
1168   cblistclose(elems);
1169   return words;
1170 }
1171 
1172 
1173 /* Make the normalized form of a word. */
odnormalizeword(const char * asis)1174 char *odnormalizeword(const char *asis){
1175   char *nword;
1176   int i;
1177   assert(asis);
1178   for(i = 0; asis[i] != '\0'; i++){
1179     if(!strchr(OD_DELIMCHARS, asis[i])) break;
1180   }
1181   if(asis[i] == '\0') return cbmemdup("", 0);
1182   nword = cbmemdup(asis, -1);
1183   for(i = 0; nword[i] != '\0'; i++){
1184     if(nword[i] >= 'A' && nword[i] <= 'Z') nword[i] += 'a' - 'A';
1185   }
1186   while(i >= 0){
1187     if(strchr(OD_GLUECHARS, nword[i])){
1188       nword[i] = '\0';
1189     } else {
1190       break;
1191     }
1192     i--;
1193   }
1194   return nword;
1195 }
1196 
1197 
1198 /* Get the common elements of two sets of documents. */
odpairsand(ODPAIR * apairs,int anum,ODPAIR * bpairs,int bnum,int * np)1199 ODPAIR *odpairsand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){
1200   CBMAP *map;
1201   ODPAIR *result;
1202   const char *tmp;
1203   int i, rnum;
1204   assert(apairs && anum >= 0 && bpairs && bnum >= 0);
1205   map = odpairsmap(bpairs, bnum);
1206   result = cbmalloc(sizeof(ODPAIR) * anum + 1);
1207   rnum = 0;
1208   for(i = 0; i < anum; i++){
1209     if(!(tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL))) continue;
1210     result[rnum].id = apairs[i].id;
1211     result[rnum].score = apairs[i].score + *(int *)tmp;
1212     rnum++;
1213   }
1214   cbmapclose(map);
1215   qsort(result, rnum, sizeof(ODPAIR), odsortcompare);
1216   *np = rnum;
1217   return result;
1218 }
1219 
1220 
1221 /* Get the sum of elements of two sets of documents. */
odpairsor(ODPAIR * apairs,int anum,ODPAIR * bpairs,int bnum,int * np)1222 ODPAIR *odpairsor(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){
1223   CBMAP *map;
1224   ODPAIR *result;
1225   const char *tmp;
1226   int i, score, rnum;
1227   assert(apairs && anum >= 0 && bpairs && bnum >= 0);
1228   map = odpairsmap(bpairs, bnum);
1229   for(i = 0; i < anum; i++){
1230     score = 0;
1231     if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL)
1232       score = *(int *)tmp;
1233     score += apairs[i].score;
1234     cbmapput(map, (char *)&(apairs[i].id), sizeof(int),
1235              (char *)&score, sizeof(int), TRUE);
1236   }
1237   rnum = cbmaprnum(map);
1238   result = cbmalloc(rnum * sizeof(ODPAIR) + 1);
1239   cbmapiterinit(map);
1240   for(i = 0; (tmp = cbmapiternext(map, NULL)) != NULL; i++){
1241     result[i].id = *(int *)tmp;
1242     result[i].score = *(int *)cbmapget(map, tmp, sizeof(int), NULL);
1243   }
1244   cbmapclose(map);
1245   qsort(result, rnum, sizeof(ODPAIR), odsortcompare);
1246   *np = rnum;
1247   return result;
1248 }
1249 
1250 
1251 /* Get the difference set of documents. */
odpairsnotand(ODPAIR * apairs,int anum,ODPAIR * bpairs,int bnum,int * np)1252 ODPAIR *odpairsnotand(ODPAIR *apairs, int anum, ODPAIR *bpairs, int bnum, int *np){
1253   CBMAP *map;
1254   ODPAIR *result;
1255   const char *tmp;
1256   int i, rnum;
1257   assert(apairs && anum >= 0 && bpairs && bnum >= 0);
1258   map = odpairsmap(bpairs, bnum);
1259   result = cbmalloc(sizeof(ODPAIR) * anum + 1);
1260   rnum = 0;
1261   for(i = 0; i < anum; i++){
1262     if((tmp = cbmapget(map, (char *)&(apairs[i].id), sizeof(int), NULL)) != NULL) continue;
1263     result[rnum].id = apairs[i].id;
1264     result[rnum].score = apairs[i].score;
1265     rnum++;
1266   }
1267   cbmapclose(map);
1268   qsort(result, rnum, sizeof(ODPAIR), odsortcompare);
1269   *np = rnum;
1270   return result;
1271 }
1272 
1273 
1274 /* Sort a set of documents in descending order of scores. */
odpairssort(ODPAIR * pairs,int pnum)1275 void odpairssort(ODPAIR *pairs, int pnum){
1276   assert(pairs && pnum >= 0);
1277   qsort(pairs, pnum, sizeof(ODPAIR), odsortcompare);
1278 }
1279 
1280 
1281 /* Get the natural logarithm of a number. */
odlogarithm(double x)1282 double odlogarithm(double x){
1283   int i;
1284   if(x <= 1.0) return 0.0;
1285   x = x * x * x * x * x * x * x * x * x * x;
1286   for(i = 0; x > 1.0; i++){
1287     x /= 2.718281828459;
1288   }
1289   return (double)i / 10.0;
1290 }
1291 
1292 
1293 /* Get the cosine of the angle of two vectors. */
odvectorcosine(const int * avec,const int * bvec,int vnum)1294 double odvectorcosine(const int *avec, const int *bvec, int vnum){
1295   double rv;
1296   assert(avec && bvec && vnum >= 0);
1297   rv = odvecinnerproduct(avec, bvec, vnum) /
1298     ((odvecabsolute(avec, vnum) * odvecabsolute(bvec, vnum)));
1299   return rv > 0.0 ? rv : 0.0;
1300 }
1301 
1302 
1303 /* Set the global tuning parameters. */
odsettuning(int ibnum,int idnum,int cbnum,int csiz)1304 void odsettuning(int ibnum, int idnum, int cbnum, int csiz){
1305   if(ibnum > 0) odindexbnum = ibnum;
1306   if(idnum > 0) odindexdnum = idnum;
1307   if(cbnum > 0) odcachebnum = dpprimenum(cbnum);
1308   if(csiz > 0) odcachesiz = csiz;
1309 }
1310 
1311 
1312 /* Break a text into words and store appearance forms and normalized form into lists. */
odanalyzetext(ODEUM * odeum,const char * text,CBLIST * awords,CBLIST * nwords)1313 void odanalyzetext(ODEUM *odeum, const char *text, CBLIST *awords, CBLIST *nwords){
1314   char aword[OD_MAXWORDLEN+1], *wp;
1315   int lev, wsiz;
1316   assert(odeum && text && awords);
1317   lev = OD_EVSPACE;
1318   wsiz = 0;
1319   for(; *text != '\0'; text++){
1320     switch(odeum->statechars[*(unsigned char *)text]){
1321     case OD_EVWORD:
1322       if(wsiz > 0 && lev == OD_EVDELIM){
1323         cblistpush(awords, aword, wsiz);
1324         if(nwords) cblistpush(nwords, "", 0);
1325         wsiz = 0;
1326       }
1327       if(wsiz <= OD_MAXWORDLEN){
1328         aword[wsiz++] = *text;
1329       }
1330       lev = OD_EVWORD;
1331       break;
1332     case OD_EVGLUE:
1333       if(wsiz > 0 && lev == OD_EVDELIM){
1334         cblistpush(awords, aword, wsiz);
1335         if(nwords) cblistpush(nwords, "", 0);
1336         wsiz = 0;
1337       }
1338       if(wsiz <= OD_MAXWORDLEN){
1339         aword[wsiz++] = *text;
1340       }
1341       lev = OD_EVGLUE;
1342       break;
1343     case OD_EVDELIM:
1344       if(wsiz > 0 && lev != OD_EVDELIM){
1345         cblistpush(awords, aword, wsiz);
1346         if(nwords){
1347           wp = aword;
1348           aword[wsiz] = '\0';
1349           while(*wp != '\0'){
1350             if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A';
1351             wp++;
1352           }
1353           wp--;
1354           while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){
1355             wsiz--;
1356             wp--;
1357           }
1358           cblistpush(nwords, aword, wsiz);
1359         }
1360         wsiz = 0;
1361       }
1362       if(wsiz <= OD_MAXWORDLEN){
1363         aword[wsiz++] = *text;
1364       }
1365       lev = OD_EVDELIM;
1366       break;
1367     default:
1368       if(wsiz > 0){
1369         cblistpush(awords, aword, wsiz);
1370         if(nwords){
1371           if(lev == OD_EVDELIM){
1372             cblistpush(nwords, "", 0);
1373           } else {
1374             wp = aword;
1375             aword[wsiz] = '\0';
1376             while(*wp != '\0'){
1377               if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A';
1378               wp++;
1379             }
1380             wp--;
1381             while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){
1382               wsiz--;
1383               wp--;
1384             }
1385             cblistpush(nwords, aword, wsiz);
1386           }
1387         }
1388         wsiz = 0;
1389       }
1390       lev = OD_EVSPACE;
1391       break;
1392     }
1393   }
1394   if(wsiz > 0){
1395     cblistpush(awords, aword, wsiz);
1396     if(nwords){
1397       if(lev == OD_EVDELIM){
1398         cblistpush(nwords, "", 0);
1399       } else {
1400         wp = aword;
1401         aword[wsiz] = '\0';
1402         while(*wp != '\0'){
1403           if(*wp >= 'A' && *wp <= 'Z') *wp += 'a' - 'A';
1404           wp++;
1405         }
1406         wp--;
1407         while(wp >= aword && odeum->statechars[*(unsigned char *)wp] == OD_EVGLUE){
1408           wsiz--;
1409           wp--;
1410         }
1411         cblistpush(nwords, aword, wsiz);
1412       }
1413     }
1414     wsiz = 0;
1415   }
1416 }
1417 
1418 
1419 /* Set the classes of characters used by `odanalyzetext'. */
odsetcharclass(ODEUM * odeum,const char * spacechars,const char * delimchars,const char * gluechars)1420 void odsetcharclass(ODEUM *odeum, const char *spacechars, const char *delimchars,
1421                     const char *gluechars){
1422   assert(odeum && spacechars && delimchars && gluechars);
1423   memset(odeum->statechars, OD_EVWORD, sizeof(odeum->statechars));
1424   for(; *spacechars != '\0'; spacechars++){
1425     odeum->statechars[*(unsigned char *)spacechars] = OD_EVSPACE;
1426   }
1427   for(; *delimchars != '\0'; delimchars++){
1428     odeum->statechars[*(unsigned char *)delimchars] = OD_EVDELIM;
1429   }
1430   for(; *gluechars != '\0'; gluechars++){
1431     odeum->statechars[*(unsigned char *)gluechars] = OD_EVGLUE;
1432   }
1433 }
1434 
1435 
1436 /* Query a database using a small boolean query language. */
odquery(ODEUM * odeum,const char * query,int * np,CBLIST * errors)1437 ODPAIR *odquery(ODEUM *odeum, const char *query, int *np, CBLIST *errors){
1438   CBLIST *tokens = cblistopen();
1439   CBLIST *nwords = cblistopen();
1440   ODPAIR *results = NULL;
1441   assert(odeum && query && np);
1442   odanalyzetext(odeum, query, tokens, nwords);
1443   odcleannormalized(odeum, nwords);
1444   odfixtokens(odeum, tokens);
1445   results = odparseexpr(odeum, tokens, nwords, np, errors);
1446   cblistclose(tokens);
1447   cblistclose(nwords);
1448   return results;
1449 }
1450 
1451 
1452 
1453 /*************************************************************************************************
1454  * features for experts
1455  *************************************************************************************************/
1456 
1457 
1458 /* Get the internal database handle for documents. */
odidbdocs(ODEUM * odeum)1459 CURIA *odidbdocs(ODEUM *odeum){
1460   assert(odeum);
1461   return odeum->docsdb;
1462 }
1463 
1464 
1465 /* Get the internal database handle for the inverted index. */
odidbindex(ODEUM * odeum)1466 CURIA *odidbindex(ODEUM *odeum){
1467   assert(odeum);
1468   return odeum->indexdb;
1469 }
1470 
1471 
1472 /* Get the internal database handle for the reverse dictionary. */
odidbrdocs(ODEUM * odeum)1473 VILLA *odidbrdocs(ODEUM *odeum){
1474   assert(odeum);
1475   return odeum->rdocsdb;
1476 }
1477 
1478 
1479 /* Set the call back function called in merging. */
odsetotcb(void (* otcb)(const char *,ODEUM *,const char *))1480 void odsetotcb(void (*otcb)(const char *, ODEUM *, const char *)){
1481   odotcb = otcb;
1482 }
1483 
1484 
1485 /* Get the positive one of square roots of a number. */
odsquareroot(double x)1486 double odsquareroot(double x){
1487   double c, rv;
1488   if(x <= 0.0) return 0.0;
1489   c = x > 1.0 ? x : 1;
1490   do {
1491     rv = c;
1492     c = (x / c + c) * 0.5;
1493   } while(c < rv);
1494   return rv;
1495 }
1496 
1497 
1498 /* Get the absolute of a vector. */
odvecabsolute(const int * vec,int vnum)1499 double odvecabsolute(const int *vec, int vnum){
1500   double rv;
1501   int i;
1502   assert(vec && vnum >= 0);
1503   rv = 0;
1504   for(i = 0; i < vnum; i++){
1505     rv += (double)vec[i] * (double)vec[i];
1506   }
1507   return odsquareroot(rv);
1508 }
1509 
1510 
1511 /* Get the inner product of two vectors. */
odvecinnerproduct(const int * avec,const int * bvec,int vnum)1512 double odvecinnerproduct(const int *avec, const int *bvec, int vnum){
1513   double rv;
1514   int i;
1515   assert(avec && bvec && vnum >= 0);
1516   rv = 0;
1517   for(i = 0; i < vnum; i++){
1518     rv += (double)avec[i] * (double)bvec[i];
1519   }
1520   return rv;
1521 }
1522 
1523 
1524 
1525 /*************************************************************************************************
1526  * private objects
1527  *************************************************************************************************/
1528 
1529 
1530 /* Get a database handle.
1531    `name' specifies the name of a database directory.
1532    `omode' specifies the connection mode.
1533    `docsbnum` specifies the number of buckets of the document database.
1534    `indexbnum` specifies the number of buckets of the index database.
1535    `fname' specifies the name of caller function.
1536    The return value is the database handle or `NULL' if it is not successful. */
odopendb(const char * name,int omode,int docsbnum,int indexbnum,const char * fname)1537 static ODEUM *odopendb(const char *name, int omode, int docsbnum, int indexbnum,
1538                        const char *fname){
1539   int cromode, vlomode, inode, dmax, dnum;
1540   char docsname[OD_PATHBUFSIZ], indexname[OD_PATHBUFSIZ], rdocsname[OD_PATHBUFSIZ], *tmp;
1541   struct stat sbuf;
1542   CURIA *docsdb, *indexdb;
1543   VILLA *rdocsdb;
1544   CBMAP *cachemap;
1545   CBMAP *sortmap;
1546   ODEUM *odeum;
1547   assert(name);
1548   if(strlen(name) > OD_NAMEMAX){
1549     dpecodeset(DP_EMISC, __FILE__, __LINE__);
1550     return NULL;
1551   }
1552   cromode = CR_OREADER;
1553   vlomode = VL_OREADER;
1554   if(omode & OD_OWRITER){
1555     cromode = CR_OWRITER;
1556     vlomode = VL_OWRITER | VL_OZCOMP | VL_OYCOMP;
1557     if(omode & OD_OCREAT){
1558       cromode |= CR_OCREAT;
1559       vlomode |= VL_OCREAT;
1560     }
1561     if(omode & OD_OTRUNC){
1562       cromode |= CR_OTRUNC;
1563       vlomode |= VL_OTRUNC;
1564     }
1565   }
1566   if(omode & OD_ONOLCK){
1567     cromode |= CR_ONOLCK;
1568     vlomode |= VL_ONOLCK;
1569   }
1570   if(omode & OD_OLCKNB){
1571     cromode |= CR_OLCKNB;
1572     vlomode |= VL_OLCKNB;
1573   }
1574   sprintf(docsname, "%s%c%s", name, MYPATHCHR, OD_DOCSNAME);
1575   sprintf(indexname, "%s%c%s", name, MYPATHCHR, OD_INDEXNAME);
1576   sprintf(rdocsname, "%s%c%s", name, MYPATHCHR, OD_RDOCSNAME);
1577   docsdb = NULL;
1578   indexdb = NULL;
1579   rdocsdb = NULL;
1580   if((omode & OD_OWRITER) && (omode & OD_OCREAT)){
1581     if(mkdir(name, OD_DIRMODE) == -1 && errno != EEXIST){
1582       dpecodeset(DP_EMKDIR, __FILE__, __LINE__);
1583       return NULL;
1584     }
1585   }
1586   if(lstat(name, &sbuf) == -1){
1587     dpecodeset(DP_ESTAT, __FILE__, __LINE__);
1588     return NULL;
1589   }
1590   inode = sbuf.st_ino;
1591   if(!(docsdb = cropen(docsname, cromode, docsbnum, OD_DOCSDNUM))) return NULL;
1592   if(!(indexdb = cropen(indexname, cromode, indexbnum, odindexdnum))){
1593     crclose(docsdb);
1594     return NULL;
1595   }
1596   if(omode & OD_OWRITER){
1597     if(!crsetalign(docsdb, OD_DOCSALIGN) || !crsetfbpsiz(docsdb, OD_DOCSFBP) ||
1598        !crsetalign(indexdb, OD_INDEXALIGN) || !crsetfbpsiz(indexdb, OD_INDEXFBP)){
1599       crclose(indexdb);
1600       crclose(docsdb);
1601       return NULL;
1602     }
1603   }
1604   if(!(rdocsdb = vlopen(rdocsname, vlomode, VL_CMPLEX))){
1605     crclose(indexdb);
1606     crclose(docsdb);
1607     return NULL;
1608   }
1609   vlsettuning(rdocsdb, OD_RDOCSLRM, OD_RDOCSNIM, OD_RDOCSLCN, OD_RDOCSNCN);
1610   if(omode & OD_OWRITER){
1611     cachemap = cbmapopenex(odcachebnum);
1612     sortmap = cbmapopenex(odcachebnum);
1613   } else {
1614     cachemap = NULL;
1615     sortmap = NULL;
1616   }
1617   if(vlrnum(rdocsdb) > 0){
1618     dmax = -1;
1619     dnum = -1;
1620     if((tmp = vlget(rdocsdb, OD_DMAXEXPR, sizeof(OD_DMAXEXPR), NULL)) != NULL){
1621       dmax = atoi(tmp);
1622       free(tmp);
1623     }
1624     if((tmp = vlget(rdocsdb, OD_DNUMEXPR, sizeof(OD_DNUMEXPR), NULL)) != NULL){
1625       dnum = atoi(tmp);
1626       free(tmp);
1627     }
1628     if(dmax < 0 || dnum < 0){
1629       if(sortmap) cbmapclose(sortmap);
1630       if(cachemap) cbmapclose(cachemap);
1631       vlclose(rdocsdb);
1632       crclose(indexdb);
1633       crclose(docsdb);
1634       dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
1635       return NULL;
1636     }
1637   } else {
1638     dmax = 0;
1639     dnum = 0;
1640   }
1641   odeum = cbmalloc(sizeof(ODEUM));
1642   odeum->name = cbmemdup(name, -1);
1643   odeum->wmode = omode & OD_OWRITER;
1644   odeum->fatal = FALSE;
1645   odeum->inode = inode;
1646   odeum->docsdb = docsdb;
1647   odeum->indexdb = indexdb;
1648   odeum->rdocsdb = rdocsdb;
1649   odeum->cachemap = cachemap;
1650   odeum->cacheasiz = 0;
1651   odeum->sortmap = sortmap;
1652   odeum->dmax = dmax;
1653   odeum->dnum = dnum;
1654   odeum->ldid = -1;
1655   odsetcharclass(odeum, OD_SPACECHARS, OD_DELIMCHARS, OD_GLUECHARS);
1656   if(odotcb) odotcb(fname, odeum, "the connection was established");
1657   return odeum;
1658 }
1659 
1660 
1661 /* Flush the cache for dirty buffer of words.
1662    `odeum' specifies a database handle.
1663    `fname' specifies the name of caller function.
1664    If successful, the return value is true, else, it is false. */
odcacheflush(ODEUM * odeum,const char * fname)1665 static int odcacheflush(ODEUM *odeum, const char *fname){
1666   const char *kbuf, *vbuf;
1667   char otmsg[OD_OTCBBUFSIZ];
1668   int i, rnum, ksiz, vsiz;
1669   assert(odeum);
1670   if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE;
1671   if(odotcb) odotcb(fname, odeum, "flushing caches");
1672   cbmapiterinit(odeum->cachemap);
1673   for(i = 0; (kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL; i++){
1674     vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz);
1675     if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){
1676       odeum->fatal = TRUE;
1677       return FALSE;
1678     }
1679     if(odotcb && (i + 1) % OD_OTPERWORDS == 0){
1680       sprintf(otmsg, "... (%d/%d)", i + 1, rnum);
1681       odotcb(fname, odeum, otmsg);
1682     }
1683   }
1684   cbmapclose(odeum->cachemap);
1685   odeum->cachemap = cbmapopenex(odcachebnum);
1686   odeum->cacheasiz = 0;
1687   return TRUE;
1688 }
1689 
1690 
1691 /* Flush all frequent words in the cache for dirty buffer of words.
1692    `odeum' specifies a database handle.
1693    `fname' specifies the name of caller function.
1694    `min' specifies the minimum size of frequent words.
1695    If successful, the return value is true, else, it is false. */
odcacheflushfreq(ODEUM * odeum,const char * fname,int min)1696 static int odcacheflushfreq(ODEUM *odeum, const char *fname, int min){
1697   const char *kbuf, *vbuf;
1698   char otmsg[OD_OTCBBUFSIZ];
1699   int rnum, ksiz, vsiz;
1700   assert(odeum);
1701   if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE;
1702   if(odotcb){
1703     sprintf(otmsg, "flushing frequent words: min=%d asiz=%d rnum=%d)",
1704             min, odeum->cacheasiz, rnum);
1705     odotcb(fname, odeum, otmsg);
1706   }
1707   cbmapiterinit(odeum->cachemap);
1708   while((kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL){
1709     vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz);
1710     if(vsiz >= sizeof(ODPAIR) * min){
1711       if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){
1712         odeum->fatal = TRUE;
1713         return FALSE;
1714       }
1715       cbmapout(odeum->cachemap, kbuf, ksiz);
1716       odeum->cacheasiz -= vsiz;
1717     }
1718   }
1719   if(odotcb){
1720     sprintf(otmsg, "... (done): min=%d asiz=%d rnum=%d)",
1721             min, odeum->cacheasiz, cbmaprnum(odeum->cachemap));
1722     odotcb(fname, odeum, otmsg);
1723   }
1724   return TRUE;
1725 }
1726 
1727 
1728 /* Flush the half of rare words in the cache for dirty buffer of words.
1729    `odeum' specifies a database handle.
1730    `fname' specifies the name of caller function.
1731    `ratio' specifies the ratio of rare words.
1732    If successful, the return value is true, else, it is false. */
odcacheflushrare(ODEUM * odeum,const char * fname,double ratio)1733 static int odcacheflushrare(ODEUM *odeum, const char *fname, double ratio){
1734   const char *kbuf, *vbuf;
1735   char otmsg[OD_OTCBBUFSIZ];
1736   int i, rnum, limit, ksiz, vsiz;
1737   assert(odeum);
1738   if((rnum = cbmaprnum(odeum->cachemap)) < 1) return TRUE;
1739   if(odotcb){
1740     sprintf(otmsg, "flushing rare words: ratio=%.2f asiz=%d rnum=%d)",
1741             ratio, odeum->cacheasiz, rnum);
1742     odotcb(fname, odeum, otmsg);
1743   }
1744   cbmapiterinit(odeum->cachemap);
1745   limit = (int)(rnum * ratio);
1746   for(i = 0; i < limit && (kbuf = cbmapiternext(odeum->cachemap, &ksiz)) != NULL; i++){
1747     vbuf = cbmapget(odeum->cachemap, kbuf, ksiz, &vsiz);
1748     if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, vsiz, CR_DCAT)){
1749       odeum->fatal = TRUE;
1750       return FALSE;
1751     }
1752     cbmapout(odeum->cachemap, kbuf, ksiz);
1753     odeum->cacheasiz -= vsiz;
1754   }
1755   if(odotcb){
1756     sprintf(otmsg, "... (done): ratio=%.2f asiz=%d rnum=%d)",
1757             ratio, odeum->cacheasiz, cbmaprnum(odeum->cachemap));
1758     odotcb(fname, odeum, otmsg);
1759   }
1760   return TRUE;
1761 }
1762 
1763 
1764 /* Sort the records of inverted index.
1765    `odeum' specifies a database handle.
1766    `fname' specifies the name of caller function.
1767    If successful, the return value is true, else, it is false. */
odsortindex(ODEUM * odeum,const char * fname)1768 static int odsortindex(ODEUM *odeum, const char *fname){
1769   const char *word;
1770   char *tmp, otmsg[OD_OTCBBUFSIZ];
1771   int i, rnum, wsiz, tsiz;
1772   ODPAIR *pairs;
1773   assert(odeum);
1774   if((rnum = cbmaprnum(odeum->sortmap)) < 1) return TRUE;
1775   if(odotcb) odotcb(fname, odeum, "sorting the inverted index");
1776   cbmapiterinit(odeum->sortmap);
1777   for(i = 0; (word = cbmapiternext(odeum->sortmap, &wsiz)) != NULL; i++){
1778     if((tmp = crget(odeum->indexdb, word, wsiz, 0, -1, &tsiz)) != NULL){
1779       if(tsiz > sizeof(ODPAIR)){
1780         pairs = (ODPAIR *)tmp;
1781         qsort(pairs, tsiz / sizeof(ODPAIR), sizeof(ODPAIR), odsortcompare);
1782         if(!crput(odeum->indexdb, word, wsiz, tmp, tsiz, CR_DOVER)){
1783           free(tmp);
1784           return FALSE;
1785         }
1786       }
1787       free(tmp);
1788     } else if(dpecode != DP_ENOITEM){
1789       return FALSE;
1790     }
1791     if(odotcb && (i + 1) % OD_OTPERWORDS == 0){
1792       sprintf(otmsg, "... (%d/%d)", i + 1, rnum);
1793       odotcb(fname, odeum, otmsg);
1794     }
1795   }
1796   cbmapclose(odeum->sortmap);
1797   odeum->sortmap = cbmapopenex(odcachebnum);
1798   return TRUE;
1799 }
1800 
1801 
1802 /* Compare two pairs of structures of a search result.
1803    `a' specifies the pointer to the region of one pair.
1804    `b' specifies the pointer to the region of the other pair.
1805    The return value is positive if the former is big, negative if the latter is big, 0 if both
1806    are equivalent. */
odsortcompare(const void * a,const void * b)1807 static int odsortcompare(const void *a, const void *b){
1808   ODPAIR *ap, *bp;
1809   int rv;
1810   assert(a && b);
1811   ap = (ODPAIR *)a;
1812   bp = (ODPAIR *)b;
1813   rv = bp->score - ap->score;
1814   if(rv != 0) return rv;
1815   return ap->id - bp->id;
1816 }
1817 
1818 
1819 /* Purge the elements of the deleted documents from the inverted index.
1820    `odeum' specifies a database handle.
1821    `fname' specifies the name of caller function.
1822    If successful, the return value is true, else, it is false. */
odpurgeindex(ODEUM * odeum,const char * fname)1823 static int odpurgeindex(ODEUM *odeum, const char *fname){
1824   ODPAIR *pairs;
1825   char *kbuf, *vbuf, otmsg[OD_OTCBBUFSIZ];
1826   int i, rnum, tnum, ksiz, vsiz, pnum, wi;
1827   assert(odeum);
1828   if((rnum = crrnum(odeum->indexdb)) < 1) return TRUE;
1829   if(odotcb) odotcb(fname, odeum, "purging dispensable regions");
1830   if(!criterinit(odeum->indexdb)) return FALSE;
1831   tnum = 0;
1832   while(TRUE){
1833     if(!(kbuf = criternext(odeum->indexdb, &ksiz))){
1834       if(dpecode != DP_ENOITEM) return FALSE;
1835       break;
1836     }
1837     if(!(vbuf = crget(odeum->indexdb, kbuf, ksiz, 0, -1, &vsiz))){
1838       dpecodeset(DP_EBROKEN, __FILE__, __LINE__);
1839       free(kbuf);
1840       return FALSE;
1841     }
1842     pairs = (ODPAIR *)vbuf;
1843     pnum = vsiz / sizeof(ODPAIR);
1844     wi = 0;
1845     for(i = 0; i < pnum; i++){
1846       if(crvsiz(odeum->docsdb, (char *)&(pairs[i].id), sizeof(int)) != -1){
1847         pairs[wi++] = pairs[i];
1848       }
1849     }
1850     if(wi > 0){
1851       if(!crput(odeum->indexdb, kbuf, ksiz, vbuf, wi * sizeof(ODPAIR), CR_DOVER)){
1852         free(vbuf);
1853         free(kbuf);
1854         return FALSE;
1855       }
1856     } else {
1857       if(!crout(odeum->indexdb, kbuf, ksiz)){
1858         free(vbuf);
1859         free(kbuf);
1860         return FALSE;
1861       }
1862     }
1863     free(vbuf);
1864     free(kbuf);
1865     if(odotcb && (tnum + 1) % OD_OTPERWORDS == 0){
1866       sprintf(otmsg, "... (%d/%d)", tnum + 1, rnum);
1867       odotcb(fname, odeum, otmsg);
1868     }
1869     tnum++;
1870   }
1871   return TRUE;
1872 }
1873 
1874 
1875 /* Create a map of a document array.
1876    `pairs' specifies the pointer to a document array.
1877    `num' specifies the number of elements of the array.
1878    The return value is a map of the document array. */
odpairsmap(const ODPAIR * pairs,int num)1879 static CBMAP *odpairsmap(const ODPAIR *pairs, int num){
1880   CBMAP *map;
1881   int i;
1882   assert(pairs && num >= 0);
1883   map = cbmapopen();
1884   for(i = 0; i < num; i++){
1885     cbmapput(map, (char *)&(pairs[i].id), sizeof(int),
1886              (char *)&(pairs[i].score), sizeof(int), TRUE);
1887   }
1888   return map;
1889 }
1890 
1891 
1892 /* compare two pairs of structures of words in a document.
1893    `a' specifies the pointer to the region of one word.
1894    `b' specifies the pointer to the region of the other word.
1895    The return value is positive if the former is big, negative if the latter is big, 0 if both
1896    are equivalent. */
odwordcompare(const void * a,const void * b)1897 static int odwordcompare(const void *a, const void *b){
1898   ODWORD *ap, *bp;
1899   int rv;
1900   assert(a && b);
1901   ap = (ODWORD *)a;
1902   bp = (ODWORD *)b;
1903   if((rv = bp->num - ap->num) != 0) return rv;
1904   if((rv = strlen(bp->word) - strlen(ap->word)) != 0) return rv;
1905   return strcmp(ap->word, bp->word);
1906 }
1907 
1908 
1909 /* Match an operator without taking it off the token list.
1910    `odeum' specifies a database handle.
1911    `tokens' specifies a list handle of tokens.
1912    The return value is whether the next token is an operator. */
odmatchoperator(ODEUM * odeum,CBLIST * tokens)1913 static int odmatchoperator(ODEUM *odeum, CBLIST *tokens){
1914   const char *tk = NULL;
1915   int tk_len = 0;
1916   tk = cblistval(tokens, 0, &tk_len);
1917   if(tk && (tk[0] == '&' || tk[0] == '|' || tk[0] == '!')) return 1;
1918   return 0;
1919 }
1920 
1921 
1922 /* Implements the subexpr part of the grammar.
1923    `odeum' specifies a database handle.
1924    `tokens' specifies a list handle of tokens.
1925    `nwords' specifies a list handle of normalized words.
1926    `np' specifies the pointer to a variable to which the number of the elements of the return
1927    value is assigned.
1928    `errors' specifies a list handle into which error messages are stored.
1929    The return value is the pointer to an array of document IDs. */
odparsesubexpr(ODEUM * odeum,CBLIST * tokens,CBLIST * nwords,int * np,CBLIST * errors)1930 static ODPAIR *odparsesubexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np,
1931                               CBLIST *errors){
1932   char *tk = NULL;
1933   int tk_len = 0;
1934   char *nword = NULL;  /* used to do the actual search, should match with tokens */
1935   ODPAIR *result = NULL;
1936   int result_num = 0;
1937   int i;
1938   double ival;
1939   if((tk = cblistshift(tokens, &tk_len)) != NULL){
1940     assert(tk != NULL);
1941     if(tk[0] == '('){
1942       free(tk);
1943       /* recurse into expr */
1944       result = odparseexpr(odeum, tokens, nwords, &result_num, errors);
1945       /* match right token RPAREN */
1946       tk = cblistshift(tokens, &tk_len);
1947       /* print an error if either we didn't get anything or we didn't get a ) */
1948       if(tk == NULL){
1949         if(errors) cblistpush(errors, "Expression ended without closing ')'", -1);
1950       } else if(tk[0] != ')'){
1951         if(errors) cblistpush(errors, "Un-balanced parenthesis.", -1);
1952       }
1953     } else if(odeum->statechars[*(unsigned char *)tk] == 0){
1954       /* Perform odsearch with the next norm word that isn't an operator. */
1955       nword = cblistshift(nwords, NULL);
1956       assert(nword != NULL);
1957       if((result = odsearch(odeum, nword, -1, &result_num)) != NULL){
1958         /* TF-IDF tuning */
1959         ival = odlogarithm(result_num);
1960         ival = (ival * ival) / 4.0;
1961         if(ival < 4.0) ival = 4.0;
1962         for(i = 0; i < result_num; i++){
1963           result[i].score = (int)(result[i].score / ival);
1964         }
1965       }
1966       free(nword);
1967     } else {
1968       if(errors) cblistpush(errors, "Invalid sub-expression.  Expected '(' or WORD.", -1);
1969       result = cbmalloc(1);
1970       result_num = 0;
1971     }
1972     /* done with the token */
1973     free(tk);
1974   }
1975   *np = result_num;
1976   return result;
1977 }
1978 
1979 
1980 /* Implements the actual recursive decent parser for the mini query language.
1981    `odeum' specifies a database handle.
1982    `tokens' specifies a list handle of tokens.
1983    `nwords' specifies a list handle of normalized words.
1984    `np' specifies the pointer to a variable to which the number of the elements of the return
1985    value is assigned.
1986    `errors' specifies a list handle into which error messages are stored.
1987    The return value is the pointer to an array of document IDs.
1988    It simply parses an initial subexpr, and then loops over as many (operator subexpr)
1989    sequences as it can find.  The odmatchoperator function handles injecting a default &
1990    between consecutive words. */
odparseexpr(ODEUM * odeum,CBLIST * tokens,CBLIST * nwords,int * np,CBLIST * errors)1991 static ODPAIR *odparseexpr(ODEUM *odeum, CBLIST *tokens, CBLIST *nwords, int *np,
1992                            CBLIST *errors){
1993   ODPAIR *left = NULL;
1994   ODPAIR *right = NULL;
1995   ODPAIR *temp = NULL;
1996   int left_num = 0;
1997   int right_num = 0;
1998   int temp_num = 0;
1999   char *op = NULL;
2000   int op_len = 0;
2001   if(!(left = odparsesubexpr(odeum, tokens, nwords, &left_num, errors))) return NULL;
2002   /* expr ::= subexpr ( op subexpr )* */
2003   while(odmatchoperator(odeum, tokens)){
2004     op = cblistshift(tokens, &op_len);
2005     if(!(right = odparsesubexpr(odeum, tokens, nwords, &right_num, errors))){
2006       free(op);
2007       free(left);
2008       return NULL;
2009     }
2010     switch(op[0]){
2011     case '&':
2012       temp = odpairsand(left, left_num, right, right_num, &temp_num);
2013       break;
2014     case '|':
2015       temp = odpairsor(left, left_num, right, right_num, &temp_num);
2016       break;
2017     case '!':
2018       temp = odpairsnotand(left, left_num, right, right_num, &temp_num);
2019       break;
2020     default:
2021       if(errors) cblistpush(errors, "Invalid operator.  Expected '&', '|', or '!'.", -1);
2022       break;
2023     }
2024     if(temp){
2025       /* an operator was done so we must swap it with the left */
2026       free(left); left = NULL;
2027       left = temp;
2028       left_num = temp_num;
2029     }
2030     free(op);
2031     if(right) free(right);
2032   }
2033   *np = left_num;
2034   return left;
2035 }
2036 
2037 
2038 /* Processes the tokens in order to break them up further.
2039    `odeum' specifies a database handle.
2040    `tokens' specifies a list handle of tokens. */
odfixtokens(ODEUM * odeum,CBLIST * tokens)2041 static void odfixtokens(ODEUM *odeum, CBLIST *tokens){
2042   const char *tk = NULL;
2043   int tk_len = 0;
2044   int i = 0;
2045   int lastword = 0;
2046   for(i = 0; i < cblistnum(tokens); i++){
2047     tk = cblistval(tokens, i, &tk_len);
2048     assert(tk);
2049     if(tk[0] == '&' || tk[0] == '|' || tk[0] == '!' || tk[0] == '(' || tk[0] == ')'){
2050       lastword = 0;
2051       if(tk_len > 1){
2052         /* need to break it up for the next loop around */
2053         tk = cblistremove(tokens, i, &tk_len);
2054         cblistinsert(tokens, i, tk, 1);
2055         cblistinsert(tokens, i+1, tk+1, tk_len-1);
2056         free((char *)tk);
2057       }
2058     } else if(odeum->statechars[*(unsigned char *)tk] == 0){
2059       /* if the last one was a word and this is a word then we need a default & between them */
2060       if(lastword){
2061         cblistinsert(tokens, i, "&", 1);
2062         i++;
2063       }
2064       lastword = 1;
2065     }
2066   }
2067 }
2068 
2069 
2070 /* Cleans out the parts of the normalized word list that are not considered words.
2071    `odeum' specifies a database handle.
2072    `tokens' specifies a list handle of tokens. */
odcleannormalized(ODEUM * odeum,CBLIST * nwords)2073 static void odcleannormalized(ODEUM *odeum, CBLIST *nwords){
2074   char *tk = NULL;
2075   int tk_len = 0;
2076   int i = 0;
2077   for(i = 0; i < cblistnum(nwords); i++){
2078     tk = (char *)cblistval(nwords, i, &tk_len);
2079     if(tk_len == 0 || (!odeum->statechars[*(unsigned char *)tk] == 0)){
2080       /* not a word so delete it */
2081       tk = cblistremove(nwords, i, &tk_len);
2082       free(tk);
2083       i--;
2084     }
2085   }
2086 }
2087 
2088 
2089 
2090 /* END OF FILE */
2091