1 /*************************************************************************************************
2  * Utility for indexing document files into a database of Odeum
3  *                                                      Copyright (C) 2000-2007 Mikio Hirabayashi
4  * This file is part of QDBM, Quick Database Manager.
5  * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU
6  * Lesser General Public License as published by the Free Software Foundation; either version
7  * 2.1 of the License or any later version.  QDBM is distributed in the hope that it will be
8  * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
9  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
10  * details.
11  * You should have received a copy of the GNU Lesser General Public License along with QDBM; if
12  * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
13  * 02111-1307 USA.
14  *************************************************************************************************/
15 
16 
17 #include <depot.h>
18 #include <cabin.h>
19 #include <odeum.h>
20 #include <stdlib.h>
21 #include <stdio.h>
22 #include <string.h>
23 #include <stdarg.h>
24 #include <time.h>
25 #include <signal.h>
26 
27 #undef TRUE
28 #define TRUE           1                 /* boolean true */
29 #undef FALSE
30 #define FALSE          0                 /* boolean false */
31 
32 #define PATHCHR        '/'               /* delimiter character of path */
33 #define EXTCHR         '.'               /* delimiter character of extension */
34 #define CDIRSTR        "."               /* string of current directory */
35 #define PDIRSTR        ".."              /* string of parent directory */
36 #define MTDBNAME       "_mtime"          /* name of the database for last modified times */
37 #define MTDBLRM        81                /* records in a leaf node of time database */
38 #define MTDBNIM        192               /* records in a non-leaf node of time database */
39 #define MTDBLCN        64                /* number of leaf cache of time database */
40 #define MTDBNCN        32                /* number of non-leaf cache of time database */
41 #define SCDBNAME       "_score"          /* name of the database for scores */
42 #define SCDBBNUM       32749             /* bucket number of the score database */
43 #define SCDBALIGN      -3                /* alignment of the score database */
44 #define PATHBUFSIZ     2048              /* size of a path buffer */
45 #define MAXLOAD        0.85              /* max ratio of bucket loading */
46 #define KEYNUM         32                /* number of keywords to store */
47 
48 
49 /* for Win32 and RISC OS */
50 #if defined(_WIN32)
51 #undef PATHCHR
52 #define PATHCHR        '\\'
53 #undef EXTCHR
54 #define EXTCHR         '.'
55 #undef CDIRSTR
56 #define CDIRSTR        "."
57 #undef PDIRSTR
58 #define PDIRSTR        ".."
59 #elif defined(__riscos__) || defined(__riscos)
60 #include <unixlib/local.h>
61 int __riscosify_control = __RISCOSIFY_NO_PROCESS;
62 #undef PATHCHR
63 #define PATHCHR        '.'
64 #undef EXTCHR
65 #define EXTCHR         '/'
66 #undef CDIRSTR
67 #define CDIRSTR        "@"
68 #undef PDIRSTR
69 #define PDIRSTR        "^"
70 #endif
71 
72 
73 /* global variables */
const char *progname;                    /* program name (argv[0]), used as message prefix */
/* Flag raised by sigtermhandler() and polled by the processing loops.  Because it is
   written from inside a signal handler it is declared `volatile sig_atomic_t', which is
   the only object type the C standard guarantees to be safely assignable there. */
volatile sig_atomic_t sigterm;           /* flag for termination signal */
76 
77 
78 /* function prototypes */
79 int main(int argc, char **argv);
80 void setsignals(void);
81 void sigtermhandler(int num);
82 void usage(void);
83 int runregister(int argc, char **argv);
84 int runrelate(int argc, char **argv);
85 int runpurge(int argc, char **argv);
86 int bwimatchlist(const char *str, const CBLIST *keys);
87 char *fgetl(FILE *ifp);
88 void otcb(const char *fname, ODEUM *odeum, const char *msg);
89 void pdperror(const char *name);
90 void printferror(const char *format, ...);
91 void printfinfo(const char *format, ...);
92 const char *datestr(time_t t);
93 int proclist(const char *name, const char *lfile, int wmax,
94              const CBLIST *tsuflist, const CBLIST *hsuflist);
95 int procdir(const char *name, const char *dir, int wmax,
96             const CBLIST *tsuflist, const CBLIST *hsuflist);
97 int indexdir(ODEUM *odeum, VILLA *mtdb, const char *name, const char *dir, int wmax,
98              const CBLIST *tsuflist, const CBLIST *hsuflist);
99 int indexfile(ODEUM *odeum, VILLA *mtdb, const char *name, const char *file, int wmax,
100               const CBLIST *tsuflist, const CBLIST *hsuflist);
101 char *filetouri(const char *file);
102 ODDOC *makedocplain(const char *uri, const char *text, const char *date);
103 ODDOC *makedochtml(const char *uri, const char *html, const char *date);
104 CBMAP *htmlescpairs(void);
105 int procrelate(const char *name);
106 int procpurge(const char *name);
107 
108 
109 /* main routine */
main(int argc,char ** argv)110 int main(int argc, char **argv){
111   int rv;
112   cbstdiobin();
113   progname = argv[0];
114   sigterm = FALSE;
115   setsignals();
116   if(argc < 2) usage();
117   odsetotcb(otcb);
118   rv = 0;
119   if(!strcmp(argv[1], "register")){
120     rv = runregister(argc, argv);
121   } else if(!strcmp(argv[1], "relate")){
122     rv = runrelate(argc, argv);
123   } else if(!strcmp(argv[1], "purge")){
124     rv = runpurge(argc, argv);
125   } else {
126     usage();
127   }
128   return rv;
129 }
130 
131 
132 /* set signal handlers */
setsignals(void)133 void setsignals(void){
134   signal(1, sigtermhandler);
135   signal(2, sigtermhandler);
136   signal(3, sigtermhandler);
137   signal(13, sigtermhandler);
138   signal(15, sigtermhandler);
139 }
140 
141 
142 /* handler of termination signal */
sigtermhandler(int num)143 void sigtermhandler(int num){
144   signal(num, SIG_DFL);
145   sigterm = TRUE;
146   printfinfo("the termination signal %d catched", num);
147 }
148 
149 
150 /* print the usage and exit */
usage(void)151 void usage(void){
152   fprintf(stderr, "%s: indexer of document files\n", progname);
153   fprintf(stderr, "\n");
154   fprintf(stderr, "usage:\n");
155   fprintf(stderr, "  %s register [-l file] [-wmax num] [-tsuf sufs] [-hsuf sufs] name [dir]\n",
156           progname);
157   fprintf(stderr, "  %s relate name\n", progname);
158   fprintf(stderr, "  %s purge name\n", progname);
159   fprintf(stderr, "\n");
160   exit(1);
161 }
162 
163 
164 /* parse arguments of register command */
runregister(int argc,char ** argv)165 int runregister(int argc, char **argv){
166   char *name, *dir, *lfile, *tsuf, *hsuf, path[PATHBUFSIZ];
167   int i, wmax, plen, rv;
168   CBLIST *tsuflist, *hsuflist;
169   name = NULL;
170   dir = NULL;
171   lfile = NULL;
172   tsuf = NULL;
173   hsuf = NULL;
174   wmax = -1;
175   for(i = 2; i < argc; i++){
176     if(!name && argv[i][0] == '-'){
177       if(!strcmp(argv[i], "-l")){
178         if(++i >= argc) usage();
179         lfile = argv[i];
180       } else if(!strcmp(argv[i], "-wmax")){
181         if(++i >= argc) usage();
182         wmax = atoi(argv[i]);
183       } else if(!strcmp(argv[i], "-tsuf")){
184         if(++i >= argc) usage();
185         tsuf = argv[i];
186       } else if(!strcmp(argv[i], "-hsuf")){
187         if(++i >= argc) usage();
188         hsuf = argv[i];
189       } else {
190         usage();
191       }
192     } else if(!name){
193       name = argv[i];
194     } else if(!dir){
195       dir = argv[i];
196     } else {
197       usage();
198     }
199   }
200   if(!name) usage();
201   if(!dir) dir = CDIRSTR;
202   plen = sprintf(path, "%s", dir);
203   if(plen > 1 && path[plen-1] == PATHCHR) path[plen-1] = '\0';
204   tsuflist = cbsplit(tsuf ? tsuf : ".txt,.text", -1, ",");
205   hsuflist = cbsplit(hsuf ? hsuf : ".html,.htm", -1, ",");
206   if(lfile){
207     rv = proclist(name, lfile, wmax, tsuflist, hsuflist);
208   } else {
209     rv = procdir(name, path, wmax, tsuflist, hsuflist);
210   }
211   cblistclose(hsuflist);
212   cblistclose(tsuflist);
213   return rv;
214 }
215 
216 
217 /* parse arguments of relate command */
/* Parse the arguments of the `relate' sub command and run it.
   Exactly one argument, the database name, is accepted; an option or a second
   argument leads to usage(), which does not return.
   The return value is the status of procrelate(). */
int runrelate(int argc, char **argv){
  char *dbname = NULL;
  int idx;
  for(idx = 2; idx < argc; idx++){
    /* reject options and any argument after the name */
    if(dbname || argv[idx][0] == '-') usage();
    dbname = argv[idx];
  }
  if(!dbname) usage();
  return procrelate(dbname);
}
235 
236 
237 /* parse arguments of purge command */
/* Parse the arguments of the `purge' sub command and run it.
   Exactly one argument, the database name, is accepted; an option or a second
   argument leads to usage(), which does not return.
   The return value is the status of procpurge(). */
int runpurge(int argc, char **argv){
  char *dbname = NULL;
  int idx;
  for(idx = 2; idx < argc; idx++){
    /* reject options and any argument after the name */
    if(dbname || argv[idx][0] == '-') usage();
    dbname = argv[idx];
  }
  if(!dbname) usage();
  return procpurge(dbname);
}
255 
256 
257 /* case insensitive backward matching with a list */
bwimatchlist(const char * str,const CBLIST * keys)258 int bwimatchlist(const char *str, const CBLIST *keys){
259   int i;
260   for(i = 0; i < cblistnum(keys); i++){
261     if(cbstrbwimatch(str, cblistval(keys, i, NULL))) return TRUE;
262   }
263   return FALSE;
264 }
265 
266 
267 /* read a line */
/* Read one line from a stream.
   `ifp' is the input stream.
   The return value is a newly allocated string holding the line without its line
   feed, or NULL if the stream was already at its end.  A NUL byte in the input ends
   the line just like a line feed does.  The caller releases the string with free().
   The buffer starts at 256 bytes and doubles when full; it is reallocated only when
   it is missing or exhausted, not for every character read. */
char *fgetl(FILE *ifp){
  char *buf;
  int c, len, blen;
  buf = NULL;
  len = 0;
  blen = 0;
  while((c = fgetc(ifp)) != EOF){
    if(!buf || len >= blen){
      blen = buf ? blen * 2 : 256;
      /* one extra byte is always reserved for the terminator */
      buf = cbrealloc(buf, blen + 1);
    }
    if(c == '\n' || c == '\0') break;
    buf[len++] = c;
  }
  if(!buf) return NULL;
  buf[len] = '\0';
  return buf;
}
285 
286 
/* callback registered with odsetotcb(): print a progress message of Odeum */
/* Message callback handed to odsetotcb(): print one line made of the program name,
   `fname', the name of the database and `msg' to the standard output.
   `odeum' is the database handle the message is about. */
void otcb(const char *fname, ODEUM *odeum, const char *msg){
  char *name;
  name = odname(odeum);
  /* odname() may fail and yield NULL, which must not reach a `%s' conversion */
  printf("%s: %s: %s: %s\n", progname, fname, name ? name : "(unknown)", msg);
  free(name);
}
294 
295 
296 /* print an error message */
pdperror(const char * name)297 void pdperror(const char *name){
298   printf("%s: ERROR: %s: %s\n", progname, name, dperrmsg(dpecode));
299   fflush(stdout);
300 }
301 
302 
303 /* print formatted error string and flush the buffer */
printferror(const char * format,...)304 void printferror(const char *format, ...){
305   va_list ap;
306   va_start(ap, format);
307   printf("%s: ERROR: ", progname);
308   vprintf(format, ap);
309   putchar('\n');
310   fflush(stdout);
311   va_end(ap);
312 }
313 
314 
315 /* print formatted information string and flush the buffer */
printfinfo(const char * format,...)316 void printfinfo(const char *format, ...){
317   va_list ap;
318   va_start(ap, format);
319   printf("%s: INFO: ", progname);
320   vprintf(format, ap);
321   putchar('\n');
322   fflush(stdout);
323   va_end(ap);
324 }
325 
326 
327 /* get static string of the date */
/* Format a time stamp as "YYYY/MM/DD hh:mm:ss" in the local time zone.
   `t' is the time to format.
   The return value points to a static buffer which is overwritten by the next call,
   or to the constant "0000/00/00 00:00:00" when localtime() fails. */
const char *datestr(time_t t){
  static char buf[32];
  const struct tm *lt = localtime(&t);
  if(lt == NULL) return "0000/00/00 00:00:00";
  sprintf(buf, "%04d/%02d/%02d %02d:%02d:%02d",
          lt->tm_year + 1900, lt->tm_mon + 1, lt->tm_mday,
          lt->tm_hour, lt->tm_min, lt->tm_sec);
  return buf;
}
337 
338 
339 /* processing with finding files in a list file */
proclist(const char * name,const char * lfile,int wmax,const CBLIST * tsuflist,const CBLIST * hsuflist)340 int proclist(const char *name, const char *lfile, int wmax,
341              const CBLIST *tsuflist, const CBLIST *hsuflist){
342   ODEUM *odeum;
343   VILLA *mtdb;
344   FILE *ifp;
345   char *line, path[PATHBUFSIZ];
346   int err, fatal;
347   if(!strcmp(lfile, "-")){
348     ifp = stdin;
349   } else {
350     if(!(ifp = fopen(lfile, "rb"))){
351       printferror("%s: file cannot be opened", lfile);
352       return 1;
353     }
354   }
355   printfinfo("%s: registration started", name);
356   if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){
357     pdperror(name);
358     if(ifp != stdin) fclose(ifp);
359     return 1;
360   }
361   sprintf(path, "%s%c%s", name, PATHCHR, MTDBNAME);
362   if(!(mtdb = vlopen(path, VL_OWRITER | VL_OCREAT, VL_CMPLEX))){
363     pdperror(name);
364     odclose(odeum);
365     if(ifp != stdin) fclose(ifp);
366     return 1;
367   }
368   vlsettuning(mtdb, MTDBLRM, MTDBNIM, MTDBLCN, MTDBNCN);
369   printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
370              name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
371   err = FALSE;
372   while((line = fgetl(ifp)) != NULL){
373     if(sigterm){
374       printferror("aborting due to a termination signal");
375       free(line);
376       err = TRUE;
377       break;
378     }
379     if(!indexfile(odeum, mtdb, name, line, wmax, tsuflist, hsuflist)) err = TRUE;
380     free(line);
381   }
382   fatal = odfatalerror(odeum);
383   printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
384              name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
385   if(!vlclose(mtdb)){
386     pdperror(name);
387     err = TRUE;
388   }
389   if(!odclose(odeum)){
390     pdperror(name);
391     err = TRUE;
392   }
393   if(ifp != stdin) fclose(ifp);
394   if(err){
395     printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : "");
396   } else {
397     printfinfo("%s: registration completed successfully", name);
398   }
399   return err ? 1 : 0;
400 }
401 
402 
403 /* processing with finding files in a directory */
procdir(const char * name,const char * dir,int wmax,const CBLIST * tsuflist,const CBLIST * hsuflist)404 int procdir(const char *name, const char *dir, int wmax,
405             const CBLIST *tsuflist, const CBLIST *hsuflist){
406   ODEUM *odeum;
407   VILLA *mtdb;
408   char path[PATHBUFSIZ];
409   int err, fatal;
410   printfinfo("%s: registration started", name);
411   if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){
412     pdperror(name);
413     return 1;
414   }
415   sprintf(path, "%s%c%s", name, PATHCHR, MTDBNAME);
416   if(!(mtdb = vlopen(path, VL_OWRITER | VL_OCREAT, VL_CMPLEX))){
417     pdperror(name);
418     odclose(odeum);
419     return 1;
420   }
421   vlsettuning(mtdb, MTDBLRM, MTDBNIM, MTDBLCN, MTDBNCN);
422   printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
423              name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
424   err = FALSE;
425   if(!indexdir(odeum, mtdb, name, dir, wmax, tsuflist, hsuflist)) err = TRUE;
426   fatal = odfatalerror(odeum);
427   printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
428              name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
429   if(!vlclose(mtdb)){
430     pdperror(name);
431     err = TRUE;
432   }
433   if(!odclose(odeum)){
434     pdperror(name);
435     err = TRUE;
436   }
437   if(err){
438     printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : "");
439   } else {
440     printfinfo("%s: registration completed successfully", name);
441   }
442   return err ? 1 : 0;
443 }
444 
445 
446 /* find and index files in a directory */
/* Index every file below a directory, descending into sub directories.
   `odeum' and `mtdb' are the open document and modification time databases, `name'
   is the database name used in messages, `dir' the directory to scan; `wmax',
   `tsuflist' and `hsuflist' are passed through to indexfile().
   Entries whose full path would not fit into the path buffer are reported and
   skipped instead of overflowing it.
   The return value is TRUE if everything was processed without error, FALSE if the
   directory could not be read, an entry failed, or a termination signal arrived. */
int indexdir(ODEUM *odeum, VILLA *mtdb, const char *name, const char *dir, int wmax,
             const CBLIST *tsuflist, const CBLIST *hsuflist){
  CBLIST *files;
  const char *file;
  char path[PATHBUFSIZ];
  int i, isroot, isdir, err;
  size_t dlen;
  if(!(files = cbdirlist(dir))){
    printferror("%s: directory cannot be opened", dir);
    return FALSE;
  }
  /* the root directory already ends with the delimiter, so none is inserted */
  isroot = dir[0] == PATHCHR && dir[1] == '\0';
  dlen = strlen(dir) + (isroot ? 0 : 1);
  err = FALSE;
  for(i = 0; i < cblistnum(files); i++){
    if(sigterm){
      printferror("aborting due to a termination signal");
      cblistclose(files);
      return FALSE;
    }
    file = cblistval(files, i, NULL);
    if(!strcmp(file, CDIRSTR) || !strcmp(file, PDIRSTR)) continue;
    /* nested directories can make the path arbitrarily long */
    if(dlen + strlen(file) >= sizeof(path)){
      printferror("%s%c%s: path is too long", dir, PATHCHR, file);
      err = TRUE;
      continue;
    }
    if(isroot){
      sprintf(path, "%s%s", dir, file);
    } else {
      sprintf(path, "%s%c%s", dir, PATHCHR, file);
    }
    if(!cbfilestat(path, &isdir, NULL, NULL)){
      printferror("%s: file does not exist", file);
      err = TRUE;
      continue;
    }
    if(isdir){
      if(!indexdir(odeum, mtdb, name, path, wmax, tsuflist, hsuflist)) err = TRUE;
    } else {
      if(!indexfile(odeum, mtdb, name, path, wmax, tsuflist, hsuflist)) err = TRUE;
    }
  }
  cblistclose(files);
  return err ? FALSE : TRUE;
}
486 
487 
488 /* index a file into the database */
/* Index one file into the database.
   `odeum' and `mtdb' are the open document and modification time databases, `name'
   is the database name used in messages, `file' the path of the file, `wmax' is
   passed to odput(), `tsuflist' and `hsuflist' are the suffix lists that select the
   plain text and the HTML parser.  Files matching neither list are not registered.
   A file is skipped ("passed") when its modification time is not newer than the one
   recorded in `mtdb'.  The time is recorded as an `int'; it is converted by value,
   not by reinterpreting the bytes of the `time_t' object, so the stored record does
   not depend on the width or byte order of `time_t'.
   After the registration the database is optimized when the ratio of words to
   buckets exceeds MAXLOAD, and a status line is printed every 256 registrations.
   The return value is TRUE on success, FALSE if the file cannot be examined or read
   or if the optimization fails.  A failure of odput()/vlput() is reported but does
   not change the return value. */
int indexfile(ODEUM *odeum, VILLA *mtdb, const char *name, const char *file, int wmax,
              const CBLIST *tsuflist, const CBLIST *hsuflist){
  static int cnt = 0;
  char *vbuf, *buf, *uri;
  const char *title;
  int hot, vsiz, wnum, bnum, isplain, ishtml, mtsec, stored;
  time_t mtime;
  ODDOC *doc;
  if(!cbfilestat(file, NULL, NULL, &mtime)){
    printferror("%s: file does not exist", file);
    return FALSE;
  }
  mtsec = (int)mtime;
  hot = TRUE;
  if((vbuf = vlget(mtdb, file, -1, &vsiz)) != NULL){
    if(vsiz == (int)sizeof(int)){
      memcpy(&stored, vbuf, sizeof(stored));
      if(mtsec <= stored) hot = FALSE;
    }
    free(vbuf);
  }
  if(!hot){
    printfinfo("%s: passed", file);
    return TRUE;
  }
  doc = NULL;
  isplain = bwimatchlist(file, tsuflist);
  ishtml = !isplain && bwimatchlist(file, hsuflist);
  if(isplain || ishtml){
    /* read the file before allocating the URI, so the error path owns nothing */
    if(!(buf = cbreadfile(file, NULL))){
      printferror("%s: file cannot be opened", file);
      return FALSE;
    }
    uri = filetouri(file);
    if(isplain){
      doc = makedocplain(uri, buf, datestr(mtime));
    } else {
      doc = makedochtml(uri, buf, datestr(mtime));
    }
    free(uri);
    free(buf);
  }
  if(doc){
    /* fall back to the base name of the file when the document has no title */
    if(!(title = oddocgetattr(doc, "title")) || strlen(title) < 1){
      if((title = strrchr(file, PATHCHR)) != NULL){
        title++;
      }  else {
        title = file;
      }
      oddocaddattr(doc, "title", title);
    }
    if(odput(odeum, doc, wmax, TRUE) &&
       vlput(mtdb, file, -1, (char *)&mtsec, sizeof(int), VL_DOVER)){
      printfinfo("%s: registered: id=%d wnum=%d",
                 file, oddocid(doc), cblistnum(oddocnwords(doc)));
      cnt++;
    } else {
      pdperror(file);
    }
    oddocclose(doc);
  }
  wnum = odwnum(odeum);
  bnum = odbnum(odeum);
  if(wnum != -1 && bnum != -1 && (double)wnum / (double)bnum > MAXLOAD){
    printfinfo("%s: optimizing started: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
               name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
    if(!odoptimize(odeum)){
      pdperror(file);
      return FALSE;
    }
    printfinfo("%s: optimizing completed: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
               name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
  }
  if(cnt >= 256){
    printfinfo("%s: database status: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
               name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
    cnt = 0;
  }
  return TRUE;
}
566 
567 
568 /* make the url from file path */
filetouri(const char * file)569 char *filetouri(const char *file){
570   CBLIST *list;
571   char str[PATHBUFSIZ], *wp, *enc;
572   const char *name;
573   int i, nsiz;
574   sprintf(str, "%c", PATHCHR);
575   list = cbsplit(file, -1, str);
576   wp = str;
577   for(i = 0; i < cblistnum(list); i++){
578     if(i > 0) *(wp++) = '/';
579     name = cblistval(list, i, &nsiz);
580     enc = cburlencode(name, nsiz);
581     wp += sprintf(wp, "%s", enc);
582     free(enc);
583   }
584   cblistclose(list);
585   *wp = '\0';
586   return cbmemdup(str, -1);
587 }
588 
589 
590 /* make a document of plain text */
makedocplain(const char * uri,const char * text,const char * date)591 ODDOC *makedocplain(const char *uri, const char *text, const char *date){
592   ODDOC *doc;
593   CBLIST *awords;
594   const char *asis;
595   char *normal;
596   int i;
597   doc = oddocopen(uri);
598   if(date) oddocaddattr(doc, "date", date);
599   awords = odbreaktext(text);
600   for(i = 0; i < cblistnum(awords); i++){
601     asis = cblistval(awords, i, NULL);
602     normal = odnormalizeword(asis);
603     oddocaddword(doc, normal, asis);
604     free(normal);
605   }
606   cblistclose(awords);
607   return doc;
608 }
609 
610 
611 /* make a document of HTML */
makedochtml(const char * uri,const char * html,const char * date)612 ODDOC *makedochtml(const char *uri, const char *html, const char *date){
613   ODDOC *doc;
614   CBMAP *pairs;
615   CBLIST *elems, *awords;
616   const char *text, *asis;
617   char *rtext, *normal;
618   int i, j, body;
619   pairs = htmlescpairs();
620   doc = oddocopen(uri);
621   if(date) oddocaddattr(doc, "date", date);
622   elems = cbxmlbreak(html, TRUE);
623   body = FALSE;
624   for(i = 0; i < cblistnum(elems); i++){
625     text = cblistval(elems, i, NULL);
626     if(cbstrfwimatch(text, "<title")){
627       i++;
628       if(i < cblistnum(elems)){
629         text = cblistval(elems, i, NULL);
630         if(text[0] == '<') text = "";
631         rtext = cbreplace(text, pairs);
632         for(j = 0; rtext[j] != '\0'; j++){
633           if(strchr("\t\n\v\f\r", rtext[j])) rtext[j] = ' ';
634         }
635         while(--j >= 0){
636           if(rtext[j] != ' ') break;
637           rtext[j] = '\0';
638         }
639         for(j = 0; rtext[j] != '\0'; j++){
640           if(rtext[j] != ' ') break;
641         }
642         oddocaddattr(doc, "title", rtext + j);
643         awords = odbreaktext(rtext);
644         for(j = 0; j < cblistnum(awords); j++){
645           asis = cblistval(awords, j, NULL);
646           normal = odnormalizeword(asis);
647           oddocaddword(doc, normal, "");
648           free(normal);
649         }
650         cblistclose(awords);
651         free(rtext);
652       }
653     } else if(cbstrfwimatch(text, "<body")){
654       body = TRUE;
655     } else if(body && text[0] != '<'){
656       rtext = cbreplace(text, pairs);
657       awords = odbreaktext(rtext);
658       for(j = 0; j < cblistnum(awords); j++){
659         asis = cblistval(awords, j, NULL);
660         normal = odnormalizeword(asis);
661         oddocaddword(doc, normal, asis);
662         free(normal);
663       }
664       cblistclose(awords);
665       free(rtext);
666     }
667   }
668   if(!body){
669     for(i = 0; i < cblistnum(elems); i++){
670       text = cblistval(elems, i, NULL);
671       if(cbstrfwimatch(text, "<title")){
672         i++;
673       } else if(text[0] != '<'){
674         rtext = cbreplace(text, pairs);
675         awords = odbreaktext(rtext);
676         for(j = 0; j < cblistnum(awords); j++){
677           asis = cblistval(awords, j, NULL);
678           normal = odnormalizeword(asis);
679           oddocaddword(doc, normal, asis);
680           free(normal);
681         }
682         cblistclose(awords);
683         free(rtext);
684       }
685     }
686   }
687   cblistclose(elems);
688   return doc;
689 }
690 
691 
692 /* get pairs of escaping characters */
htmlescpairs(void)693 CBMAP *htmlescpairs(void){
694   char *latinext[] = {
695     " ", "!", "(cent)", "(pound)", "(currency)", "(yen)", "|", "(section)", "\"", "(C)",
696     "", "<<", "(not)", "-", "(R)", "~", "(degree)", "+-", "^2", "^3",
697     "'", "(u)", "(P)", "*", ",", "^1", "", ">>", "(1/4)", "(1/2)",
698     "(3/4)", "?", "A", "A", "A", "A", "A", "A", "AE", "C",
699     "E", "E", "E", "E", "I", "I", "I", "I", "D", "N",
700     "O", "O", "O", "O", "O", "*", "O", "U", "U", "U",
701     "U", "Y", "P", "s", "a", "a", "a", "a", "a", "a",
702     "ae", "c", "e", "e", "e", "e", "i", "i", "i", "i",
703     "o", "n", "o", "o", "o", "o", "o", "/", "o", "u",
704     "u", "u", "u", "y", "p", "y", NULL
705   };
706   static CBMAP *pairs = NULL;
707   char kbuf[8], vbuf[8];
708   int i, ksiz, vsiz;
709   if(pairs) return pairs;
710   pairs = cbmapopen();
711   cbglobalgc(pairs, (void (*)(void *))cbmapclose);
712   cbmapput(pairs, "&amp;", -1, "&", -1, TRUE);
713   cbmapput(pairs, "&lt;", -1, "<", -1, TRUE);
714   cbmapput(pairs, "&gt;", -1, ">", -1, TRUE);
715   cbmapput(pairs, "&quot;", -1, "\"", -1, TRUE);
716   cbmapput(pairs, "&apos;", -1, "'", -1, TRUE);
717   cbmapput(pairs, "&nbsp;", -1, " ", -1, TRUE);
718   cbmapput(pairs, "&copy;", -1, "(C)", -1, TRUE);
719   cbmapput(pairs, "&reg;", -1, "(R)", -1, TRUE);
720   cbmapput(pairs, "&trade;", -1, "(TM)", -1, TRUE);
721   for(i = 1; i <= 127; i++){
722     ksiz = sprintf(kbuf, "&#%d;", i);
723     vsiz = sprintf(vbuf, "%c", i);
724     cbmapput(pairs, kbuf, ksiz, vbuf, vsiz, TRUE);
725   }
726   cbmapput(pairs, "&#130;", -1, ",", -1, TRUE);
727   cbmapput(pairs, "&#132;", -1, ",,", -1, TRUE);
728   cbmapput(pairs, "&#133;", -1, "...", -1, TRUE);
729   cbmapput(pairs, "&#139;", -1, "<", -1, TRUE);
730   cbmapput(pairs, "&#145;", -1, "'", -1, TRUE);
731   cbmapput(pairs, "&#146;", -1, "'", -1, TRUE);
732   cbmapput(pairs, "&#147;", -1, "\"", -1, TRUE);
733   cbmapput(pairs, "&#148;", -1, "\"", -1, TRUE);
734   cbmapput(pairs, "&#150;", -1, "-", -1, TRUE);
735   cbmapput(pairs, "&#151;", -1, "-", -1, TRUE);
736   cbmapput(pairs, "&#152;", -1, "~", -1, TRUE);
737   cbmapput(pairs, "&#153;", -1, "(TM)", -1, TRUE);
738   cbmapput(pairs, "&#155;", -1, ">", -1, TRUE);
739   for(i = 0; latinext[i]; i++){
740     ksiz = sprintf(kbuf, "&#%d;", i + 160);
741     cbmapput(pairs, kbuf, ksiz, latinext[i], -1, TRUE);
742   }
743   return pairs;
744 }
745 
746 
747 /* register scores of documents */
procrelate(const char * name)748 int procrelate(const char *name){
749   ODEUM *odeum;
750   DEPOT *scdb;
751   ODDOC *doc;
752   CBMAP *scores;
753   const char *file;
754   char path[PATHBUFSIZ], *mbuf;
755   int err, fatal, id, msiz;
756   printfinfo("%s: relating started", name);
757   if(!(odeum = odopen(name, OD_OWRITER))){
758     pdperror(name);
759     return 1;
760   }
761   sprintf(path, "%s%c%s", name, PATHCHR, SCDBNAME);
762   if(!(scdb = dpopen(path, OD_OWRITER | OD_OCREAT, SCDBBNUM))){
763     pdperror(name);
764     odclose(odeum);
765     return 1;
766   }
767   if(!dpsetalign(scdb, SCDBALIGN)){
768     pdperror(name);
769     dpclose(scdb);
770     odclose(odeum);
771     return 1;
772   }
773   printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
774              name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
775   err = FALSE;
776   if(!oditerinit(odeum)){
777     pdperror(name);
778     err = TRUE;
779   } else {
780     while(TRUE){
781       if(sigterm){
782         printferror("aborting due to a termination signal");
783         err = TRUE;
784         break;
785       }
786       if(!(doc = oditernext(odeum))){
787         if(dpecode != DP_ENOITEM){
788           pdperror(name);
789           err = TRUE;
790         }
791         break;
792       }
793       file = oddocuri(doc);
794       id = oddocid(doc);
795       scores = oddocscores(doc, KEYNUM, odeum);
796       mbuf = cbmapdump(scores, &msiz);
797       if(!dpput(scdb, (char *)&id, sizeof(int), mbuf, msiz, DP_DOVER)){
798         pdperror(name);
799         err = TRUE;
800       } else {
801         printfinfo("%s: related", file);
802       }
803       free(mbuf);
804       cbmapclose(scores);
805       oddocclose(doc);
806       if(err) break;
807     }
808   }
809   fatal = odfatalerror(odeum);
810   printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
811              name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
812   if(!dpclose(scdb)){
813     pdperror(name);
814     err = TRUE;
815   }
816   if(!odclose(odeum)){
817     pdperror(name);
818     err = TRUE;
819   }
820   if(err){
821     printfinfo("%s: relating was over%s", name, fatal ? " with fatal error" : "");
822   } else {
823     printfinfo("%s: relating completed successfully", name);
824   }
825   return err ? 1 : 0;
826 }
827 
828 
/* purge documents whose files no longer exist */
procpurge(const char * name)830 int procpurge(const char *name){
831   ODEUM *odeum;
832   ODDOC *doc;
833   const char *file;
834   int err, fatal;
835   printfinfo("%s: purging started", name);
836   if(!(odeum = odopen(name, OD_OWRITER))){
837     pdperror(name);
838     return 1;
839   }
840   printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
841              name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
842   err = FALSE;
843   if(!oditerinit(odeum)){
844     pdperror(name);
845     err = TRUE;
846   } else {
847     while(TRUE){
848       if(sigterm){
849         printferror("aborting due to a termination signal");
850         err = TRUE;
851         break;
852       }
853       if(!(doc = oditernext(odeum))){
854         if(dpecode != DP_ENOITEM){
855           pdperror(name);
856           err = TRUE;
857         }
858         break;
859       }
860       file = oddocuri(doc);
861       if(cbfilestat(file, NULL, NULL, NULL)){
862         printfinfo("%s: passed", file);
863       } else {
864         if(!odout(odeum, file)){
865           pdperror(file);
866           err = TRUE;
867         }
868         printfinfo("%s: purged", file);
869       }
870       oddocclose(doc);
871     }
872   }
873   fatal = odfatalerror(odeum);
874   printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
875              name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
876   if(!odclose(odeum)){
877     pdperror(name);
878     err = TRUE;
879   }
880   if(err){
881     printfinfo("%s: purging was over%s", name, fatal ? " with fatal error" : "");
882   } else {
883     printfinfo("%s: purging completed successfully", name);
884   }
885   return err ? 1 : 0;
886 }
887 
888 
889 
890 /* END OF FILE */
891