1 /*************************************************************************************************
2 * Utility for indexing document files into a database of Odeum
3 * Copyright (C) 2000-2007 Mikio Hirabayashi
4 * This file is part of QDBM, Quick Database Manager.
5 * QDBM is free software; you can redistribute it and/or modify it under the terms of the GNU
6 * Lesser General Public License as published by the Free Software Foundation; either version
7 * 2.1 of the License or any later version. QDBM is distributed in the hope that it will be
8 * useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
9 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
10 * details.
11 * You should have received a copy of the GNU Lesser General Public License along with QDBM; if
12 * not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
13 * 02111-1307 USA.
14 *************************************************************************************************/
15
16
17 #include <depot.h>
18 #include <cabin.h>
19 #include <odeum.h>
20 #include <stdlib.h>
21 #include <stdio.h>
22 #include <string.h>
23 #include <stdarg.h>
24 #include <time.h>
25 #include <signal.h>
26
/* Boolean constants; any definition inherited from the included headers is discarded first. */
27 #undef TRUE
28 #define TRUE 1 /* boolean true */
29 #undef FALSE
30 #define FALSE 0 /* boolean false */
31
/* Default path conventions; they are overridden below for Win32 and RISC OS. */
32 #define PATHCHR '/' /* delimiter character of path */
33 #define EXTCHR '.' /* delimiter character of extension */
34 #define CDIRSTR "." /* string of current directory */
35 #define PDIRSTR ".." /* string of parent directory */
/* The MTDB* values are handed to vlsettuning() right after the time database is opened. */
36 #define MTDBNAME "_mtime" /* name of the database for last modified times */
37 #define MTDBLRM 81 /* records in a leaf node of time database */
38 #define MTDBNIM 192 /* records in a non-leaf node of time database */
39 #define MTDBLCN 64 /* number of leaf cache of time database */
40 #define MTDBNCN 32 /* number of non-leaf cache of time database */
/* The SCDB* values are handed to dpopen() and dpsetalign() by the relate command. */
41 #define SCDBNAME "_score" /* name of the database for scores */
42 #define SCDBBNUM 32749 /* bucket number of the score database */
43 #define SCDBALIGN -3 /* alignment of the score database */
44 #define PATHBUFSIZ 2048 /* size of a path buffer (the on-stack `path' arrays) */
45 #define MAXLOAD 0.85 /* max ratio of words to buckets before odoptimize() is called */
46 #define KEYNUM 32 /* number of keywords to store (passed to oddocscores()) */
47
48
49 /* for Win32 and RISC OS */
/* Each branch redefines the four path-convention macros for the target platform. */
50 #if defined(_WIN32)
51 #undef PATHCHR
52 #define PATHCHR '\\'
53 #undef EXTCHR
54 #define EXTCHR '.'
55 #undef CDIRSTR
56 #define CDIRSTR "."
57 #undef PDIRSTR
58 #define PDIRSTR ".."
59 #elif defined(__riscos__) || defined(__riscos)
60 #include <unixlib/local.h>
/* NOTE(review): presumably tells UnixLib not to translate file names - confirm. */
61 int __riscosify_control = __RISCOSIFY_NO_PROCESS;
/* On RISC OS the roles of '.' and '/' are swapped relative to the defaults above. */
62 #undef PATHCHR
63 #define PATHCHR '.'
64 #undef EXTCHR
65 #define EXTCHR '/'
66 #undef CDIRSTR
67 #define CDIRSTR "@"
68 #undef PDIRSTR
69 #define PDIRSTR "^"
70 #endif
71
72
/* global variables */
const char *progname;              /* program name; set from argv[0] in `main' */
/* Flag for termination signal.  It is written by `sigtermhandler' and polled by the
   processing loops, so it must be an object that may be safely accessed from a signal
   handler: `volatile sig_atomic_t' rather than a plain `int'. */
volatile sig_atomic_t sigterm;
76
77
78 /* function prototypes */
/* entry point, signal handling and usage message */
79 int main(int argc, char **argv);
80 void setsignals(void);
81 void sigtermhandler(int num);
82 void usage(void);
/* command line parsers, one for each sub command */
83 int runregister(int argc, char **argv);
84 int runrelate(int argc, char **argv);
85 int runpurge(int argc, char **argv);
/* small utilities: suffix matching, line reading and message output */
86 int bwimatchlist(const char *str, const CBLIST *keys);
87 char *fgetl(FILE *ifp);
88 void otcb(const char *fname, ODEUM *odeum, const char *msg);
89 void pdperror(const char *name);
90 void printferror(const char *format, ...);
91 void printfinfo(const char *format, ...);
92 const char *datestr(time_t t);
/* implementation of the register command */
93 int proclist(const char *name, const char *lfile, int wmax,
94 const CBLIST *tsuflist, const CBLIST *hsuflist);
95 int procdir(const char *name, const char *dir, int wmax,
96 const CBLIST *tsuflist, const CBLIST *hsuflist);
97 int indexdir(ODEUM *odeum, VILLA *mtdb, const char *name, const char *dir, int wmax,
98 const CBLIST *tsuflist, const CBLIST *hsuflist);
99 int indexfile(ODEUM *odeum, VILLA *mtdb, const char *name, const char *file, int wmax,
100 const CBLIST *tsuflist, const CBLIST *hsuflist);
101 char *filetouri(const char *file);
102 ODDOC *makedocplain(const char *uri, const char *text, const char *date);
103 ODDOC *makedochtml(const char *uri, const char *html, const char *date);
104 CBMAP *htmlescpairs(void);
/* implementation of the relate and purge commands */
105 int procrelate(const char *name);
106 int procpurge(const char *name);
107
108
109 /* main routine */
main(int argc,char ** argv)110 int main(int argc, char **argv){
111 int rv;
112 cbstdiobin();
113 progname = argv[0];
114 sigterm = FALSE;
115 setsignals();
116 if(argc < 2) usage();
117 odsetotcb(otcb);
118 rv = 0;
119 if(!strcmp(argv[1], "register")){
120 rv = runregister(argc, argv);
121 } else if(!strcmp(argv[1], "relate")){
122 rv = runrelate(argc, argv);
123 } else if(!strcmp(argv[1], "purge")){
124 rv = runpurge(argc, argv);
125 } else {
126 usage();
127 }
128 return rv;
129 }
130
131
132 /* set signal handlers */
setsignals(void)133 void setsignals(void){
134 signal(1, sigtermhandler);
135 signal(2, sigtermhandler);
136 signal(3, sigtermhandler);
137 signal(13, sigtermhandler);
138 signal(15, sigtermhandler);
139 }
140
141
142 /* handler of termination signal */
sigtermhandler(int num)143 void sigtermhandler(int num){
144 signal(num, SIG_DFL);
145 sigterm = TRUE;
146 printfinfo("the termination signal %d catched", num);
147 }
148
149
150 /* print the usage and exit */
usage(void)151 void usage(void){
152 fprintf(stderr, "%s: indexer of document files\n", progname);
153 fprintf(stderr, "\n");
154 fprintf(stderr, "usage:\n");
155 fprintf(stderr, " %s register [-l file] [-wmax num] [-tsuf sufs] [-hsuf sufs] name [dir]\n",
156 progname);
157 fprintf(stderr, " %s relate name\n", progname);
158 fprintf(stderr, " %s purge name\n", progname);
159 fprintf(stderr, "\n");
160 exit(1);
161 }
162
163
164 /* parse arguments of register command */
runregister(int argc,char ** argv)165 int runregister(int argc, char **argv){
166 char *name, *dir, *lfile, *tsuf, *hsuf, path[PATHBUFSIZ];
167 int i, wmax, plen, rv;
168 CBLIST *tsuflist, *hsuflist;
169 name = NULL;
170 dir = NULL;
171 lfile = NULL;
172 tsuf = NULL;
173 hsuf = NULL;
174 wmax = -1;
175 for(i = 2; i < argc; i++){
176 if(!name && argv[i][0] == '-'){
177 if(!strcmp(argv[i], "-l")){
178 if(++i >= argc) usage();
179 lfile = argv[i];
180 } else if(!strcmp(argv[i], "-wmax")){
181 if(++i >= argc) usage();
182 wmax = atoi(argv[i]);
183 } else if(!strcmp(argv[i], "-tsuf")){
184 if(++i >= argc) usage();
185 tsuf = argv[i];
186 } else if(!strcmp(argv[i], "-hsuf")){
187 if(++i >= argc) usage();
188 hsuf = argv[i];
189 } else {
190 usage();
191 }
192 } else if(!name){
193 name = argv[i];
194 } else if(!dir){
195 dir = argv[i];
196 } else {
197 usage();
198 }
199 }
200 if(!name) usage();
201 if(!dir) dir = CDIRSTR;
202 plen = sprintf(path, "%s", dir);
203 if(plen > 1 && path[plen-1] == PATHCHR) path[plen-1] = '\0';
204 tsuflist = cbsplit(tsuf ? tsuf : ".txt,.text", -1, ",");
205 hsuflist = cbsplit(hsuf ? hsuf : ".html,.htm", -1, ",");
206 if(lfile){
207 rv = proclist(name, lfile, wmax, tsuflist, hsuflist);
208 } else {
209 rv = procdir(name, path, wmax, tsuflist, hsuflist);
210 }
211 cblistclose(hsuflist);
212 cblistclose(tsuflist);
213 return rv;
214 }
215
216
/* parse arguments of relate command
   Exactly one non-option word, the database name, is expected after the sub command.
   Any option or surplus word ends in `usage', which terminates the process.
   The return value is the status of `procrelate'. */
int runrelate(int argc, char **argv){
  char *name = NULL;
  int idx;
  for(idx = 2; idx < argc; idx++){
    /* a second word, or an option in place of the name, is a usage error */
    if(name != NULL || argv[idx][0] == '-') usage();
    name = argv[idx];
  }
  if(name == NULL) usage();
  return procrelate(name);
}
235
236
/* parse arguments of purge command
   Exactly one non-option word, the database name, is expected after the sub command.
   Any option or surplus word ends in `usage', which terminates the process.
   The return value is the status of `procpurge'. */
int runpurge(int argc, char **argv){
  char *name = NULL;
  int idx;
  for(idx = 2; idx < argc; idx++){
    /* a second word, or an option in place of the name, is a usage error */
    if(name != NULL || argv[idx][0] == '-') usage();
    name = argv[idx];
  }
  if(name == NULL) usage();
  return procpurge(name);
}
255
256
257 /* case insensitive backward matching with a list */
bwimatchlist(const char * str,const CBLIST * keys)258 int bwimatchlist(const char *str, const CBLIST *keys){
259 int i;
260 for(i = 0; i < cblistnum(keys); i++){
261 if(cbstrbwimatch(str, cblistval(keys, i, NULL))) return TRUE;
262 }
263 return FALSE;
264 }
265
266
/* read a line
   `ifp' specifies the stream to read from.
   Characters are consumed up to and including the next line feed; a NUL byte in the input
   ends the line as well.  The terminator is not part of the result.
   The return value is a newly allocated string holding the line, or `NULL' if the end of
   the stream was reached before any character could be read.  The callers in this file
   release the region with free().
   The buffer is enlarged only when it is full, not once for every character. */
char *fgetl(FILE *ifp){
  char *buf;
  int c, len, blen;
  buf = NULL;
  len = 0;
  blen = 0;
  while((c = fgetc(ifp)) != EOF){
    if(len >= blen){
      /* the first allocation holds 256 characters, later ones double the room */
      blen = blen > 0 ? blen * 2 : 256;
      buf = cbrealloc(buf, blen + 1);
    }
    if(c == '\n' || c == '\0') break;
    buf[len++] = c;
  }
  if(!buf) return NULL;
  buf[len] = '\0';
  return buf;
}
285
286
287 /* report the outturn */
otcb(const char * fname,ODEUM * odeum,const char * msg)288 void otcb(const char *fname, ODEUM *odeum, const char *msg){
289 char *name;
290 name = odname(odeum);
291 printf("%s: %s: %s: %s\n", progname, fname, name, msg);
292 free(name);
293 }
294
295
296 /* print an error message */
pdperror(const char * name)297 void pdperror(const char *name){
298 printf("%s: ERROR: %s: %s\n", progname, name, dperrmsg(dpecode));
299 fflush(stdout);
300 }
301
302
303 /* print formatted error string and flush the buffer */
printferror(const char * format,...)304 void printferror(const char *format, ...){
305 va_list ap;
306 va_start(ap, format);
307 printf("%s: ERROR: ", progname);
308 vprintf(format, ap);
309 putchar('\n');
310 fflush(stdout);
311 va_end(ap);
312 }
313
314
315 /* print formatted information string and flush the buffer */
printfinfo(const char * format,...)316 void printfinfo(const char *format, ...){
317 va_list ap;
318 va_start(ap, format);
319 printf("%s: INFO: ", progname);
320 vprintf(format, ap);
321 putchar('\n');
322 fflush(stdout);
323 va_end(ap);
324 }
325
326
/* get static string of the date
   `t' specifies the time to format.
   The return value is the local time in the form "YYYY/MM/DD hh:mm:ss", or the string
   "0000/00/00 00:00:00" if localtime() fails.  The result lives in a static buffer which
   is overwritten by the next call. */
const char *datestr(time_t t){
  static char buf[32];
  const struct tm *tmp;
  int year, mon;
  tmp = localtime(&t);
  if(tmp == NULL) return "0000/00/00 00:00:00";
  year = tmp->tm_year + 1900;
  mon = tmp->tm_mon + 1;
  sprintf(buf, "%04d/%02d/%02d %02d:%02d:%02d",
          year, mon, tmp->tm_mday, tmp->tm_hour, tmp->tm_min, tmp->tm_sec);
  return buf;
}
337
338
339 /* processing with finding files in a list file */
proclist(const char * name,const char * lfile,int wmax,const CBLIST * tsuflist,const CBLIST * hsuflist)340 int proclist(const char *name, const char *lfile, int wmax,
341 const CBLIST *tsuflist, const CBLIST *hsuflist){
342 ODEUM *odeum;
343 VILLA *mtdb;
344 FILE *ifp;
345 char *line, path[PATHBUFSIZ];
346 int err, fatal;
347 if(!strcmp(lfile, "-")){
348 ifp = stdin;
349 } else {
350 if(!(ifp = fopen(lfile, "rb"))){
351 printferror("%s: file cannot be opened", lfile);
352 return 1;
353 }
354 }
355 printfinfo("%s: registration started", name);
356 if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){
357 pdperror(name);
358 if(ifp != stdin) fclose(ifp);
359 return 1;
360 }
361 sprintf(path, "%s%c%s", name, PATHCHR, MTDBNAME);
362 if(!(mtdb = vlopen(path, VL_OWRITER | VL_OCREAT, VL_CMPLEX))){
363 pdperror(name);
364 odclose(odeum);
365 if(ifp != stdin) fclose(ifp);
366 return 1;
367 }
368 vlsettuning(mtdb, MTDBLRM, MTDBNIM, MTDBLCN, MTDBNCN);
369 printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
370 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
371 err = FALSE;
372 while((line = fgetl(ifp)) != NULL){
373 if(sigterm){
374 printferror("aborting due to a termination signal");
375 free(line);
376 err = TRUE;
377 break;
378 }
379 if(!indexfile(odeum, mtdb, name, line, wmax, tsuflist, hsuflist)) err = TRUE;
380 free(line);
381 }
382 fatal = odfatalerror(odeum);
383 printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
384 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
385 if(!vlclose(mtdb)){
386 pdperror(name);
387 err = TRUE;
388 }
389 if(!odclose(odeum)){
390 pdperror(name);
391 err = TRUE;
392 }
393 if(ifp != stdin) fclose(ifp);
394 if(err){
395 printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : "");
396 } else {
397 printfinfo("%s: registration completed successfully", name);
398 }
399 return err ? 1 : 0;
400 }
401
402
403 /* processing with finding files in a directory */
procdir(const char * name,const char * dir,int wmax,const CBLIST * tsuflist,const CBLIST * hsuflist)404 int procdir(const char *name, const char *dir, int wmax,
405 const CBLIST *tsuflist, const CBLIST *hsuflist){
406 ODEUM *odeum;
407 VILLA *mtdb;
408 char path[PATHBUFSIZ];
409 int err, fatal;
410 printfinfo("%s: registration started", name);
411 if(!(odeum = odopen(name, OD_OWRITER | OD_OCREAT))){
412 pdperror(name);
413 return 1;
414 }
415 sprintf(path, "%s%c%s", name, PATHCHR, MTDBNAME);
416 if(!(mtdb = vlopen(path, VL_OWRITER | VL_OCREAT, VL_CMPLEX))){
417 pdperror(name);
418 odclose(odeum);
419 return 1;
420 }
421 vlsettuning(mtdb, MTDBLRM, MTDBNIM, MTDBLCN, MTDBNCN);
422 printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
423 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
424 err = FALSE;
425 if(!indexdir(odeum, mtdb, name, dir, wmax, tsuflist, hsuflist)) err = TRUE;
426 fatal = odfatalerror(odeum);
427 printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
428 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
429 if(!vlclose(mtdb)){
430 pdperror(name);
431 err = TRUE;
432 }
433 if(!odclose(odeum)){
434 pdperror(name);
435 err = TRUE;
436 }
437 if(err){
438 printfinfo("%s: registration was over%s", name, fatal ? " with fatal error" : "");
439 } else {
440 printfinfo("%s: registration completed successfully", name);
441 }
442 return err ? 1 : 0;
443 }
444
445
446 /* find and index files in a directory */
indexdir(ODEUM * odeum,VILLA * mtdb,const char * name,const char * dir,int wmax,const CBLIST * tsuflist,const CBLIST * hsuflist)447 int indexdir(ODEUM *odeum, VILLA *mtdb, const char *name, const char *dir, int wmax,
448 const CBLIST *tsuflist, const CBLIST *hsuflist){
449 CBLIST *files;
450 const char *file;
451 char path[PATHBUFSIZ];
452 int i, isroot, isdir, err;
453 if(!(files = cbdirlist(dir))){
454 printferror("%s: directory cannot be opened", dir);
455 return FALSE;
456 }
457 isroot = dir[0] == PATHCHR && dir[1] == '\0';
458 err = FALSE;
459 for(i = 0; i < cblistnum(files); i++){
460 if(sigterm){
461 printferror("aborting due to a termination signal");
462 cblistclose(files);
463 return FALSE;
464 }
465 file = cblistval(files, i, NULL);
466 if(!strcmp(file, CDIRSTR) || !strcmp(file, PDIRSTR)) continue;
467 if(isroot){
468 sprintf(path, "%s%s", dir, file);
469 } else {
470 sprintf(path, "%s%c%s", dir, PATHCHR, file);
471 }
472 if(!cbfilestat(path, &isdir, NULL, NULL)){
473 printferror("%s: file does not exist", file);
474 err = TRUE;
475 continue;
476 }
477 if(isdir){
478 if(!indexdir(odeum, mtdb, name, path, wmax, tsuflist, hsuflist)) err = TRUE;
479 } else {
480 if(!indexfile(odeum, mtdb, name, path, wmax, tsuflist, hsuflist)) err = TRUE;
481 }
482 }
483 cblistclose(files);
484 return err ? FALSE : TRUE;
485 }
486
487
488 /* index a file into the database */
indexfile(ODEUM * odeum,VILLA * mtdb,const char * name,const char * file,int wmax,const CBLIST * tsuflist,const CBLIST * hsuflist)489 int indexfile(ODEUM *odeum, VILLA *mtdb, const char *name, const char *file, int wmax,
490 const CBLIST *tsuflist, const CBLIST *hsuflist){
491 static int cnt = 0;
492 char *vbuf, *buf, *uri;
493 const char *title;
494 int size, hot, vsiz, wnum, bnum;
495 time_t mtime;
496 ODDOC *doc;
497 if(!cbfilestat(file, NULL, &size, &mtime)){
498 printferror("%s: file does not exist", file);
499 return FALSE;
500 }
501 hot = TRUE;
502 if((vbuf = vlget(mtdb, file, -1, &vsiz)) != NULL){
503 if(vsiz == sizeof(int) && mtime <= *(int *)vbuf) hot = FALSE;
504 free(vbuf);
505 }
506 if(!hot){
507 printfinfo("%s: passed", file);
508 return TRUE;
509 }
510 doc = NULL;
511 uri = filetouri(file);
512 if(bwimatchlist(file, tsuflist)){
513 if(!(buf = cbreadfile(file, NULL))){
514 printferror("%s: file cannot be opened", file);
515 return FALSE;
516 }
517 doc = makedocplain(uri, buf, datestr(mtime));
518 free(buf);
519 } else if(bwimatchlist(file, hsuflist)){
520 if(!(buf = cbreadfile(file, NULL))){
521 printferror("%s: file cannot be opened", file);
522 return FALSE;
523 }
524 doc = makedochtml(uri, buf, datestr(mtime));
525 free(buf);
526 }
527 free(uri);
528 if(doc){
529 if(!(title = oddocgetattr(doc, "title")) || strlen(title) < 1){
530 if((title = strrchr(file, PATHCHR)) != NULL){
531 title++;
532 } else {
533 title = file;
534 }
535 oddocaddattr(doc, "title", title);
536 }
537 if(odput(odeum, doc, wmax, TRUE) &&
538 vlput(mtdb, file, -1, (char *)&mtime, sizeof(int), VL_DOVER)){
539 printfinfo("%s: registered: id=%d wnum=%d",
540 file, oddocid(doc), cblistnum(oddocnwords(doc)));
541 cnt++;
542 } else {
543 pdperror(file);
544 }
545 oddocclose(doc);
546 }
547 wnum = odwnum(odeum);
548 bnum = odbnum(odeum);
549 if(wnum != -1 && bnum != -1 && (double)wnum / (double)bnum > MAXLOAD){
550 printfinfo("%s: optimizing started: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
551 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
552 if(!odoptimize(odeum)){
553 pdperror(file);
554 return FALSE;
555 }
556 printfinfo("%s: optimizing completed: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
557 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
558 }
559 if(cnt >= 256){
560 printfinfo("%s: database status: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
561 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
562 cnt = 0;
563 }
564 return TRUE;
565 }
566
567
568 /* make the url from file path */
filetouri(const char * file)569 char *filetouri(const char *file){
570 CBLIST *list;
571 char str[PATHBUFSIZ], *wp, *enc;
572 const char *name;
573 int i, nsiz;
574 sprintf(str, "%c", PATHCHR);
575 list = cbsplit(file, -1, str);
576 wp = str;
577 for(i = 0; i < cblistnum(list); i++){
578 if(i > 0) *(wp++) = '/';
579 name = cblistval(list, i, &nsiz);
580 enc = cburlencode(name, nsiz);
581 wp += sprintf(wp, "%s", enc);
582 free(enc);
583 }
584 cblistclose(list);
585 *wp = '\0';
586 return cbmemdup(str, -1);
587 }
588
589
590 /* make a document of plain text */
makedocplain(const char * uri,const char * text,const char * date)591 ODDOC *makedocplain(const char *uri, const char *text, const char *date){
592 ODDOC *doc;
593 CBLIST *awords;
594 const char *asis;
595 char *normal;
596 int i;
597 doc = oddocopen(uri);
598 if(date) oddocaddattr(doc, "date", date);
599 awords = odbreaktext(text);
600 for(i = 0; i < cblistnum(awords); i++){
601 asis = cblistval(awords, i, NULL);
602 normal = odnormalizeword(asis);
603 oddocaddword(doc, normal, asis);
604 free(normal);
605 }
606 cblistclose(awords);
607 return doc;
608 }
609
610
611 /* make a document of HTML */
makedochtml(const char * uri,const char * html,const char * date)612 ODDOC *makedochtml(const char *uri, const char *html, const char *date){
613 ODDOC *doc;
614 CBMAP *pairs;
615 CBLIST *elems, *awords;
616 const char *text, *asis;
617 char *rtext, *normal;
618 int i, j, body;
619 pairs = htmlescpairs();
620 doc = oddocopen(uri);
621 if(date) oddocaddattr(doc, "date", date);
622 elems = cbxmlbreak(html, TRUE);
623 body = FALSE;
624 for(i = 0; i < cblistnum(elems); i++){
625 text = cblistval(elems, i, NULL);
626 if(cbstrfwimatch(text, "<title")){
627 i++;
628 if(i < cblistnum(elems)){
629 text = cblistval(elems, i, NULL);
630 if(text[0] == '<') text = "";
631 rtext = cbreplace(text, pairs);
632 for(j = 0; rtext[j] != '\0'; j++){
633 if(strchr("\t\n\v\f\r", rtext[j])) rtext[j] = ' ';
634 }
635 while(--j >= 0){
636 if(rtext[j] != ' ') break;
637 rtext[j] = '\0';
638 }
639 for(j = 0; rtext[j] != '\0'; j++){
640 if(rtext[j] != ' ') break;
641 }
642 oddocaddattr(doc, "title", rtext + j);
643 awords = odbreaktext(rtext);
644 for(j = 0; j < cblistnum(awords); j++){
645 asis = cblistval(awords, j, NULL);
646 normal = odnormalizeword(asis);
647 oddocaddword(doc, normal, "");
648 free(normal);
649 }
650 cblistclose(awords);
651 free(rtext);
652 }
653 } else if(cbstrfwimatch(text, "<body")){
654 body = TRUE;
655 } else if(body && text[0] != '<'){
656 rtext = cbreplace(text, pairs);
657 awords = odbreaktext(rtext);
658 for(j = 0; j < cblistnum(awords); j++){
659 asis = cblistval(awords, j, NULL);
660 normal = odnormalizeword(asis);
661 oddocaddword(doc, normal, asis);
662 free(normal);
663 }
664 cblistclose(awords);
665 free(rtext);
666 }
667 }
668 if(!body){
669 for(i = 0; i < cblistnum(elems); i++){
670 text = cblistval(elems, i, NULL);
671 if(cbstrfwimatch(text, "<title")){
672 i++;
673 } else if(text[0] != '<'){
674 rtext = cbreplace(text, pairs);
675 awords = odbreaktext(rtext);
676 for(j = 0; j < cblistnum(awords); j++){
677 asis = cblistval(awords, j, NULL);
678 normal = odnormalizeword(asis);
679 oddocaddword(doc, normal, asis);
680 free(normal);
681 }
682 cblistclose(awords);
683 free(rtext);
684 }
685 }
686 }
687 cblistclose(elems);
688 return doc;
689 }
690
691
692 /* get pairs of escaping characters */
htmlescpairs(void)693 CBMAP *htmlescpairs(void){
694 char *latinext[] = {
695 " ", "!", "(cent)", "(pound)", "(currency)", "(yen)", "|", "(section)", "\"", "(C)",
696 "", "<<", "(not)", "-", "(R)", "~", "(degree)", "+-", "^2", "^3",
697 "'", "(u)", "(P)", "*", ",", "^1", "", ">>", "(1/4)", "(1/2)",
698 "(3/4)", "?", "A", "A", "A", "A", "A", "A", "AE", "C",
699 "E", "E", "E", "E", "I", "I", "I", "I", "D", "N",
700 "O", "O", "O", "O", "O", "*", "O", "U", "U", "U",
701 "U", "Y", "P", "s", "a", "a", "a", "a", "a", "a",
702 "ae", "c", "e", "e", "e", "e", "i", "i", "i", "i",
703 "o", "n", "o", "o", "o", "o", "o", "/", "o", "u",
704 "u", "u", "u", "y", "p", "y", NULL
705 };
706 static CBMAP *pairs = NULL;
707 char kbuf[8], vbuf[8];
708 int i, ksiz, vsiz;
709 if(pairs) return pairs;
710 pairs = cbmapopen();
711 cbglobalgc(pairs, (void (*)(void *))cbmapclose);
712 cbmapput(pairs, "&", -1, "&", -1, TRUE);
713 cbmapput(pairs, "<", -1, "<", -1, TRUE);
714 cbmapput(pairs, ">", -1, ">", -1, TRUE);
715 cbmapput(pairs, """, -1, "\"", -1, TRUE);
716 cbmapput(pairs, "'", -1, "'", -1, TRUE);
717 cbmapput(pairs, " ", -1, " ", -1, TRUE);
718 cbmapput(pairs, "©", -1, "(C)", -1, TRUE);
719 cbmapput(pairs, "®", -1, "(R)", -1, TRUE);
720 cbmapput(pairs, "™", -1, "(TM)", -1, TRUE);
721 for(i = 1; i <= 127; i++){
722 ksiz = sprintf(kbuf, "&#%d;", i);
723 vsiz = sprintf(vbuf, "%c", i);
724 cbmapput(pairs, kbuf, ksiz, vbuf, vsiz, TRUE);
725 }
726 cbmapput(pairs, "‚", -1, ",", -1, TRUE);
727 cbmapput(pairs, "„", -1, ",,", -1, TRUE);
728 cbmapput(pairs, "…", -1, "...", -1, TRUE);
729 cbmapput(pairs, "‹", -1, "<", -1, TRUE);
730 cbmapput(pairs, "‘", -1, "'", -1, TRUE);
731 cbmapput(pairs, "’", -1, "'", -1, TRUE);
732 cbmapput(pairs, "“", -1, "\"", -1, TRUE);
733 cbmapput(pairs, "”", -1, "\"", -1, TRUE);
734 cbmapput(pairs, "–", -1, "-", -1, TRUE);
735 cbmapput(pairs, "—", -1, "-", -1, TRUE);
736 cbmapput(pairs, "˜", -1, "~", -1, TRUE);
737 cbmapput(pairs, "™", -1, "(TM)", -1, TRUE);
738 cbmapput(pairs, "›", -1, ">", -1, TRUE);
739 for(i = 0; latinext[i]; i++){
740 ksiz = sprintf(kbuf, "&#%d;", i + 160);
741 cbmapput(pairs, kbuf, ksiz, latinext[i], -1, TRUE);
742 }
743 return pairs;
744 }
745
746
747 /* register scores of documents */
procrelate(const char * name)748 int procrelate(const char *name){
749 ODEUM *odeum;
750 DEPOT *scdb;
751 ODDOC *doc;
752 CBMAP *scores;
753 const char *file;
754 char path[PATHBUFSIZ], *mbuf;
755 int err, fatal, id, msiz;
756 printfinfo("%s: relating started", name);
757 if(!(odeum = odopen(name, OD_OWRITER))){
758 pdperror(name);
759 return 1;
760 }
761 sprintf(path, "%s%c%s", name, PATHCHR, SCDBNAME);
762 if(!(scdb = dpopen(path, OD_OWRITER | OD_OCREAT, SCDBBNUM))){
763 pdperror(name);
764 odclose(odeum);
765 return 1;
766 }
767 if(!dpsetalign(scdb, SCDBALIGN)){
768 pdperror(name);
769 dpclose(scdb);
770 odclose(odeum);
771 return 1;
772 }
773 printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
774 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
775 err = FALSE;
776 if(!oditerinit(odeum)){
777 pdperror(name);
778 err = TRUE;
779 } else {
780 while(TRUE){
781 if(sigterm){
782 printferror("aborting due to a termination signal");
783 err = TRUE;
784 break;
785 }
786 if(!(doc = oditernext(odeum))){
787 if(dpecode != DP_ENOITEM){
788 pdperror(name);
789 err = TRUE;
790 }
791 break;
792 }
793 file = oddocuri(doc);
794 id = oddocid(doc);
795 scores = oddocscores(doc, KEYNUM, odeum);
796 mbuf = cbmapdump(scores, &msiz);
797 if(!dpput(scdb, (char *)&id, sizeof(int), mbuf, msiz, DP_DOVER)){
798 pdperror(name);
799 err = TRUE;
800 } else {
801 printfinfo("%s: related", file);
802 }
803 free(mbuf);
804 cbmapclose(scores);
805 oddocclose(doc);
806 if(err) break;
807 }
808 }
809 fatal = odfatalerror(odeum);
810 printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
811 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
812 if(!dpclose(scdb)){
813 pdperror(name);
814 err = TRUE;
815 }
816 if(!odclose(odeum)){
817 pdperror(name);
818 err = TRUE;
819 }
820 if(err){
821 printfinfo("%s: relating was over%s", name, fatal ? " with fatal error" : "");
822 } else {
823 printfinfo("%s: relating completed successfully", name);
824 }
825 return err ? 1 : 0;
826 }
827
828
829 /* purge documents which is not existing. */
procpurge(const char * name)830 int procpurge(const char *name){
831 ODEUM *odeum;
832 ODDOC *doc;
833 const char *file;
834 int err, fatal;
835 printfinfo("%s: purging started", name);
836 if(!(odeum = odopen(name, OD_OWRITER))){
837 pdperror(name);
838 return 1;
839 }
840 printfinfo("%s: database opened: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
841 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
842 err = FALSE;
843 if(!oditerinit(odeum)){
844 pdperror(name);
845 err = TRUE;
846 } else {
847 while(TRUE){
848 if(sigterm){
849 printferror("aborting due to a termination signal");
850 err = TRUE;
851 break;
852 }
853 if(!(doc = oditernext(odeum))){
854 if(dpecode != DP_ENOITEM){
855 pdperror(name);
856 err = TRUE;
857 }
858 break;
859 }
860 file = oddocuri(doc);
861 if(cbfilestat(file, NULL, NULL, NULL)){
862 printfinfo("%s: passed", file);
863 } else {
864 if(!odout(odeum, file)){
865 pdperror(file);
866 err = TRUE;
867 }
868 printfinfo("%s: purged", file);
869 }
870 oddocclose(doc);
871 }
872 }
873 fatal = odfatalerror(odeum);
874 printfinfo("%s: database closing: fsiz=%.0f dnum=%d wnum=%d bnum=%d",
875 name, odfsiz(odeum), oddnum(odeum), odwnum(odeum), odbnum(odeum));
876 if(!odclose(odeum)){
877 pdperror(name);
878 err = TRUE;
879 }
880 if(err){
881 printfinfo("%s: purging was over%s", name, fatal ? " with fatal error" : "");
882 } else {
883 printfinfo("%s: purging completed successfully", name);
884 }
885 return err ? 1 : 0;
886 }
887
888
889
890 /* END OF FILE */
891