1 /*
2  *
3  * hlist.c -
4  *
5  * Copyright (C) 1997-1999 Satoru Takabayashi  All rights reserved.
6  * This is free software with ABSOLUTELY NO WARRANTY.
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; if not, write to the Free Software
20  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21  * 02111-1307, USA
22  *
23  * This file must be encoded in EUC-JP encoding
24  *
25  */
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <math.h>
31 #include "namazu.h"
32 #include "util.h"
33 
34 /* merge the left and  right with AND rule */
andmerge(HLIST left,HLIST right,int * ignore)35 HLIST andmerge(HLIST left, HLIST right, int *ignore)
36 {
37     int i, j, v;
38 
39     if (*ignore && left.n > 0)
40 	return left;
41     if (*ignore && right.n > 0)
42 	return right;
43 
44     if (left.n <= 0)
45 	return left;
46     if (right.n <= 0)
47 	return right;
48 
49     for (v = 0, i = 0, j = 0; i < left.n; i++) {
50 	for (;; j++) {
51 	    if (j >= right.n)
52 		goto OUT;
53 	    if (left.fid[i] < right.fid[j])
54 		break;
55 	    if (left.fid[i] == right.fid[j]) {
56 
57 		copy_hlist(left, v, left, i);
58                 if (TfIdf) {
59                     left.scr[v] = left.scr[i] + right.scr[j];
60                 } else {
61                     /* assign a smaller number, left or right*/
62                     left.scr[v] = left.scr[i] < right.scr[j] ?
63                         left.scr[i] : right.scr[j];
64                 }
65 		v++;
66 		j++;
67 		break;
68 	    }
69 	}
70     }
71   OUT:
72     free_hlist(right);
73     left.n = v;
74     if (left.n <= 0)
75 	free_hlist(left);
76     return left;
77 }
78 
79 
80 /* merge the left and  right with NOT rule */
notmerge(HLIST left,HLIST right,int * ignore)81 HLIST notmerge(HLIST left, HLIST right, int *ignore)
82 {
83     int i, j, v, f;
84 
85     if (*ignore && left.n > 0)
86 	return left;
87     if (*ignore && right.n > 0)
88 	return right;
89 
90     if (right.n <= 0)
91 	return left;
92     if (left.n <= 0)
93 	return left;
94 
95     for (v = 0, i = 0, j = 0; i < left.n; i++) {
96 	for (f = 0; j < right.n; j++) {
97 	    if (left.fid[i] < right.fid[j])
98 		break;
99 	    if (left.fid[i] == right.fid[j]) {
100 		j++;
101 		f = 1;
102 		break;
103 	    }
104 	}
105 	if (!f) {
106 	    copy_hlist(left, v, left, i);
107 	    v++;
108 	}
109     }
110     free_hlist(right);
111     left.n = v;
112     if (left.n <= 0)
113 	free_hlist(left);
114     return left;
115 }
116 
117 
118 /*
119  * merge the left and  right with OR rule
120  */
ormerge(HLIST left,HLIST right)121 HLIST ormerge(HLIST left, HLIST right)
122 {
123     int i, j, v, n;
124     HLIST val;
125 
126     if (left.n <= 0 && right.n <= 0)
127 	return left;
128     if (left.n <= 0)
129 	return right;
130     if (right.n <= 0)
131 	return left;
132 
133     n = left.n + right.n;
134 
135     malloc_hlist(&val, n);
136 
137     for (v = 0, i = 0, j = 0; i < left.n; i++) {
138 	for (; left.fid[i] >= right.fid[j] && j < right.n; j++) {
139 	    if (left.fid[i] == right.fid[j]) {
140 
141                 if (TfIdf) {
142                     left.scr[i] = left.scr[i] + right.scr[j];
143                 } else {
144                     /* assign a large number, left or right */
145                     left.scr[i] = left.scr[i] > right.scr[j] ?
146                         left.scr[i] : right.scr[j];
147                 }
148 		j++;
149 		break;
150 	    } else {
151 		copy_hlist(val, v, right, j);
152 		v++;
153 	    }
154 	}
155 	copy_hlist(val, v, left, i);
156 	v++;
157     }
158 
159     for (; j < right.n; j++) {
160 	copy_hlist(val, v, right, j);
161 	v++;
162     }
163 
164     free_hlist(left);
165     free_hlist(right);
166     val.n = v;
167     return val;
168 }
169 
malloc_hlist(HLIST * hlist,int n)170 void malloc_hlist(HLIST * hlist, int n)
171 {
172     if (n <= 0) return;
173     hlist->fid = (int *)malloc(n * sizeof(int));
174     if (hlist->fid == NULL) {
175 	 error("malloc_hlist");
176     }
177     hlist->scr = (int *)malloc(n * sizeof(int));
178     if (hlist->scr == NULL) {
179 	 error("malloc_hlist");
180     }
181     hlist->did = (int *)malloc(n * sizeof(int));
182     if (hlist->did == NULL) {
183 	 error("malloc_hlist");
184     }
185     hlist->date = (int *)malloc(n * sizeof(int));
186     if (hlist->date == NULL) {
187 	 error("malloc_hlist");
188     }
189 }
190 
realloc_hlist(HLIST * hlist,int n)191 void realloc_hlist(HLIST * hlist, int n)
192 {
193     if (n <= 0) return;
194     hlist->fid = (int *) realloc(hlist->fid, n * sizeof(int));
195     if (hlist->fid == NULL) {
196 	 error("realloc_hlist");
197     }
198     hlist->scr = (int *) realloc(hlist->scr, n * sizeof(int));
199     if (hlist->scr == NULL) {
200 	 error("realloc_hlist");
201     }
202     hlist->did = (int *) realloc(hlist->did, n * sizeof(int));
203     if (hlist->did == NULL) {
204 	 error("realloc_hlist");
205     }
206     hlist->date = (int *) realloc(hlist->date, n * sizeof(int));
207     if (hlist->date == NULL) {
208 	 error("realloc_hlist");
209     }
210 }
211 
free_hlist(HLIST hlist)212 void free_hlist(HLIST hlist)
213 {
214     if (hlist.n <= 0) return;
215     free(hlist.fid);
216     free(hlist.scr);
217     free(hlist.did);
218     free(hlist.date);
219 }
220 
copy_hlist(HLIST to,int n_to,HLIST from,int n_from)221 void copy_hlist(HLIST to, int n_to, HLIST from, int n_from)
222 {
223     to.fid[n_to] = from.fid[n_from];
224     to.scr[n_to] = from.scr[n_from];
225     to.did[n_to] = from.did[n_from];
226     to.date[n_to] = from.date[n_from];
227 }
228 
memcpy_hlist(HLIST to,HLIST from,int n)229 void memcpy_hlist(HLIST to, HLIST from, int n)
230 {
231     memcpy(to.fid + n, from.fid, from.n * sizeof (int));
232     memcpy(to.scr + n, from.scr, from.n * sizeof (int));
233     memcpy(to.did + n, from.did, from.n * sizeof (int));
234     memcpy(to.date + n, from.date, from.n * sizeof (int));
235 }
236 
set_did_hlist(HLIST hlist,int id)237 void set_did_hlist(HLIST hlist, int id)
238 {
239     int i;
240     for (i = 0; i < hlist.n; i++) {
241         hlist.did[i] = id;
242     }
243 }
244 
merge_hlist(HLIST * hlists)245 HLIST merge_hlist(HLIST *hlists)
246 {
247     int i, n;
248     HLIST value;
249 
250     if (DbNumber == 1) return hlists[0];
251     for(i = n = 0; i < DbNumber; i++) {
252         if (hlists[i].n > 0) {
253             n += hlists[i].n;
254         }
255     }
256     malloc_hlist(&value, n);
257     for(i = n = 0; i < DbNumber; i++) {
258         if (hlists[i].n <= 0)
259             continue;
260         memcpy_hlist(value, hlists[i], n);
261         n += hlists[i].n;
262         free_hlist(hlists[i]);
263     }
264     value.n = n;
265     return value;
266 }
267 
set_date_zero_all(HLIST hlist)268 void set_date_zero_all(HLIST hlist)
269 {
270     int i;
271 
272     for (i = 0; i < hlist.n; i++) {
273         hlist.date[i] = 0;
274     }
275 }
276 
277 /* get date info from NMZ.t and do the missing number processing */
do_date_processing(HLIST hlist)278 HLIST do_date_processing(HLIST hlist)
279 {
280     FILE *date_index;
281     int i;
282 
283     date_index = fopen(DATEINDEX, "rb");
284     if (date_index == NULL) {
285         if (Debug) {
286             fprintf(stderr, "%s: cannot open file.\n", DATEINDEX);
287         }
288         set_date_zero_all(hlist);
289         return hlist; /* error */
290     }
291 
292     for (i = 0; i < hlist.n ; i++) {
293         if (-1 == fseek(date_index, hlist.fid[i] * sizeof(hlist.date[i]), 0)) {
294             set_date_zero_all(hlist);
295             return hlist; /* error */
296         }
297         freadx(&hlist.date[i], sizeof(hlist.date[i]), 1, date_index);
298 
299         if (hlist.date[i] == -1) {
300             /* the missing number, this document has been deleted */
301             int j;
302 
303             for (j = i + 1; j < hlist.n; j++) { /* shift */
304                 copy_hlist(hlist, j - 1, hlist, j);
305             }
306             hlist.n--;
307             i--;
308         }
309     }
310 
311     fclose(date_index);
312     return hlist;
313 }
314 
315 /* get the hit list */
get_hlist(int index)316 HLIST get_hlist(int index)
317 {
318     int n, *buf, i;
319     HLIST hlist;
320     double idf = 0;
321     uchar tmp[BUFSIZE];
322     hlist.n = 0;
323 
324     if (-1 == fseek(Index, get_index_pointer(IndexIndex, index), 0))
325 	return hlist; /* error */
326 
327     fgets(tmp, BUFSIZE, Index); /* read and dispose */
328 
329     freadx(&n, sizeof(int), 1, Index);
330 
331     if (TfIdf) {
332         idf = log((double)AllDocumentN / (n/2)) / log(2);
333         if (Debug)
334             fprintf(stderr, "idf: %f (N:%d, n:%d)\n", idf, AllDocumentN, n/2);
335     }
336 
337     if (n >= IGNORE_HIT * 2) {
338         /* '* 2' means NMZ.i contains a file-ID and a score. */
339         hlist.n = TOO_MUCH_HIT;
340     } else {
341         buf = (int *) malloc(n * sizeof(int));
342 	if (buf == NULL) {
343 	     error("hlist");
344         }
345 	malloc_hlist(&hlist, n / 2);
346 	freadx(buf, sizeof(int), n, Index);
347 	for (i = 0; i < n; i += 2) {
348 	    hlist.fid[i / 2] = *(buf + i);
349 	    hlist.scr[i / 2] = *(buf + i + 1);
350             if (TfIdf) {
351                 hlist.scr[i / 2] = (int)(hlist.scr[i / 2] * idf) + 1;
352             }
353 	}
354 	free(buf);
355         hlist.n = n / 2;
356         hlist = do_date_processing(hlist);
357     }
358     return hlist;
359 }
360 
361 
362 /* use merge sort a stable sort algorithm
363  * original of this code was contributed by Furukawa-san [11/13/1997]
364  */
365 
366 #define SORT_BY_SCORE 1
367 #define SORT_BY_DATE 2
368 
nmz_mergesort(int first,int last,HLIST hlist,HLIST work,int mode)369 void nmz_mergesort(int first, int last, HLIST hlist, HLIST work, int mode)
370 {
371     int middle;
372     static int i, j, k, p;
373 
374     if (first < last) {
375 	middle = (first + last) / 2;
376 	nmz_mergesort(first, middle, hlist, work, mode);
377 	nmz_mergesort(middle + 1, last, hlist, work, mode);
378 	p = 0;
379 	for (i = first; i <= middle; i++) {
380 	    copy_hlist(work, p, hlist, i);
381 	    p++;
382 	}
383 	i = middle + 1;
384 	j = 0;
385 	k = first;
386 	while (i <= last && j < p) {
387             int bool = 0;
388 
389             if (mode == SORT_BY_SCORE) {
390                 if (work.scr[j] >= hlist.scr[i]) {
391                     bool = 1;
392                 }
393             } else if (mode == SORT_BY_DATE){
394                 if (work.date[j] >= hlist.date[i]) {
395                     bool = 1;
396                 }
397             }
398 	    if (bool) {
399 		copy_hlist(hlist, k, work, j);
400 		k++;
401 		j++;
402 	    } else {
403 		copy_hlist(hlist, k, hlist, i);
404 		k++;
405 		i++;
406 	    }
407 	}
408 	while (j < p) {
409 	    copy_hlist(hlist, k, work, j);
410 	    k++;
411 	    j++;
412 	}
413     }
414 }
415 
416 
417 /* interface to invoke merge sort function */
sort_hlist(HLIST hlist,char * mode)418 void sort_hlist(HLIST hlist, char *mode)
419 {
420     HLIST work;
421     malloc_hlist(&work, hlist.n);
422 
423     if (! strcmp(mode, "score")) {
424         nmz_mergesort(0, hlist.n - 1, hlist, work, SORT_BY_SCORE);
425     } else if (! strcmp(mode, "date")) {
426         nmz_mergesort(0, hlist.n - 1, hlist, work, SORT_BY_DATE);
427     }
428     free_hlist(work);
429 }
430 
erase_url(uchar * s)431 void erase_url(uchar *s)
432 {
433     int i, j;
434     uchar tmp[BUFSIZE];
435 
436     strcpy(tmp, s);
437 
438     if (!strncmp("<DD><A HREF=\"", &tmp[0], 13)) {
439        for (i = 13; tmp[i] != '>'; i++)
440        ;
441        strcpy(tmp, "<DD>");
442        for (i++, j = 4; tmp[i]; i++, j++) {
443            if (!strncmp("</A>", &tmp[i], 4))
444                i += 4;
445            s[j] = tmp[i];
446        }
447        s[j] = '\0';
448     }
449 }
450 
451 /* replace a URL */
replace_url(uchar * s,int opt)452 void replace_url(uchar * s, int opt)
453 {
454     int n;
455     int n_from, n_to, i, j;
456     uchar tmp[BUFSIZE];
457 
458     strcpy(tmp, s);
459 
460   for(n=0;n<url_no;n++) {
461     n_from = strlen(URL_REPLACE_FROM[n]);
462     n_to = strlen(URL_REPLACE_TO[n]);
463 
464     if (!strncmp(URL_REPLACE_FROM[n], tmp, n_from)) {
465 	strcpy(s, URL_REPLACE_TO[n]);
466 	for (i = n_from, j = n_to; tmp[i] != '>'; i++, j++)
467 	    s[j] = tmp[i];
468 	s[j++] = tmp[i++];
469 	if (opt && !strncmp(URL_REPLACE_FROM[n], tmp + i, n_from)) {
470 	    strcpy(s + j, URL_REPLACE_TO[n]);
471 	    i += n_from;
472 	    j += n_to;
473 	}
474 	for (; tmp[i]; i++, j++)
475 	    s[j] = tmp[i];
476 	s[j] = '\0';
477     }
478   }
479 }
480 
481 
make_fullpathname_flist(int n)482 void make_fullpathname_flist(int n)
483 {
484     uchar *base;
485 
486     base = DbNames[n];
487 
488     pathcat(base, FLIST);
489     pathcat(base, FLISTINDEX);
490 }
491 
open_files2(int n)492 void open_files2(int n)
493 {
494     make_fullpathname_flist(n);
495     Flist = fopen(FLIST, "rb");
496     if (Flist == NULL) {
497         error(FLIST);
498     }
499     FlistIndex = fopen(FLISTINDEX, "rb");
500     if (FlistIndex == NULL) {
501         error(FLISTINDEX);
502     }
503 }
504 
filesclose2()505 void filesclose2()
506 {
507     fclose(Flist);
508     fclose(FlistIndex);
509 }
510 
erase_size(uchar * s)511 void erase_size(uchar *s)
512 {
513     int i;
514     for (i = strlen(s) -1; i > 0; i--) {
515         if (!strncmp(s + i, "size", 4)) {
516             break;
517         }
518     }
519     *(s + i - 1) = '\0';
520 }
521 
522 /* display the hlist */
put_hlist(HLIST hlist)523 void put_hlist(HLIST hlist)
524 {
525     int i, j, before_did = -1;
526     uchar buf[BUFSIZE];
527 
528     if (HitCountOnly) {
529         printf("%d\n", hlist.n);
530         return;
531     }
532 
533     if (hlist.n <= 0 || HListMax == 0)
534 	return;
535     for (i = HListWhence; i < hlist.n; i++) {
536 	if (!AllList && (i >= HListWhence + HListMax))
537 	    break;
538         if (hlist.did[i] != before_did) {
539             if (before_did != -1)
540                 filesclose2();
541             open_files2(hlist.did[i]);
542             before_did = hlist.did[i];
543         }
544 
545 	fseek(Flist,get_index_pointer(FlistIndex, hlist.fid[i]), 0);
546 
547         if (Debug) {
548             printf("[[ %d ]]\n", hlist.fid[i]);
549         }
550 	for (j = 0;; j++) {
551 	    if (!fgets(buf, BUFSIZE, Flist) || *buf == '\n') {
552 		if (HtmlOutput)
553 		    fputc('\n', stdout);
554 		break;
555 	    }
556 	    if (j == ABSTRACT_LINE && (ShortFormat || MoreShortFormat))
557 		continue;
558 	    if (j == SCORE_LINE && MoreShortFormat)
559 		continue;
560 
561 	    *(lastc(buf)) = '\0';
562 
563             if (MoreShortFormat) {
564                 /* erase a message of size (XXX bytes) (stupid processing)*/
565                 erase_size(buf);
566             }
567             if (!NoReplace && URL_REPLACE_FROM[0]) { /* replace a URL */
568                 if (!strncmp("<DD><A HREF=\"", buf, 13)) {
569                     replace_url(&buf[13], 1);
570                 } else if (!strncmp("<STRONG><A HREF=\"", buf, 17)) {
571                     replace_url(&buf[17], 0);
572                 }
573             }
574             if (!HtmlOutput && DecodeURL &&
575                 !strncmp("<DD><A HREF=\"", buf, 13)) {
576                     decode_url_string(&buf[13]);
577             }
578             if (ModeTknamazu) {
579                 erase_url(buf);
580             }
581 
582 	    if (HtmlOutput) {
583                 /* as NMZ.f is encoded with output inteded encoding,
584                  fputs as is without codeconv and etc processing */
585 		fputs(buf, stdout);
586             }
587 	    else {
588 		jistoeuc(buf);
589 		fputs_without_html_tag(buf, stdout);
590 	    }
591 	    if (j <= 0) {
592                 if (!MoreShortFormat) {
593                     printf("%d. ", i + 1);
594                 }
595 		continue;
596 	    }
597 	    if (j == SCORE_LINE)
598 		printf(" (score: %d)", hlist.scr[i]);
599 	    fputc('\n', stdout);
600 	}
601     }
602     filesclose2();
603 }
604 
605 
606 
607 /*
608  * reverse the hlist
609  * original of this routine was contributed by Furukawa-san [11/13/1997]
610  */
reverse_hlist(HLIST hlist)611 void reverse_hlist(HLIST hlist)
612 {
613     int m, n;
614     HLIST tmp;
615 
616     malloc_hlist(&tmp, 1);
617     m = 0;
618     n = hlist.n - 1;
619     while (m < n) {
620 	copy_hlist(tmp, 0, hlist, m);
621 	copy_hlist(hlist, m, hlist, n);
622 	copy_hlist(hlist, n, tmp, 0);
623 	m++;
624 	n--;
625     }
626     free_hlist(tmp);
627 }
628