1 /*
2 *
3 * hlist.c -
4 *
5 * Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
6 * This is free software with ABSOLUTELY NO WARRANTY.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
21 * 02111-1307, USA
22 *
23 * This file must be encoded in EUC-JP encoding
24 *
25 */
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30 #include <math.h>
31 #include "namazu.h"
32 #include "util.h"
33
34 /* merge the left and right with AND rule */
andmerge(HLIST left,HLIST right,int * ignore)35 HLIST andmerge(HLIST left, HLIST right, int *ignore)
36 {
37 int i, j, v;
38
39 if (*ignore && left.n > 0)
40 return left;
41 if (*ignore && right.n > 0)
42 return right;
43
44 if (left.n <= 0)
45 return left;
46 if (right.n <= 0)
47 return right;
48
49 for (v = 0, i = 0, j = 0; i < left.n; i++) {
50 for (;; j++) {
51 if (j >= right.n)
52 goto OUT;
53 if (left.fid[i] < right.fid[j])
54 break;
55 if (left.fid[i] == right.fid[j]) {
56
57 copy_hlist(left, v, left, i);
58 if (TfIdf) {
59 left.scr[v] = left.scr[i] + right.scr[j];
60 } else {
61 /* assign a smaller number, left or right*/
62 left.scr[v] = left.scr[i] < right.scr[j] ?
63 left.scr[i] : right.scr[j];
64 }
65 v++;
66 j++;
67 break;
68 }
69 }
70 }
71 OUT:
72 free_hlist(right);
73 left.n = v;
74 if (left.n <= 0)
75 free_hlist(left);
76 return left;
77 }
78
79
80 /* merge the left and right with NOT rule */
notmerge(HLIST left,HLIST right,int * ignore)81 HLIST notmerge(HLIST left, HLIST right, int *ignore)
82 {
83 int i, j, v, f;
84
85 if (*ignore && left.n > 0)
86 return left;
87 if (*ignore && right.n > 0)
88 return right;
89
90 if (right.n <= 0)
91 return left;
92 if (left.n <= 0)
93 return left;
94
95 for (v = 0, i = 0, j = 0; i < left.n; i++) {
96 for (f = 0; j < right.n; j++) {
97 if (left.fid[i] < right.fid[j])
98 break;
99 if (left.fid[i] == right.fid[j]) {
100 j++;
101 f = 1;
102 break;
103 }
104 }
105 if (!f) {
106 copy_hlist(left, v, left, i);
107 v++;
108 }
109 }
110 free_hlist(right);
111 left.n = v;
112 if (left.n <= 0)
113 free_hlist(left);
114 return left;
115 }
116
117
118 /*
119 * merge the left and right with OR rule
120 */
ormerge(HLIST left,HLIST right)121 HLIST ormerge(HLIST left, HLIST right)
122 {
123 int i, j, v, n;
124 HLIST val;
125
126 if (left.n <= 0 && right.n <= 0)
127 return left;
128 if (left.n <= 0)
129 return right;
130 if (right.n <= 0)
131 return left;
132
133 n = left.n + right.n;
134
135 malloc_hlist(&val, n);
136
137 for (v = 0, i = 0, j = 0; i < left.n; i++) {
138 for (; left.fid[i] >= right.fid[j] && j < right.n; j++) {
139 if (left.fid[i] == right.fid[j]) {
140
141 if (TfIdf) {
142 left.scr[i] = left.scr[i] + right.scr[j];
143 } else {
144 /* assign a large number, left or right */
145 left.scr[i] = left.scr[i] > right.scr[j] ?
146 left.scr[i] : right.scr[j];
147 }
148 j++;
149 break;
150 } else {
151 copy_hlist(val, v, right, j);
152 v++;
153 }
154 }
155 copy_hlist(val, v, left, i);
156 v++;
157 }
158
159 for (; j < right.n; j++) {
160 copy_hlist(val, v, right, j);
161 v++;
162 }
163
164 free_hlist(left);
165 free_hlist(right);
166 val.n = v;
167 return val;
168 }
169
malloc_hlist(HLIST * hlist,int n)170 void malloc_hlist(HLIST * hlist, int n)
171 {
172 if (n <= 0) return;
173 hlist->fid = (int *)malloc(n * sizeof(int));
174 if (hlist->fid == NULL) {
175 error("malloc_hlist");
176 }
177 hlist->scr = (int *)malloc(n * sizeof(int));
178 if (hlist->scr == NULL) {
179 error("malloc_hlist");
180 }
181 hlist->did = (int *)malloc(n * sizeof(int));
182 if (hlist->did == NULL) {
183 error("malloc_hlist");
184 }
185 hlist->date = (int *)malloc(n * sizeof(int));
186 if (hlist->date == NULL) {
187 error("malloc_hlist");
188 }
189 }
190
realloc_hlist(HLIST * hlist,int n)191 void realloc_hlist(HLIST * hlist, int n)
192 {
193 if (n <= 0) return;
194 hlist->fid = (int *) realloc(hlist->fid, n * sizeof(int));
195 if (hlist->fid == NULL) {
196 error("realloc_hlist");
197 }
198 hlist->scr = (int *) realloc(hlist->scr, n * sizeof(int));
199 if (hlist->scr == NULL) {
200 error("realloc_hlist");
201 }
202 hlist->did = (int *) realloc(hlist->did, n * sizeof(int));
203 if (hlist->did == NULL) {
204 error("realloc_hlist");
205 }
206 hlist->date = (int *) realloc(hlist->date, n * sizeof(int));
207 if (hlist->date == NULL) {
208 error("realloc_hlist");
209 }
210 }
211
free_hlist(HLIST hlist)212 void free_hlist(HLIST hlist)
213 {
214 if (hlist.n <= 0) return;
215 free(hlist.fid);
216 free(hlist.scr);
217 free(hlist.did);
218 free(hlist.date);
219 }
220
copy_hlist(HLIST to,int n_to,HLIST from,int n_from)221 void copy_hlist(HLIST to, int n_to, HLIST from, int n_from)
222 {
223 to.fid[n_to] = from.fid[n_from];
224 to.scr[n_to] = from.scr[n_from];
225 to.did[n_to] = from.did[n_from];
226 to.date[n_to] = from.date[n_from];
227 }
228
memcpy_hlist(HLIST to,HLIST from,int n)229 void memcpy_hlist(HLIST to, HLIST from, int n)
230 {
231 memcpy(to.fid + n, from.fid, from.n * sizeof (int));
232 memcpy(to.scr + n, from.scr, from.n * sizeof (int));
233 memcpy(to.did + n, from.did, from.n * sizeof (int));
234 memcpy(to.date + n, from.date, from.n * sizeof (int));
235 }
236
set_did_hlist(HLIST hlist,int id)237 void set_did_hlist(HLIST hlist, int id)
238 {
239 int i;
240 for (i = 0; i < hlist.n; i++) {
241 hlist.did[i] = id;
242 }
243 }
244
merge_hlist(HLIST * hlists)245 HLIST merge_hlist(HLIST *hlists)
246 {
247 int i, n;
248 HLIST value;
249
250 if (DbNumber == 1) return hlists[0];
251 for(i = n = 0; i < DbNumber; i++) {
252 if (hlists[i].n > 0) {
253 n += hlists[i].n;
254 }
255 }
256 malloc_hlist(&value, n);
257 for(i = n = 0; i < DbNumber; i++) {
258 if (hlists[i].n <= 0)
259 continue;
260 memcpy_hlist(value, hlists[i], n);
261 n += hlists[i].n;
262 free_hlist(hlists[i]);
263 }
264 value.n = n;
265 return value;
266 }
267
set_date_zero_all(HLIST hlist)268 void set_date_zero_all(HLIST hlist)
269 {
270 int i;
271
272 for (i = 0; i < hlist.n; i++) {
273 hlist.date[i] = 0;
274 }
275 }
276
277 /* get date info from NMZ.t and do the missing number processing */
do_date_processing(HLIST hlist)278 HLIST do_date_processing(HLIST hlist)
279 {
280 FILE *date_index;
281 int i;
282
283 date_index = fopen(DATEINDEX, "rb");
284 if (date_index == NULL) {
285 if (Debug) {
286 fprintf(stderr, "%s: cannot open file.\n", DATEINDEX);
287 }
288 set_date_zero_all(hlist);
289 return hlist; /* error */
290 }
291
292 for (i = 0; i < hlist.n ; i++) {
293 if (-1 == fseek(date_index, hlist.fid[i] * sizeof(hlist.date[i]), 0)) {
294 set_date_zero_all(hlist);
295 return hlist; /* error */
296 }
297 freadx(&hlist.date[i], sizeof(hlist.date[i]), 1, date_index);
298
299 if (hlist.date[i] == -1) {
300 /* the missing number, this document has been deleted */
301 int j;
302
303 for (j = i + 1; j < hlist.n; j++) { /* shift */
304 copy_hlist(hlist, j - 1, hlist, j);
305 }
306 hlist.n--;
307 i--;
308 }
309 }
310
311 fclose(date_index);
312 return hlist;
313 }
314
315 /* get the hit list */
get_hlist(int index)316 HLIST get_hlist(int index)
317 {
318 int n, *buf, i;
319 HLIST hlist;
320 double idf = 0;
321 uchar tmp[BUFSIZE];
322 hlist.n = 0;
323
324 if (-1 == fseek(Index, get_index_pointer(IndexIndex, index), 0))
325 return hlist; /* error */
326
327 fgets(tmp, BUFSIZE, Index); /* read and dispose */
328
329 freadx(&n, sizeof(int), 1, Index);
330
331 if (TfIdf) {
332 idf = log((double)AllDocumentN / (n/2)) / log(2);
333 if (Debug)
334 fprintf(stderr, "idf: %f (N:%d, n:%d)\n", idf, AllDocumentN, n/2);
335 }
336
337 if (n >= IGNORE_HIT * 2) {
338 /* '* 2' means NMZ.i contains a file-ID and a score. */
339 hlist.n = TOO_MUCH_HIT;
340 } else {
341 buf = (int *) malloc(n * sizeof(int));
342 if (buf == NULL) {
343 error("hlist");
344 }
345 malloc_hlist(&hlist, n / 2);
346 freadx(buf, sizeof(int), n, Index);
347 for (i = 0; i < n; i += 2) {
348 hlist.fid[i / 2] = *(buf + i);
349 hlist.scr[i / 2] = *(buf + i + 1);
350 if (TfIdf) {
351 hlist.scr[i / 2] = (int)(hlist.scr[i / 2] * idf) + 1;
352 }
353 }
354 free(buf);
355 hlist.n = n / 2;
356 hlist = do_date_processing(hlist);
357 }
358 return hlist;
359 }
360
361
362 /* use merge sort a stable sort algorithm
363 * original of this code was contributed by Furukawa-san [11/13/1997]
364 */
365
366 #define SORT_BY_SCORE 1
367 #define SORT_BY_DATE 2
368
nmz_mergesort(int first,int last,HLIST hlist,HLIST work,int mode)369 void nmz_mergesort(int first, int last, HLIST hlist, HLIST work, int mode)
370 {
371 int middle;
372 static int i, j, k, p;
373
374 if (first < last) {
375 middle = (first + last) / 2;
376 nmz_mergesort(first, middle, hlist, work, mode);
377 nmz_mergesort(middle + 1, last, hlist, work, mode);
378 p = 0;
379 for (i = first; i <= middle; i++) {
380 copy_hlist(work, p, hlist, i);
381 p++;
382 }
383 i = middle + 1;
384 j = 0;
385 k = first;
386 while (i <= last && j < p) {
387 int bool = 0;
388
389 if (mode == SORT_BY_SCORE) {
390 if (work.scr[j] >= hlist.scr[i]) {
391 bool = 1;
392 }
393 } else if (mode == SORT_BY_DATE){
394 if (work.date[j] >= hlist.date[i]) {
395 bool = 1;
396 }
397 }
398 if (bool) {
399 copy_hlist(hlist, k, work, j);
400 k++;
401 j++;
402 } else {
403 copy_hlist(hlist, k, hlist, i);
404 k++;
405 i++;
406 }
407 }
408 while (j < p) {
409 copy_hlist(hlist, k, work, j);
410 k++;
411 j++;
412 }
413 }
414 }
415
416
417 /* interface to invoke merge sort function */
sort_hlist(HLIST hlist,char * mode)418 void sort_hlist(HLIST hlist, char *mode)
419 {
420 HLIST work;
421 malloc_hlist(&work, hlist.n);
422
423 if (! strcmp(mode, "score")) {
424 nmz_mergesort(0, hlist.n - 1, hlist, work, SORT_BY_SCORE);
425 } else if (! strcmp(mode, "date")) {
426 nmz_mergesort(0, hlist.n - 1, hlist, work, SORT_BY_DATE);
427 }
428 free_hlist(work);
429 }
430
erase_url(uchar * s)431 void erase_url(uchar *s)
432 {
433 int i, j;
434 uchar tmp[BUFSIZE];
435
436 strcpy(tmp, s);
437
438 if (!strncmp("<DD><A HREF=\"", &tmp[0], 13)) {
439 for (i = 13; tmp[i] != '>'; i++)
440 ;
441 strcpy(tmp, "<DD>");
442 for (i++, j = 4; tmp[i]; i++, j++) {
443 if (!strncmp("</A>", &tmp[i], 4))
444 i += 4;
445 s[j] = tmp[i];
446 }
447 s[j] = '\0';
448 }
449 }
450
451 /* replace a URL */
replace_url(uchar * s,int opt)452 void replace_url(uchar * s, int opt)
453 {
454 int n;
455 int n_from, n_to, i, j;
456 uchar tmp[BUFSIZE];
457
458 strcpy(tmp, s);
459
460 for(n=0;n<url_no;n++) {
461 n_from = strlen(URL_REPLACE_FROM[n]);
462 n_to = strlen(URL_REPLACE_TO[n]);
463
464 if (!strncmp(URL_REPLACE_FROM[n], tmp, n_from)) {
465 strcpy(s, URL_REPLACE_TO[n]);
466 for (i = n_from, j = n_to; tmp[i] != '>'; i++, j++)
467 s[j] = tmp[i];
468 s[j++] = tmp[i++];
469 if (opt && !strncmp(URL_REPLACE_FROM[n], tmp + i, n_from)) {
470 strcpy(s + j, URL_REPLACE_TO[n]);
471 i += n_from;
472 j += n_to;
473 }
474 for (; tmp[i]; i++, j++)
475 s[j] = tmp[i];
476 s[j] = '\0';
477 }
478 }
479 }
480
481
make_fullpathname_flist(int n)482 void make_fullpathname_flist(int n)
483 {
484 uchar *base;
485
486 base = DbNames[n];
487
488 pathcat(base, FLIST);
489 pathcat(base, FLISTINDEX);
490 }
491
open_files2(int n)492 void open_files2(int n)
493 {
494 make_fullpathname_flist(n);
495 Flist = fopen(FLIST, "rb");
496 if (Flist == NULL) {
497 error(FLIST);
498 }
499 FlistIndex = fopen(FLISTINDEX, "rb");
500 if (FlistIndex == NULL) {
501 error(FLISTINDEX);
502 }
503 }
504
filesclose2()505 void filesclose2()
506 {
507 fclose(Flist);
508 fclose(FlistIndex);
509 }
510
erase_size(uchar * s)511 void erase_size(uchar *s)
512 {
513 int i;
514 for (i = strlen(s) -1; i > 0; i--) {
515 if (!strncmp(s + i, "size", 4)) {
516 break;
517 }
518 }
519 *(s + i - 1) = '\0';
520 }
521
522 /* display the hlist */
put_hlist(HLIST hlist)523 void put_hlist(HLIST hlist)
524 {
525 int i, j, before_did = -1;
526 uchar buf[BUFSIZE];
527
528 if (HitCountOnly) {
529 printf("%d\n", hlist.n);
530 return;
531 }
532
533 if (hlist.n <= 0 || HListMax == 0)
534 return;
535 for (i = HListWhence; i < hlist.n; i++) {
536 if (!AllList && (i >= HListWhence + HListMax))
537 break;
538 if (hlist.did[i] != before_did) {
539 if (before_did != -1)
540 filesclose2();
541 open_files2(hlist.did[i]);
542 before_did = hlist.did[i];
543 }
544
545 fseek(Flist,get_index_pointer(FlistIndex, hlist.fid[i]), 0);
546
547 if (Debug) {
548 printf("[[ %d ]]\n", hlist.fid[i]);
549 }
550 for (j = 0;; j++) {
551 if (!fgets(buf, BUFSIZE, Flist) || *buf == '\n') {
552 if (HtmlOutput)
553 fputc('\n', stdout);
554 break;
555 }
556 if (j == ABSTRACT_LINE && (ShortFormat || MoreShortFormat))
557 continue;
558 if (j == SCORE_LINE && MoreShortFormat)
559 continue;
560
561 *(lastc(buf)) = '\0';
562
563 if (MoreShortFormat) {
564 /* erase a message of size (XXX bytes) (stupid processing)*/
565 erase_size(buf);
566 }
567 if (!NoReplace && URL_REPLACE_FROM[0]) { /* replace a URL */
568 if (!strncmp("<DD><A HREF=\"", buf, 13)) {
569 replace_url(&buf[13], 1);
570 } else if (!strncmp("<STRONG><A HREF=\"", buf, 17)) {
571 replace_url(&buf[17], 0);
572 }
573 }
574 if (!HtmlOutput && DecodeURL &&
575 !strncmp("<DD><A HREF=\"", buf, 13)) {
576 decode_url_string(&buf[13]);
577 }
578 if (ModeTknamazu) {
579 erase_url(buf);
580 }
581
582 if (HtmlOutput) {
583 /* as NMZ.f is encoded with output inteded encoding,
584 fputs as is without codeconv and etc processing */
585 fputs(buf, stdout);
586 }
587 else {
588 jistoeuc(buf);
589 fputs_without_html_tag(buf, stdout);
590 }
591 if (j <= 0) {
592 if (!MoreShortFormat) {
593 printf("%d. ", i + 1);
594 }
595 continue;
596 }
597 if (j == SCORE_LINE)
598 printf(" (score: %d)", hlist.scr[i]);
599 fputc('\n', stdout);
600 }
601 }
602 filesclose2();
603 }
604
605
606
607 /*
608 * reverse the hlist
609 * original of this routine was contributed by Furukawa-san [11/13/1997]
610 */
reverse_hlist(HLIST hlist)611 void reverse_hlist(HLIST hlist)
612 {
613 int m, n;
614 HLIST tmp;
615
616 malloc_hlist(&tmp, 1);
617 m = 0;
618 n = hlist.n - 1;
619 while (m < n) {
620 copy_hlist(tmp, 0, hlist, m);
621 copy_hlist(hlist, m, hlist, n);
622 copy_hlist(hlist, n, tmp, 0);
623 m++;
624 n--;
625 }
626 free_hlist(tmp);
627 }
628