1 /*
2  *
3  * hlist.c -
4  *
5  * $Id: hlist.c,v 1.51.4.22 2008-03-06 15:38:08 opengl2772 Exp $
6  *
7  * Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
8  * Copyright (C) 2000-2008 Namazu Project All rights reserved.
9  * This is free software with ABSOLUTELY NO WARRANTY.
10  *
11  * This program is free software; you can redistribute it and/or modify
12  * it under the terms of the GNU General Public License as published by
13  * the Free Software Foundation; either version 2 of the License, or
14  * (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19  * GNU General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public License
22  * along with this program; if not, write to the Free Software
23  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24  * 02111-1307, USA
25  *
26  * This file must be encoded in EUC-JP encoding
27  *
28  */
29 
30 #ifdef HAVE_CONFIG_H
31 #  include "config.h"
32 #endif
33 #ifdef HAVE_SUPPORT_H
34 #  include "support.h"
35 #endif
36 
37 #include <stdio.h>
38 #ifdef HAVE_STDLIB_H
39 #  include <stdlib.h>
40 #endif
41 #include <math.h>
42 
43 #ifdef HAVE_ERRNO_H
44 #  include <errno.h>
45 #endif
46 
47 #ifdef HAVE_UNISTD_H
48 #include <unistd.h>
49 #else
50 # ifdef _WIN32
51 # include <io.h>
52 # endif
53 #endif
54 
55 #ifdef HAVE_STRING_H
56 #  include <string.h>
57 #else
58 #  include <strings.h>
59 #endif
60 
61 #include "libnamazu.h"
62 #include "util.h"
63 #include "hlist.h"
64 #include "field.h"
65 #include "var.h"
66 #include "idxname.h"
67 #include "search.h"
68 #include "query.h"
69 #include "score.h"
70 
71 static int document_number = 0;  /* Number of documents covered in a target index */
72 static char field_for_sort[BUFSIZE] = "";  /* field_for_sort name used with sorting */
73 
74 /*
75  *
76  * Private functions
77  *
78  */
79 
80 static void memcpy_hlist(NmzResult, NmzResult, int);
81 static void set_rank(NmzResult);
82 static int  field_sort(NmzResult);
83 static int  field_scmp(const void*, const void*);
84 static int  field_ncmp(const void*, const void*);
85 static int  date_cmp(const void*, const void*);
86 static int  score_cmp(const void*, const void*);
87 static int  intcmp(int v1, int v2);
88 
89 static void
memcpy_hlist(NmzResult to,NmzResult from,int n)90 memcpy_hlist(NmzResult to, NmzResult from, int n)
91 {
92     memcpy(to.data + n,  from.data,  from.num * sizeof (to.data[0]));
93 }
94 
95 static void
set_rank(NmzResult hlist)96 set_rank(NmzResult hlist)
97 {
98     int i;
99 
100     /* Set rankings in descending order. */
101     for (i = 0 ; i < hlist.num; i++) {
102         hlist.data[i].rank = hlist.num - i;
103     }
104 }
105 
106 static int
field_sort(NmzResult hlist)107 field_sort(NmzResult hlist)
108 {
109     int i, numeric = 1;
110 
111     for (i = 0; i < hlist.num; i++) {
112 	char buf[BUFSIZE];
113 	size_t leng;
114 
115 	nmz_get_field_data(hlist.data[i].idxid,
116 			   hlist.data[i].docid, field_for_sort, buf);
117 	nmz_chomp(buf);
118 	leng = strlen(buf);
119 
120 	if (numeric == 1 && ! nmz_isnumstr(buf)) {
121 	    numeric = 0;
122 	}
123 
124 	hlist.data[i].field = malloc(leng + 1);
125 	if (hlist.data[i].field == NULL) {
126             int j;
127             for (j = 0; j < i; j++) {
128 	        free(hlist.data[j].field);
129 	        hlist.data[j].field = NULL;
130             }
131 	    nmz_set_dyingmsg(nmz_msg("%s", strerror(errno)));
132 	    return FAILURE;
133 	}
134 	strcpy(hlist.data[i].field, buf);
135     }
136 
137     if (numeric == 1) {
138 	qsort(hlist.data, hlist.num, sizeof(hlist.data[0]), field_ncmp);
139 
140     } else {
141 	qsort(hlist.data, hlist.num, sizeof(hlist.data[0]), field_scmp);
142     }
143 
144     for (i = 0; i < hlist.num; i++) {
145 	free(hlist.data[i].field);
146 	hlist.data[i].field = NULL;
147     }
148     return 0;
149 }
150 
151 /*
152  * Compare a pair of hlist.data[].field as string in descending order.
153  */
154 static int
field_scmp(const void * p1,const void * p2)155 field_scmp(const void *p1, const void *p2)
156 {
157     struct nmz_data *v1, *v2;
158     int r;
159 
160     v1 = (struct nmz_data *) p1;
161     v2 = (struct nmz_data *) p2;
162 
163     r = strcmp(v2->field, v1->field);
164     if (r == 0) {
165 	r = intcmp(v2->rank, v1->rank);
166     }
167     return r;
168 }
169 
170 /*
171  * Compare a pair of hlist.data[].field as number in descending order.
172  */
173 static int
field_ncmp(const void * p1,const void * p2)174 field_ncmp(const void *p1, const void *p2)
175 {
176     struct nmz_data *v1, *v2;
177     int r;
178 
179     v1 = (struct nmz_data *) p1;
180     v2 = (struct nmz_data *) p2;
181 
182     r = intcmp(atoi(v2->field), atoi(v1->field));
183     if (r ==0) {
184 	r = intcmp(v2->rank, v1->rank);
185     }
186     return r;
187 }
188 
189 
190 /*
191  * Compare a pair of hlist.data[].score as number in descending order.
192  */
193 static int
score_cmp(const void * p1,const void * p2)194 score_cmp(const void *p1, const void *p2)
195 {
196     struct nmz_data *v1, *v2;
197     int r;
198 
199     v1 = (struct nmz_data *) p1;
200     v2 = (struct nmz_data *) p2;
201 
202     r = intcmp(v2->score, v1->score);
203     if (r == 0) {
204 	r = intcmp(v2->rank, v1->rank);
205     }
206     return r;
207     /* Return (r = v2->score - v1->score) ? r : v2->rank - v1->rank; */
208 }
209 
210 /*
211  * Compare a pair of hlist.data[].date as number in descending order.
212  */
213 static int
date_cmp(const void * p1,const void * p2)214 date_cmp(const void *p1, const void *p2)
215 {
216     struct nmz_data *v1, *v2;
217     int r;
218 
219     v1 = (struct nmz_data *) p1;
220     v2 = (struct nmz_data *) p2;
221 
222     r = intcmp(v2->date, v1->date);
223     if (r == 0) {
224 	r = intcmp(v2->rank, v1->rank);
225     }
226     return r;
227 }
228 
229 /*
230  * This is a safe routine for comparing two integers.  It
231  * could be simply v1 - v2 but it would produce an incorrect
232  * answer if v2 is large and positive and v1 is large and
233  * negative or vice verse.
234  *
235  * See page 36 of "The Practice of Programming" by Brian
236  * W. Kernighan and Rob Pike for details.
237  */
238 static int
intcmp(int v1,int v2)239 intcmp(int v1, int v2)
240 {
241     if (v1 < v2) {
242 	return -1;
243     } else if (v1 == v2) {
244 	return 0;
245     } else {
246 	return 1;
247     }
248 }
249 
250 /*
251  *
252  * Public functions
253  *
254  */
255 
256 /*
257  * Merge the left and  right with AND rule.
258  */
259 NmzResult
nmz_andmerge(NmzResult left,NmzResult right,int * ignore)260 nmz_andmerge(NmzResult left, NmzResult right, int *ignore)
261 {
262     int i, j, v;
263 
264     if (left.stat == ERR_TOO_MUCH_MATCH || left.stat == ERR_TOO_MUCH_HIT) {
265 	nmz_free_hlist(left);
266 	return right;
267     }
268 
269     if (right.stat == ERR_TOO_MUCH_MATCH || right.stat == ERR_TOO_MUCH_HIT) {
270 	nmz_free_hlist(right);
271 	return left;
272     }
273 
274     if (left.stat != SUCCESS || left.num <= 0) {
275 	nmz_free_hlist(right);
276 	return left;
277     }
278     if (right.stat != SUCCESS || right.num <= 0) {
279 	nmz_free_hlist(left);
280 	return right;
281     }
282 
283     for (v = 0, i = 0, j = 0; i < left.num; i++) {
284 	for (;; j++) {
285 	    if (j >= right.num)
286 		goto OUT;
287 	    if (left.data[i].docid < right.data[j].docid)
288 		break;
289 	    if (left.data[i].docid == right.data[j].docid) {
290 
291                 if (v != i) {
292                     nmz_copy_hlist(left, v, left, i);
293                 }
294 
295                 if (nmz_is_tfidfmode()) {
296                     left.data[v].score =
297 			left.data[i].score + right.data[j].score;
298                 } else {
299                     /* Assign a smaller number, left or right*/
300                     left.data[v].score =
301 			left.data[i].score < right.data[j].score ?
302                         left.data[i].score : right.data[j].score;
303                 }
304 		v++;
305 		j++;
306 		break;
307 	    }
308 	}
309     }
310   OUT:
311     nmz_free_hlist(right);
312     left.num = v;
313     if (left.stat != SUCCESS || left.num <= 0)
314 	nmz_free_hlist(left);
315     return left;
316 }
317 
318 
319 /*
320  * Merge the left and  right with NOT rule.
321  */
322 NmzResult
nmz_notmerge(NmzResult left,NmzResult right,int * ignore)323 nmz_notmerge(NmzResult left, NmzResult right, int *ignore)
324 {
325     int i, j, v, f;
326 
327     if (ignore && *ignore && left.num > 0) {
328 	nmz_free_hlist(right);
329 	return left;
330     }
331     if (ignore && *ignore && right.num > 0) {
332 	nmz_free_hlist(left);
333 	return right;
334     }
335 
336     if (right.stat != SUCCESS || right.num <= 0) {
337 	nmz_free_hlist(right);
338 	return left;
339     }
340     if (left.stat != SUCCESS || left.num <= 0) {
341 	nmz_free_hlist(right);
342 	return left;
343     }
344 
345     for (v = 0, i = 0, j = 0; i < left.num; i++) {
346 	for (f = 0; j < right.num; j++) {
347 	    if (left.data[i].docid < right.data[j].docid)
348 		break;
349 	    if (left.data[i].docid == right.data[j].docid) {
350 		j++;
351 		f = 1;
352 		break;
353 	    }
354 	}
355 	if (!f) {
356             if (v != i) {
357 	        nmz_copy_hlist(left, v, left, i);
358             }
359 	    v++;
360 	}
361     }
362     nmz_free_hlist(right);
363     left.num = v;
364     if (left.stat != SUCCESS || left.num <= 0)
365 	nmz_free_hlist(left);
366     return left;
367 }
368 
369 
370 /*
371  * merge the left and  right with OR rule
372  */
373 NmzResult
nmz_ormerge(NmzResult left,NmzResult right)374 nmz_ormerge(NmzResult left, NmzResult right)
375 {
376     int i, j, v, n;
377     NmzResult val;
378 
379     val.num  = 0;
380     val.data = NULL;
381     val.stat = SUCCESS;
382 
383     if ((left.stat  != SUCCESS || left.num <= 0) &&
384 	(right.stat != SUCCESS || right.num <= 0))
385     {
386 	nmz_free_hlist(right);
387 	return left;
388     }
389     if (left.stat != SUCCESS || left.num <= 0) {
390 	nmz_free_hlist(left);
391 	return right;
392     }
393     if (right.stat != SUCCESS || right.num <= 0){
394 	nmz_free_hlist(right);
395 	return left;
396     }
397 
398     n = left.num + right.num;
399 
400     nmz_malloc_hlist(&val, n);
401     if (val.stat == ERR_FATAL) {
402 	nmz_free_hlist(left);
403 	nmz_free_hlist(right);
404         return val;
405     }
406 
407     for (v = 0, i = 0, j = 0; i < left.num; i++) {
408 	for (; j < right.num; j++) {
409 	    if (left.data[i].docid < right.data[j].docid) {
410                 break;
411             } else if (left.data[i].docid == right.data[j].docid) {
412 
413                 if (nmz_is_tfidfmode()) {
414                     left.data[i].score =
415 			left.data[i].score + right.data[j].score;
416                 } else {
417                     /* Assign a large number, left or right */
418                     left.data[i].score =
419 			left.data[i].score > right.data[j].score ?
420                         left.data[i].score : right.data[j].score;
421                 }
422 		j++;
423 		break;
424 	    } else {
425 		nmz_copy_hlist(val, v, right, j);
426 		v++;
427 	    }
428 	}
429 	nmz_copy_hlist(val, v, left, i);
430 	v++;
431     }
432 
433     for (; j < right.num; j++) {
434 	nmz_copy_hlist(val, v, right, j);
435 	v++;
436     }
437 
438     nmz_free_hlist(left);
439     nmz_free_hlist(right);
440     val.num = v;
441     return val;
442 }
443 
444 void
nmz_malloc_hlist(NmzResult * hlist,int n)445 nmz_malloc_hlist(NmzResult * hlist, int n)
446 {
447     hlist->num  = 0;
448     hlist->data = NULL;
449     hlist->stat = SUCCESS;
450 
451     if (n <= 0) return;
452     hlist->data = malloc(n * sizeof(struct nmz_data));
453     if (hlist->data == NULL) {
454 	 nmz_set_dyingmsg(nmz_msg("%s", strerror(errno)));
455 	 hlist->stat = ERR_FATAL;
456 	 return;
457     }
458 
459     hlist->num  = n;
460     hlist->stat = SUCCESS;
461 }
462 
463 void
nmz_realloc_hlist(NmzResult * hlist,int n)464 nmz_realloc_hlist(NmzResult * hlist, int n)
465 {
466     if (hlist->stat != SUCCESS || n <= 0) return;
467     hlist->data = realloc(hlist->data, n * sizeof(struct nmz_data));
468     if (hlist->data == NULL) {
469 	 nmz_set_dyingmsg(nmz_msg("%s", strerror(errno)));
470 	 hlist->stat = ERR_FATAL;
471     }
472 }
473 
474 void
nmz_free_hlist(NmzResult hlist)475 nmz_free_hlist(NmzResult hlist)
476 {
477     if (hlist.stat != SUCCESS || hlist.num <= 0) return;
478     free(hlist.data);
479 }
480 
481 void
nmz_copy_hlist(NmzResult to,int n_to,NmzResult from,int n_from)482 nmz_copy_hlist(NmzResult to, int n_to, NmzResult from, int n_from)
483 {
484     to.data[n_to] = from.data[n_from];
485 }
486 
487 void
nmz_set_idxid_hlist(NmzResult hlist,int id)488 nmz_set_idxid_hlist(NmzResult hlist, int id)
489 {
490     int i;
491     for (i = 0; i < hlist.num; i++) {
492         hlist.data[i].idxid = id;
493     }
494 }
495 
496 NmzResult
nmz_merge_hlist(NmzResult * hlists)497 nmz_merge_hlist(NmzResult *hlists)
498 {
499     int i, n;
500     NmzResult value;
501 
502     value.num  = 0;
503     value.data = NULL;
504     value.stat = SUCCESS;
505 
506     if (nmz_get_idxnum() == 1) {
507 	return hlists[0];
508     }
509 
510     for(i = n = 0; i < nmz_get_idxnum(); i++) {
511         if (hlists[i].stat == SUCCESS && hlists[i].num > 0) {
512             n += hlists[i].num;
513         }
514     }
515     nmz_malloc_hlist(&value, n);
516     if (value.stat == ERR_FATAL) {
517         for(i = 0; i < nmz_get_idxnum(); i++) {
518             if (hlists[i].stat != SUCCESS || hlists[i].num <= 0)
519                 continue;
520             nmz_free_hlist(hlists[i]);
521         }
522         return value;
523     }
524 
525     for(i = n = 0; i < nmz_get_idxnum(); i++) {
526         if (hlists[i].stat != SUCCESS || hlists[i].num <= 0)
527             continue;
528         memcpy_hlist(value, hlists[i], n);
529         n += hlists[i].num;
530         nmz_free_hlist(hlists[i]);
531     }
532     value.stat = SUCCESS;
533     value.num = n;
534     return value;
535 }
536 
537 /*
538  * Get date info from NMZ.t and do the missing number processing.
539  */
540 NmzResult
nmz_do_date_processing(NmzResult hlist)541 nmz_do_date_processing(NmzResult hlist)
542 {
543     FILE *date_index;
544     int i, v;
545 
546     date_index = fopen(NMZ.t, "rb");
547     if (date_index == NULL) {
548 	nmz_set_dyingmsg(nmz_msg("%s: %s", NMZ.t, strerror(errno)));
549 	hlist.stat = ERR_FATAL;
550         return hlist; /* error */
551     }
552 
553     for (i = 0; i < hlist.num; i++) {
554         if (fseek(date_index,
555                 hlist.data[i].docid * sizeof(hlist.data[i].date), 0) != 0)
556 	{
557 	    nmz_set_dyingmsg(nmz_msg("%s: %s", NMZ.t, strerror(errno)));
558 	    hlist.stat = ERR_FATAL;
559             fclose(date_index);
560             return hlist; /* error */
561         }
562         nmz_fread(&hlist.data[i].date,
563 		  sizeof(hlist.data[i].date), 1, date_index);
564     }
565 
566     fclose(date_index);
567 
568     for (v = 0, i = 0; i < hlist.num; i++) {
569         if (hlist.data[i].date == -1) {
570             /* The missing number, this document has been deleted */
571         } else {
572             if (v != i) {
573                 nmz_copy_hlist(hlist, v, hlist, i);
574             }
575             v++;
576         }
577     }
578     hlist.num = v;
579 
580     return hlist;
581 }
582 
583 /*
584  * Get the hit list.
585  */
586 NmzResult
nmz_get_hlist(int index)587 nmz_get_hlist(int index)
588 {
589     NmzResult hlist;
590 
591     hlist.num  = 0;
592     hlist.data = NULL;
593     hlist.stat = SUCCESS;
594 
595     if (fseek(Nmz.i, nmz_getidxptr(Nmz.ii, index), 0) != 0) {
596 	hlist.stat = ERR_FATAL;
597 	return hlist; /* error */
598     }
599 
600     {
601         int n, *buf, i;
602         double idf = 1.0;
603 	int sum = 0;
604 	int hit;
605 	int bersize;
606 	int totalsize;
607 
608         nmz_get_unpackw(Nmz.i, &bersize);
609 
610 	hit = bersize;
611 	buf = malloc(hit * sizeof(int));
612 	if (buf == NULL) {
613 	    nmz_set_dyingmsg(nmz_msg("%s", strerror(errno)));
614 	    hlist.data = NULL;
615 	    hlist.stat = ERR_FATAL;
616 	    return hlist;
617 	}
618 
619         n = 0;
620         totalsize = 0;
621         while (totalsize < bersize) {
622             totalsize += nmz_get_unpackw(Nmz.i, &buf[n]);
623             n++;
624         }
625         n /= 2;
626 
627         if (nmz_is_tfidfmode() &&
628 	    (nmz_get_querytokennum() > 1
629 	     /* 0th token is a phrase. */
630 	     || strchr(nmz_get_querytoken(0), '\t') != NULL))
631         {
632             idf = log((double)document_number / n) / log(2);
633 	    nmz_debug_printf("idf: %f (N:%d, n:%d)\n", idf, document_number, n);
634         }
635 
636 	nmz_malloc_hlist(&hlist, n);
637 	if (hlist.stat == ERR_FATAL) {
638 	    free(buf);
639 	    return hlist;
640         }
641 
642 	for (i = 0; i < n; i++) {
643 	    hlist.data[i].docid = *(buf + i * 2) + sum;
644 	    sum = hlist.data[i].docid;
645 	    hlist.data[i].score = *(buf + i * 2 + 1);
646 	    if (nmz_is_tfidfmode()) {
647 		hlist.data[i].score = (int)(hlist.data[i].score * idf) + 1;
648 	    }
649 	}
650         hlist.num = n;
651 	free(buf);
652         hlist = nmz_do_date_processing(hlist);
653     }
654     return hlist;
655 }
656 
657 
658 /*
659  * Interface to do appropriate sorting.
660  */
661 int
nmz_sort_hlist(NmzResult hlist,enum nmz_sortmethod method)662 nmz_sort_hlist(NmzResult hlist, enum nmz_sortmethod method)
663 {
664     set_rank(hlist); /* conserve current order for STABLE sorting */
665 
666     if (method == SORT_BY_FIELD) {
667 	if (field_sort(hlist) != SUCCESS)
668 	    return FAILURE;
669     } else if (method == SORT_BY_DATE) {
670 	qsort(hlist.data, hlist.num, sizeof(hlist.data[0]), date_cmp);
671     } else if (method == SORT_BY_SCORE) {
672 	qsort(hlist.data, hlist.num, sizeof(hlist.data[0]), score_cmp);
673     }
674     return 0;
675 }
676 
677 /*
678  * Reverse the given hlist.
679  * original of this routine was contributed by Furukawa-san [1997-11-13]
680  */
681 int
nmz_reverse_hlist(NmzResult hlist)682 nmz_reverse_hlist(NmzResult hlist)
683 {
684     int m, n;
685     struct nmz_data tmp;
686 
687     m = 0;
688     n = hlist.num - 1;
689     while (m < n) {
690         /* swap */
691         tmp = hlist.data[m];
692         hlist.data[m] = hlist.data[n];
693         hlist.data[n] = tmp;
694 
695         m++;
696         n--;
697     }
698 
699     return 0;
700 }
701 
702 void
nmz_set_docnum(int n)703 nmz_set_docnum(int n)
704 {
705     document_number = n;
706 }
707 
708 void
nmz_set_sortfield(const char * field)709 nmz_set_sortfield(const char *field)
710 {
711     strncpy(field_for_sort, field, BUFSIZE - 1);
712     field_for_sort[BUFSIZE - 1] = '\0';
713 }
714 
715 char *
nmz_get_sortfield(void)716 nmz_get_sortfield(void)
717 {
718     return field_for_sort;
719 }
720 
721 int
nmz_get_docnum()722 nmz_get_docnum()
723 {
724     return document_number;
725 }
726