1 /*
2 *
3 * hlist.c -
4 *
5 * $Id: hlist.c,v 1.51.4.22 2008-03-06 15:38:08 opengl2772 Exp $
6 *
7 * Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
8 * Copyright (C) 2000-2008 Namazu Project All rights reserved.
9 * This is free software with ABSOLUTELY NO WARRANTY.
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
24 * 02111-1307, USA
25 *
26 * This file must be encoded in EUC-JP encoding
27 *
28 */
29
30 #ifdef HAVE_CONFIG_H
31 # include "config.h"
32 #endif
33 #ifdef HAVE_SUPPORT_H
34 # include "support.h"
35 #endif
36
37 #include <stdio.h>
38 #ifdef HAVE_STDLIB_H
39 # include <stdlib.h>
40 #endif
41 #include <math.h>
42
43 #ifdef HAVE_ERRNO_H
44 # include <errno.h>
45 #endif
46
47 #ifdef HAVE_UNISTD_H
48 #include <unistd.h>
49 #else
50 # ifdef _WIN32
51 # include <io.h>
52 # endif
53 #endif
54
55 #ifdef HAVE_STRING_H
56 # include <string.h>
57 #else
58 # include <strings.h>
59 #endif
60
61 #include "libnamazu.h"
62 #include "util.h"
63 #include "hlist.h"
64 #include "field.h"
65 #include "var.h"
66 #include "idxname.h"
67 #include "search.h"
68 #include "query.h"
69 #include "score.h"
70
71 static int document_number = 0; /* Number of documents covered in a target index */
72 static char field_for_sort[BUFSIZE] = ""; /* field_for_sort name used with sorting */
73
74 /*
75 *
76 * Private functions
77 *
78 */
79
80 static void memcpy_hlist(NmzResult, NmzResult, int);
81 static void set_rank(NmzResult);
82 static int field_sort(NmzResult);
83 static int field_scmp(const void*, const void*);
84 static int field_ncmp(const void*, const void*);
85 static int date_cmp(const void*, const void*);
86 static int score_cmp(const void*, const void*);
87 static int intcmp(int v1, int v2);
88
89 static void
memcpy_hlist(NmzResult to,NmzResult from,int n)90 memcpy_hlist(NmzResult to, NmzResult from, int n)
91 {
92 memcpy(to.data + n, from.data, from.num * sizeof (to.data[0]));
93 }
94
95 static void
set_rank(NmzResult hlist)96 set_rank(NmzResult hlist)
97 {
98 int i;
99
100 /* Set rankings in descending order. */
101 for (i = 0 ; i < hlist.num; i++) {
102 hlist.data[i].rank = hlist.num - i;
103 }
104 }
105
106 static int
field_sort(NmzResult hlist)107 field_sort(NmzResult hlist)
108 {
109 int i, numeric = 1;
110
111 for (i = 0; i < hlist.num; i++) {
112 char buf[BUFSIZE];
113 size_t leng;
114
115 nmz_get_field_data(hlist.data[i].idxid,
116 hlist.data[i].docid, field_for_sort, buf);
117 nmz_chomp(buf);
118 leng = strlen(buf);
119
120 if (numeric == 1 && ! nmz_isnumstr(buf)) {
121 numeric = 0;
122 }
123
124 hlist.data[i].field = malloc(leng + 1);
125 if (hlist.data[i].field == NULL) {
126 int j;
127 for (j = 0; j < i; j++) {
128 free(hlist.data[j].field);
129 hlist.data[j].field = NULL;
130 }
131 nmz_set_dyingmsg(nmz_msg("%s", strerror(errno)));
132 return FAILURE;
133 }
134 strcpy(hlist.data[i].field, buf);
135 }
136
137 if (numeric == 1) {
138 qsort(hlist.data, hlist.num, sizeof(hlist.data[0]), field_ncmp);
139
140 } else {
141 qsort(hlist.data, hlist.num, sizeof(hlist.data[0]), field_scmp);
142 }
143
144 for (i = 0; i < hlist.num; i++) {
145 free(hlist.data[i].field);
146 hlist.data[i].field = NULL;
147 }
148 return 0;
149 }
150
151 /*
152 * Compare a pair of hlist.data[].field as string in descending order.
153 */
154 static int
field_scmp(const void * p1,const void * p2)155 field_scmp(const void *p1, const void *p2)
156 {
157 struct nmz_data *v1, *v2;
158 int r;
159
160 v1 = (struct nmz_data *) p1;
161 v2 = (struct nmz_data *) p2;
162
163 r = strcmp(v2->field, v1->field);
164 if (r == 0) {
165 r = intcmp(v2->rank, v1->rank);
166 }
167 return r;
168 }
169
170 /*
171 * Compare a pair of hlist.data[].field as number in descending order.
172 */
173 static int
field_ncmp(const void * p1,const void * p2)174 field_ncmp(const void *p1, const void *p2)
175 {
176 struct nmz_data *v1, *v2;
177 int r;
178
179 v1 = (struct nmz_data *) p1;
180 v2 = (struct nmz_data *) p2;
181
182 r = intcmp(atoi(v2->field), atoi(v1->field));
183 if (r ==0) {
184 r = intcmp(v2->rank, v1->rank);
185 }
186 return r;
187 }
188
189
190 /*
191 * Compare a pair of hlist.data[].score as number in descending order.
192 */
193 static int
score_cmp(const void * p1,const void * p2)194 score_cmp(const void *p1, const void *p2)
195 {
196 struct nmz_data *v1, *v2;
197 int r;
198
199 v1 = (struct nmz_data *) p1;
200 v2 = (struct nmz_data *) p2;
201
202 r = intcmp(v2->score, v1->score);
203 if (r == 0) {
204 r = intcmp(v2->rank, v1->rank);
205 }
206 return r;
207 /* Return (r = v2->score - v1->score) ? r : v2->rank - v1->rank; */
208 }
209
210 /*
211 * Compare a pair of hlist.data[].date as number in descending order.
212 */
213 static int
date_cmp(const void * p1,const void * p2)214 date_cmp(const void *p1, const void *p2)
215 {
216 struct nmz_data *v1, *v2;
217 int r;
218
219 v1 = (struct nmz_data *) p1;
220 v2 = (struct nmz_data *) p2;
221
222 r = intcmp(v2->date, v1->date);
223 if (r == 0) {
224 r = intcmp(v2->rank, v1->rank);
225 }
226 return r;
227 }
228
229 /*
230 * This is a safe routine for comparing two integers. It
231 * could be simply v1 - v2 but it would produce an incorrect
232 * answer if v2 is large and positive and v1 is large and
233 * negative or vice verse.
234 *
235 * See page 36 of "The Practice of Programming" by Brian
236 * W. Kernighan and Rob Pike for details.
237 */
238 static int
intcmp(int v1,int v2)239 intcmp(int v1, int v2)
240 {
241 if (v1 < v2) {
242 return -1;
243 } else if (v1 == v2) {
244 return 0;
245 } else {
246 return 1;
247 }
248 }
249
250 /*
251 *
252 * Public functions
253 *
254 */
255
256 /*
257 * Merge the left and right with AND rule.
258 */
259 NmzResult
nmz_andmerge(NmzResult left,NmzResult right,int * ignore)260 nmz_andmerge(NmzResult left, NmzResult right, int *ignore)
261 {
262 int i, j, v;
263
264 if (left.stat == ERR_TOO_MUCH_MATCH || left.stat == ERR_TOO_MUCH_HIT) {
265 nmz_free_hlist(left);
266 return right;
267 }
268
269 if (right.stat == ERR_TOO_MUCH_MATCH || right.stat == ERR_TOO_MUCH_HIT) {
270 nmz_free_hlist(right);
271 return left;
272 }
273
274 if (left.stat != SUCCESS || left.num <= 0) {
275 nmz_free_hlist(right);
276 return left;
277 }
278 if (right.stat != SUCCESS || right.num <= 0) {
279 nmz_free_hlist(left);
280 return right;
281 }
282
283 for (v = 0, i = 0, j = 0; i < left.num; i++) {
284 for (;; j++) {
285 if (j >= right.num)
286 goto OUT;
287 if (left.data[i].docid < right.data[j].docid)
288 break;
289 if (left.data[i].docid == right.data[j].docid) {
290
291 if (v != i) {
292 nmz_copy_hlist(left, v, left, i);
293 }
294
295 if (nmz_is_tfidfmode()) {
296 left.data[v].score =
297 left.data[i].score + right.data[j].score;
298 } else {
299 /* Assign a smaller number, left or right*/
300 left.data[v].score =
301 left.data[i].score < right.data[j].score ?
302 left.data[i].score : right.data[j].score;
303 }
304 v++;
305 j++;
306 break;
307 }
308 }
309 }
310 OUT:
311 nmz_free_hlist(right);
312 left.num = v;
313 if (left.stat != SUCCESS || left.num <= 0)
314 nmz_free_hlist(left);
315 return left;
316 }
317
318
319 /*
320 * Merge the left and right with NOT rule.
321 */
322 NmzResult
nmz_notmerge(NmzResult left,NmzResult right,int * ignore)323 nmz_notmerge(NmzResult left, NmzResult right, int *ignore)
324 {
325 int i, j, v, f;
326
327 if (ignore && *ignore && left.num > 0) {
328 nmz_free_hlist(right);
329 return left;
330 }
331 if (ignore && *ignore && right.num > 0) {
332 nmz_free_hlist(left);
333 return right;
334 }
335
336 if (right.stat != SUCCESS || right.num <= 0) {
337 nmz_free_hlist(right);
338 return left;
339 }
340 if (left.stat != SUCCESS || left.num <= 0) {
341 nmz_free_hlist(right);
342 return left;
343 }
344
345 for (v = 0, i = 0, j = 0; i < left.num; i++) {
346 for (f = 0; j < right.num; j++) {
347 if (left.data[i].docid < right.data[j].docid)
348 break;
349 if (left.data[i].docid == right.data[j].docid) {
350 j++;
351 f = 1;
352 break;
353 }
354 }
355 if (!f) {
356 if (v != i) {
357 nmz_copy_hlist(left, v, left, i);
358 }
359 v++;
360 }
361 }
362 nmz_free_hlist(right);
363 left.num = v;
364 if (left.stat != SUCCESS || left.num <= 0)
365 nmz_free_hlist(left);
366 return left;
367 }
368
369
370 /*
371 * merge the left and right with OR rule
372 */
373 NmzResult
nmz_ormerge(NmzResult left,NmzResult right)374 nmz_ormerge(NmzResult left, NmzResult right)
375 {
376 int i, j, v, n;
377 NmzResult val;
378
379 val.num = 0;
380 val.data = NULL;
381 val.stat = SUCCESS;
382
383 if ((left.stat != SUCCESS || left.num <= 0) &&
384 (right.stat != SUCCESS || right.num <= 0))
385 {
386 nmz_free_hlist(right);
387 return left;
388 }
389 if (left.stat != SUCCESS || left.num <= 0) {
390 nmz_free_hlist(left);
391 return right;
392 }
393 if (right.stat != SUCCESS || right.num <= 0){
394 nmz_free_hlist(right);
395 return left;
396 }
397
398 n = left.num + right.num;
399
400 nmz_malloc_hlist(&val, n);
401 if (val.stat == ERR_FATAL) {
402 nmz_free_hlist(left);
403 nmz_free_hlist(right);
404 return val;
405 }
406
407 for (v = 0, i = 0, j = 0; i < left.num; i++) {
408 for (; j < right.num; j++) {
409 if (left.data[i].docid < right.data[j].docid) {
410 break;
411 } else if (left.data[i].docid == right.data[j].docid) {
412
413 if (nmz_is_tfidfmode()) {
414 left.data[i].score =
415 left.data[i].score + right.data[j].score;
416 } else {
417 /* Assign a large number, left or right */
418 left.data[i].score =
419 left.data[i].score > right.data[j].score ?
420 left.data[i].score : right.data[j].score;
421 }
422 j++;
423 break;
424 } else {
425 nmz_copy_hlist(val, v, right, j);
426 v++;
427 }
428 }
429 nmz_copy_hlist(val, v, left, i);
430 v++;
431 }
432
433 for (; j < right.num; j++) {
434 nmz_copy_hlist(val, v, right, j);
435 v++;
436 }
437
438 nmz_free_hlist(left);
439 nmz_free_hlist(right);
440 val.num = v;
441 return val;
442 }
443
444 void
nmz_malloc_hlist(NmzResult * hlist,int n)445 nmz_malloc_hlist(NmzResult * hlist, int n)
446 {
447 hlist->num = 0;
448 hlist->data = NULL;
449 hlist->stat = SUCCESS;
450
451 if (n <= 0) return;
452 hlist->data = malloc(n * sizeof(struct nmz_data));
453 if (hlist->data == NULL) {
454 nmz_set_dyingmsg(nmz_msg("%s", strerror(errno)));
455 hlist->stat = ERR_FATAL;
456 return;
457 }
458
459 hlist->num = n;
460 hlist->stat = SUCCESS;
461 }
462
463 void
nmz_realloc_hlist(NmzResult * hlist,int n)464 nmz_realloc_hlist(NmzResult * hlist, int n)
465 {
466 if (hlist->stat != SUCCESS || n <= 0) return;
467 hlist->data = realloc(hlist->data, n * sizeof(struct nmz_data));
468 if (hlist->data == NULL) {
469 nmz_set_dyingmsg(nmz_msg("%s", strerror(errno)));
470 hlist->stat = ERR_FATAL;
471 }
472 }
473
474 void
nmz_free_hlist(NmzResult hlist)475 nmz_free_hlist(NmzResult hlist)
476 {
477 if (hlist.stat != SUCCESS || hlist.num <= 0) return;
478 free(hlist.data);
479 }
480
481 void
nmz_copy_hlist(NmzResult to,int n_to,NmzResult from,int n_from)482 nmz_copy_hlist(NmzResult to, int n_to, NmzResult from, int n_from)
483 {
484 to.data[n_to] = from.data[n_from];
485 }
486
487 void
nmz_set_idxid_hlist(NmzResult hlist,int id)488 nmz_set_idxid_hlist(NmzResult hlist, int id)
489 {
490 int i;
491 for (i = 0; i < hlist.num; i++) {
492 hlist.data[i].idxid = id;
493 }
494 }
495
496 NmzResult
nmz_merge_hlist(NmzResult * hlists)497 nmz_merge_hlist(NmzResult *hlists)
498 {
499 int i, n;
500 NmzResult value;
501
502 value.num = 0;
503 value.data = NULL;
504 value.stat = SUCCESS;
505
506 if (nmz_get_idxnum() == 1) {
507 return hlists[0];
508 }
509
510 for(i = n = 0; i < nmz_get_idxnum(); i++) {
511 if (hlists[i].stat == SUCCESS && hlists[i].num > 0) {
512 n += hlists[i].num;
513 }
514 }
515 nmz_malloc_hlist(&value, n);
516 if (value.stat == ERR_FATAL) {
517 for(i = 0; i < nmz_get_idxnum(); i++) {
518 if (hlists[i].stat != SUCCESS || hlists[i].num <= 0)
519 continue;
520 nmz_free_hlist(hlists[i]);
521 }
522 return value;
523 }
524
525 for(i = n = 0; i < nmz_get_idxnum(); i++) {
526 if (hlists[i].stat != SUCCESS || hlists[i].num <= 0)
527 continue;
528 memcpy_hlist(value, hlists[i], n);
529 n += hlists[i].num;
530 nmz_free_hlist(hlists[i]);
531 }
532 value.stat = SUCCESS;
533 value.num = n;
534 return value;
535 }
536
537 /*
538 * Get date info from NMZ.t and do the missing number processing.
539 */
540 NmzResult
nmz_do_date_processing(NmzResult hlist)541 nmz_do_date_processing(NmzResult hlist)
542 {
543 FILE *date_index;
544 int i, v;
545
546 date_index = fopen(NMZ.t, "rb");
547 if (date_index == NULL) {
548 nmz_set_dyingmsg(nmz_msg("%s: %s", NMZ.t, strerror(errno)));
549 hlist.stat = ERR_FATAL;
550 return hlist; /* error */
551 }
552
553 for (i = 0; i < hlist.num; i++) {
554 if (fseek(date_index,
555 hlist.data[i].docid * sizeof(hlist.data[i].date), 0) != 0)
556 {
557 nmz_set_dyingmsg(nmz_msg("%s: %s", NMZ.t, strerror(errno)));
558 hlist.stat = ERR_FATAL;
559 fclose(date_index);
560 return hlist; /* error */
561 }
562 nmz_fread(&hlist.data[i].date,
563 sizeof(hlist.data[i].date), 1, date_index);
564 }
565
566 fclose(date_index);
567
568 for (v = 0, i = 0; i < hlist.num; i++) {
569 if (hlist.data[i].date == -1) {
570 /* The missing number, this document has been deleted */
571 } else {
572 if (v != i) {
573 nmz_copy_hlist(hlist, v, hlist, i);
574 }
575 v++;
576 }
577 }
578 hlist.num = v;
579
580 return hlist;
581 }
582
583 /*
584 * Get the hit list.
585 */
586 NmzResult
nmz_get_hlist(int index)587 nmz_get_hlist(int index)
588 {
589 NmzResult hlist;
590
591 hlist.num = 0;
592 hlist.data = NULL;
593 hlist.stat = SUCCESS;
594
595 if (fseek(Nmz.i, nmz_getidxptr(Nmz.ii, index), 0) != 0) {
596 hlist.stat = ERR_FATAL;
597 return hlist; /* error */
598 }
599
600 {
601 int n, *buf, i;
602 double idf = 1.0;
603 int sum = 0;
604 int hit;
605 int bersize;
606 int totalsize;
607
608 nmz_get_unpackw(Nmz.i, &bersize);
609
610 hit = bersize;
611 buf = malloc(hit * sizeof(int));
612 if (buf == NULL) {
613 nmz_set_dyingmsg(nmz_msg("%s", strerror(errno)));
614 hlist.data = NULL;
615 hlist.stat = ERR_FATAL;
616 return hlist;
617 }
618
619 n = 0;
620 totalsize = 0;
621 while (totalsize < bersize) {
622 totalsize += nmz_get_unpackw(Nmz.i, &buf[n]);
623 n++;
624 }
625 n /= 2;
626
627 if (nmz_is_tfidfmode() &&
628 (nmz_get_querytokennum() > 1
629 /* 0th token is a phrase. */
630 || strchr(nmz_get_querytoken(0), '\t') != NULL))
631 {
632 idf = log((double)document_number / n) / log(2);
633 nmz_debug_printf("idf: %f (N:%d, n:%d)\n", idf, document_number, n);
634 }
635
636 nmz_malloc_hlist(&hlist, n);
637 if (hlist.stat == ERR_FATAL) {
638 free(buf);
639 return hlist;
640 }
641
642 for (i = 0; i < n; i++) {
643 hlist.data[i].docid = *(buf + i * 2) + sum;
644 sum = hlist.data[i].docid;
645 hlist.data[i].score = *(buf + i * 2 + 1);
646 if (nmz_is_tfidfmode()) {
647 hlist.data[i].score = (int)(hlist.data[i].score * idf) + 1;
648 }
649 }
650 hlist.num = n;
651 free(buf);
652 hlist = nmz_do_date_processing(hlist);
653 }
654 return hlist;
655 }
656
657
658 /*
659 * Interface to do appropriate sorting.
660 */
661 int
nmz_sort_hlist(NmzResult hlist,enum nmz_sortmethod method)662 nmz_sort_hlist(NmzResult hlist, enum nmz_sortmethod method)
663 {
664 set_rank(hlist); /* conserve current order for STABLE sorting */
665
666 if (method == SORT_BY_FIELD) {
667 if (field_sort(hlist) != SUCCESS)
668 return FAILURE;
669 } else if (method == SORT_BY_DATE) {
670 qsort(hlist.data, hlist.num, sizeof(hlist.data[0]), date_cmp);
671 } else if (method == SORT_BY_SCORE) {
672 qsort(hlist.data, hlist.num, sizeof(hlist.data[0]), score_cmp);
673 }
674 return 0;
675 }
676
677 /*
678 * Reverse the given hlist.
679 * original of this routine was contributed by Furukawa-san [1997-11-13]
680 */
681 int
nmz_reverse_hlist(NmzResult hlist)682 nmz_reverse_hlist(NmzResult hlist)
683 {
684 int m, n;
685 struct nmz_data tmp;
686
687 m = 0;
688 n = hlist.num - 1;
689 while (m < n) {
690 /* swap */
691 tmp = hlist.data[m];
692 hlist.data[m] = hlist.data[n];
693 hlist.data[n] = tmp;
694
695 m++;
696 n--;
697 }
698
699 return 0;
700 }
701
702 void
nmz_set_docnum(int n)703 nmz_set_docnum(int n)
704 {
705 document_number = n;
706 }
707
708 void
nmz_set_sortfield(const char * field)709 nmz_set_sortfield(const char *field)
710 {
711 strncpy(field_for_sort, field, BUFSIZE - 1);
712 field_for_sort[BUFSIZE - 1] = '\0';
713 }
714
715 char *
nmz_get_sortfield(void)716 nmz_get_sortfield(void)
717 {
718 return field_for_sort;
719 }
720
721 int
nmz_get_docnum()722 nmz_get_docnum()
723 {
724 return document_number;
725 }
726