1 /*
2 * Copyright (C) 1999-2004 Etymon Systems, Inc.
3 *
4 * Authors: Nassib Nassar
5 */
6
7 #include <stdlib.h>
8 #include <stdio.h>
9 #include "index.h"
10 #include "lock.h"
11 #include "util.h"
12 #include "fdef.h"
13 #include "stem.h"
14 #include "info.h"
15 #include "linear.h"
16
17 #include "text.h"
18 #include "xml.h"
19 #include "xml_test.h"
20 #include "erc.h"
21
22 #include "open.h"
23 extern ETYMON_AF_STATE *etymon_af_state[];
24
25 /* assumes that the buffer, absolute_path, is of size
26 ETYMON_MAX_PATH_SIZE, that relative_path contains a valid
27 null-terminated string, that cwd has been filled in using getcwd(),
28 and that relative_path != absolute_path */
etymon_index_expand_path(char * relative_path,char * absolute_path,char * cwd)29 void etymon_index_expand_path(char* relative_path, char* absolute_path, char* cwd) {
30 /* check if it's already an absolute path */
31 if (*relative_path == '/') {
32 strncpy(absolute_path, relative_path, ETYMON_MAX_PATH_SIZE - 1);
33 absolute_path[ETYMON_MAX_PATH_SIZE - 1] = '\0';
34 return;
35 } else {
36 char* r_p;
37 char* slash_p;
38 char* p;
39 int x, y, seg_len;
40
41 /* otherwise combine cwd and relative_path to get the absolute path */
42
43 /* start off with the cwd */
44 strcpy(absolute_path, cwd);
45
46 /* now take apart relative_path */
47 r_p = relative_path;
48 while ( (slash_p = strchr(r_p, '/')) != NULL ) {
49 /* now r_p points to the beginning and slash_p to the next '/' */
50 seg_len = slash_p - r_p;
51 /* check if it's "../" */
52 if ( (seg_len == 2) && (strncmp(r_p, "..", 2) == 0) ) {
53 /* remove the last segment from absolute_path */
54 p = strrchr(absolute_path, '/');
55 if (p == NULL) {
56 /* ERROR and return */
57 }
58 *p = '\0';
59 }
60 /* check if it's "./" */
61 else if ( (seg_len == 1) && (*r_p == '.') ) {
62 /* do nothing */
63 }
64 else {
65 /* append the segment to the end of absolute_path */
66 x = strlen(absolute_path);
67 if ((ETYMON_MAX_PATH_SIZE - x) >= (seg_len + 2)) {
68 absolute_path[x++] = '/';
69 memcpy(absolute_path + x, r_p, seg_len);
70 absolute_path[x + seg_len] = '\0';
71 }
72 }
73 r_p = slash_p + 1;
74 }
75 x = strlen(absolute_path);
76 y = strlen(r_p);
77 if ((ETYMON_MAX_PATH_SIZE - x) >= (y + 2)) {
78 absolute_path[x] = '/';
79 memcpy(absolute_path + x + 1, r_p, y + 1);
80 }
81 }
82 }
83
84 #ifdef ZZZZZ
85
86 /*#define OPT_STDIO*/
87
88 #ifdef OPT_STDIO
89
90 /* this was written before the advent of ETYMON_INDEX_PAGE_L.post_n[],
91 ETYMON_INDEX_UPOST.fields_n, and ETYMON_INDEX_UPOST.word_numbers_n
92 in the first unoptimized pass; so it explicitly counts these values
93 while building the optimized structures */
etymon_index_optimize_old_stdio(ETYMON_INDEX_OPTIONS * opt)94 int etymon_index_optimize_old_stdio(ETYMON_INDEX_OPTIONS* opt) {
95 int dbinfo_fd, udict_fd, upost_fd, ufield_fd, uword_fd, lpost_fd, lfield_fd, lword_fd;
96 FILE* dbinfo_f;
97 FILE* udict_f;
98 FILE* upost_f;
99 FILE* ufield_f;
100 FILE* uword_f;
101 FILE* lpost_f;
102 FILE* lfield_f;
103 FILE* lword_f;
104 int x;
105 etymon_af_off_t udict_size, upost_isize, ufield_isize, uword_isize, lpost_isize, lfield_isize, lword_isize;
106 Uint4 magic;
107 ETYMON_DB_INFO dbinfo;
108 char fn[ETYMON_MAX_PATH_SIZE];
109 ETYMON_AF_STAT st;
110 ssize_t nbytes;
111 Uint4 udict_p, upost_p, lpost_p_save;
112 Uint4 ufield_p, lfield_p_save, field_count;
113 Uint4 uword_p, lword_p_save, word_count;
114 ETYMON_INDEX_PAGE_L page_l;
115 ETYMON_INDEX_PAGE_NL page_nl;
116 Uint1 leaf_flag;
117 ETYMON_INDEX_UPOST upost;
118 ETYMON_INDEX_LPOST lpost;
119 ETYMON_INDEX_UFIELD ufield;
120 ETYMON_INDEX_LFIELD lfield;
121 ETYMON_INDEX_UWORD uword;
122 ETYMON_INDEX_LWORD lword;
123
124 /* make sure database is ready */
125 if (etymon_db_ready(opt->dbname, &(opt->log)) == 0) {
126 int e;
127 char s[ETYMON_MAX_MSG_SIZE];
128 sprintf(s, "%s: Database not ready", opt->dbname);
129 e = opt->log.error(s, 1);
130 if (e != 0) {
131 exit(e);
132 }
133 return -1;
134 }
135
136 /* lock the database */
137 etymon_db_lock(opt->dbname, &(opt->log));
138
139 /* open db info file for read/write */
140 etymon_db_construct_path(ETYMON_DBF_INFO, opt->dbname, fn);
141 dbinfo_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE);
142 if (dbinfo_fd == -1) {
143 int e;
144 char s[ETYMON_MAX_MSG_SIZE];
145 sprintf(s, "%s: Unable to open database", opt->dbname);
146 e = opt->log.error(s, 1);
147 etymon_db_unlock(opt->dbname, &(opt->log));
148 if (e != 0) {
149 exit(e);
150 }
151 return -1;
152 }
153 nbytes = read(dbinfo_fd, &magic, sizeof(Uint4));
154 if (nbytes != sizeof(Uint4)) {
155 /* ERROR */
156 printf("unable to read %s\n", fn);
157 exit(1);
158 }
159 if (magic != ETYMON_INDEX_MAGIC) {
160 int e;
161 char s[ETYMON_MAX_MSG_SIZE];
162 sprintf(s, "%s: Database created by incompatible version", opt->dbname);
163 e = opt->log.error(s, 1);
164 close(dbinfo_fd);
165 etymon_db_unlock(opt->dbname, &(opt->log));
166 if (e != 0) {
167 exit(e);
168 }
169 return -1;
170 }
171 nbytes = read(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
172 if (nbytes != sizeof(ETYMON_DB_INFO)) {
173 /* ERROR */
174 printf("unable to read %s\n", fn);
175 exit(1);
176 }
177 dbinfo_f = fdopen(dbinfo_fd, "r+b");
178
179 /* make sure the database is not already optimized */
180 if (dbinfo.optimized == 1) {
181 int e;
182 char s[ETYMON_MAX_MSG_SIZE];
183 sprintf(s, "%s: Database is already linearized", opt->dbname);
184 e = opt->log.error(s, 1);
185 fclose(dbinfo_f);
186 close(dbinfo_fd);
187 etymon_db_unlock(opt->dbname, &(opt->log)); /* unlock the database */
188 if (e != 0) {
189 exit(e);
190 }
191 return -1;
192 }
193
194 /* open files */
195
196 /* open udict for read/write */
197 etymon_db_construct_path(ETYMON_DBF_UDICT, opt->dbname, fn);
198 udict_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
199 if (udict_fd == -1) {
200 /* ERROR */
201 printf("unable to open %s for read/write\n", fn);
202 exit(1);
203 }
204 /* stat udict to get size */
205 if (etymon_af_fstat(udict_fd, &st) == -1) {
206 perror("index_optimize():fstat()");
207 }
208 udict_size = st.st_size;
209 udict_f = fdopen(udict_fd, "r+b");
210
211 /* open upost for read */
212 etymon_db_construct_path(ETYMON_DBF_UPOST, opt->dbname, fn);
213 upost_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
214 if (upost_fd == -1) {
215 /* ERROR */
216 printf("unable to open %s for read\n", fn);
217 exit(1);
218 }
219 /* stat upost to get size */
220 if (etymon_af_fstat(upost_fd, &st) == -1) {
221 perror("index_optimize():fstat()");
222 }
223 upost_isize = st.st_size / sizeof(ETYMON_INDEX_UPOST);
224 upost_f = fdopen(upost_fd, "rb");
225
226 /* open ufield for read */
227 etymon_db_construct_path(ETYMON_DBF_UFIELD, opt->dbname, fn);
228 ufield_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
229 if (ufield_fd == -1) {
230 /* ERROR */
231 printf("unable to open %s for read\n", fn);
232 exit(1);
233 }
234 /* stat ufield to get size */
235 if (etymon_af_fstat(ufield_fd, &st) == -1) {
236 perror("index_optimize():fstat()");
237 }
238 ufield_isize = st.st_size / sizeof(ETYMON_INDEX_UFIELD);
239 ufield_f = fdopen(ufield_fd, "rb");
240
241 /* open uword for read */
242 etymon_db_construct_path(ETYMON_DBF_UWORD, opt->dbname, fn);
243 uword_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
244 if (uword_fd == -1) {
245 /* ERROR */
246 printf("unable to open %s for read\n", fn);
247 exit(1);
248 }
249 /* stat uword to get size */
250 if (etymon_af_fstat(uword_fd, &st) == -1) {
251 perror("index_optimize():fstat()");
252 }
253 uword_isize = st.st_size / sizeof(ETYMON_INDEX_UWORD);
254 uword_f = fdopen(uword_fd, "rb");
255
256 /* open lpost for append */
257 etymon_db_construct_path(ETYMON_DBF_LPOST, opt->dbname, fn);
258 lpost_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
259 if (lpost_fd == -1) {
260 /* ERROR */
261 printf("unable to open %s for append\n", fn);
262 exit(1);
263 }
264 /* stat lpost to get size */
265 if (etymon_af_fstat(lpost_fd, &st) == -1) {
266 perror("index_optimize():fstat()");
267 }
268 lpost_isize = st.st_size / sizeof(ETYMON_INDEX_LPOST);
269 lpost_f = fdopen(lpost_fd, "ab");
270
271 /* open lfield for append */
272 etymon_db_construct_path(ETYMON_DBF_LFIELD, opt->dbname, fn);
273 lfield_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
274 if (lfield_fd == -1) {
275 /* ERROR */
276 printf("unable to open %s for append\n", fn);
277 exit(1);
278 }
279 /* stat lfield to get size */
280 if (etymon_af_fstat(lfield_fd, &st) == -1) {
281 perror("index_optimize():fstat()");
282 }
283 lfield_isize = st.st_size / sizeof(ETYMON_INDEX_LFIELD);
284 lfield_f = fdopen(lfield_fd, "ab");
285
286 /* open lword for append */
287 etymon_db_construct_path(ETYMON_DBF_LWORD, opt->dbname, fn);
288 lword_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
289 if (lword_fd == -1) {
290 /* ERROR */
291 printf("unable to open %s for append\n", fn);
292 exit(1);
293 }
294 /* stat lword to get size */
295 if (etymon_af_fstat(lword_fd, &st) == -1) {
296 perror("index_optimize():fstat()");
297 }
298 lword_isize = st.st_size / sizeof(ETYMON_INDEX_LWORD);
299 lword_f = fdopen(lword_fd, "ab");
300
301 /* optimize! */
302
303 /* if (opt->verbose >= 2) {*/
304 printf("Linearizing (new)\n");
305 /* }*/
306
307 /* first descend to the left-most leaf page */
308 udict_p = dbinfo.udict_root;
309 do {
310 if (fseeko(udict_f, (etymon_af_off_t)udict_p, SEEK_SET) == -1) {
311 perror("index_optimize():fseeko()");
312 }
313 if (fread(&(leaf_flag), 1, 1, udict_f) < 1) {
314 perror("index_optimize():fread()");
315 }
316 if (leaf_flag == 0) {
317 if (fread(&page_nl, 1, sizeof(ETYMON_INDEX_PAGE_NL), udict_f) < sizeof(ETYMON_INDEX_PAGE_NL)) {
318 perror("index_optimize():fread()");
319 }
320 udict_p = page_nl.p[0];
321 }
322 } while (leaf_flag == 0);
323
324 /* now go through all leaf pages */
325
326 do {
327
328 if (fseeko(udict_f, ((etymon_af_off_t)(udict_p + 1)), SEEK_SET) == -1) {
329 perror("index_optimize():fseeko()");
330 }
331 if (fread(&page_l, 1, sizeof(ETYMON_INDEX_PAGE_L), udict_f) < sizeof(ETYMON_INDEX_PAGE_L)) {
332 perror("index_optimize():fread()");
333 }
334
335 /* examine each key and optimize associated posting, field data, and word number data */
336
337 for (x = 0; x < page_l.n; x++) {
338
339 page_l.post_n[x] = 0;
340
341 /* run through postings, assume matching doc_id's are consecutive */
342 lpost_p_save = lpost_isize + 1;
343 upost_p = page_l.post[x];
344 lpost.doc_id = 0;
345 while (upost_p != 0) {
346 /* read a upost node */
347 if (fseeko(upost_f,
348 (etymon_af_off_t)( ((etymon_af_off_t)(upost_p - 1)) * ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UPOST))) ),
349 SEEK_SET) == -1) {
350 perror("index_optimize():fseeko()");
351 }
352 if (fread(&upost, 1, sizeof(ETYMON_INDEX_UPOST), upost_f) < sizeof(ETYMON_INDEX_UPOST)) {
353 perror("index_optimize():fread()");
354 }
355
356 /* optimize fields */
357 /* DO WE NEED TO LOOK FOR DUPLICATES? */
358 lfield_p_save = lfield_isize + 1;
359 field_count = 0;
360 ufield_p = upost.fields;
361 while (ufield_p != 0) {
362 field_count++;
363 if (fseeko(ufield_f,
364 (etymon_af_off_t)( ((etymon_af_off_t)(ufield_p - 1)) *
365 ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UFIELD))) ),
366 SEEK_SET) == -1) {
367 perror("index_optimize():fseeko()");
368 }
369 if (fread(&ufield, 1, sizeof(ETYMON_INDEX_UFIELD), ufield_f) <
370 sizeof(ETYMON_INDEX_UFIELD)) {
371 perror("index_optimize():fread()");
372 }
373 memcpy(lfield.fields, ufield.fields, ETYMON_MAX_FIELD_NEST * 2);
374 if (fwrite(&lfield, 1, sizeof(ETYMON_INDEX_LFIELD), lfield_f) <
375 sizeof(ETYMON_INDEX_LFIELD)) {
376 perror("index_optimize():fwrite()");
377 }
378 lfield_isize++;
379 ufield_p = ufield.next;
380 }
381
382 /* optimize word numbers */
383 lword_p_save = lword_isize + 1;
384 word_count = 0;
385 uword_p = upost.word_numbers;
386 while (uword_p != 0) {
387 word_count++;
388 if (fseeko(uword_f,
389 (etymon_af_off_t)( ((etymon_af_off_t)(uword_p - 1)) *
390 ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UWORD))) ),
391 SEEK_SET) == -1) {
392 perror("index_optimize():fseeko()");
393 }
394 if (fread(&uword, 1, sizeof(ETYMON_INDEX_UWORD), uword_f) <
395 sizeof(ETYMON_INDEX_UWORD)) {
396 perror("index_optimize():fread()");
397 }
398 lword.wn = uword.wn;
399 if (fwrite(&lword, 1, sizeof(ETYMON_INDEX_LWORD), lword_f) <
400 sizeof(ETYMON_INDEX_LWORD)) {
401 perror("index_optimize():fwrite()");
402 }
403 lword_isize++;
404 uword_p = uword.next;
405 }
406
407 /* compare the doc_id with our cached lpost */
408 if (upost.doc_id == lpost.doc_id) {
409 /* increment the frequency and field count */
410 lpost.freq += upost.freq;
411 lpost.fields_n += field_count;
412 lpost.word_numbers_n += word_count;
413 } else {
414 /* flush lpost */
415 if (lpost.doc_id != 0) { /* only flush if lpost contains something */
416 if (fwrite(&lpost, 1, sizeof(ETYMON_INDEX_LPOST), lpost_f) <
417 sizeof(ETYMON_INDEX_LPOST)) {
418 perror("index_optimize():fwrite()");
419 }
420 lpost_isize++;
421 page_l.post_n[x]++;
422 }
423 /* replace lpost with upost */
424 lpost.doc_id = upost.doc_id;
425 lpost.freq = upost.freq;
426 lpost.fields_n = field_count;
427 lpost.word_numbers_n = word_count;
428 /* set field pointer */
429 lpost.fields = lfield_p_save;
430 lpost.word_numbers = lword_p_save;
431 }
432 upost_p = upost.next;
433 } /* while */
434 /* flush lpost */
435 if (lpost.doc_id != 0) { /* only flush if lpost contains something */
436 if (fwrite(&lpost, 1, sizeof(ETYMON_INDEX_LPOST), lpost_f) < sizeof(ETYMON_INDEX_LPOST)) {
437 perror("index_optimize():fwrite()");
438 }
439 lpost_isize++;
440 page_l.post_n[x]++;
441 }
442 page_l.post[x] = lpost_p_save;
443
444 } /* for */
445
446 /* write out updated leaf page */
447 if (fseeko(udict_f, ((etymon_af_off_t)(udict_p + 1)), SEEK_SET) == -1) {
448 perror("index_optimize():fseeko()");
449 }
450 if (fwrite(&page_l, 1, sizeof(ETYMON_INDEX_PAGE_L), udict_f) < sizeof(ETYMON_INDEX_PAGE_L)) {
451 perror("index_optimize():fwrite()");
452 }
453
454 udict_p = page_l.next;
455
456 } while (udict_p != 0);
457
458 /* update dbinfo */
459 if (fseeko(dbinfo_f, (etymon_af_off_t)0, SEEK_SET) == -1) {
460 perror("index_optimize():fseeko()");
461 }
462 magic = ETYMON_INDEX_MAGIC;
463 nbytes = fwrite(&magic, 1, sizeof(Uint4), dbinfo_f);
464 if (nbytes != sizeof(Uint4)) {
465 /* ERROR */
466 printf("unable to write MN\n");
467 exit(1);
468 }
469 dbinfo.optimized = 1;
470 nbytes = fwrite(&dbinfo, 1, sizeof(ETYMON_DB_INFO), dbinfo_f);
471 if (nbytes != sizeof(ETYMON_DB_INFO)) {
472 /* ERROR */
473 printf("unable to write DBI\n");
474 exit(1);
475 }
476
477 /* clean up */
478 fclose(dbinfo_f);
479 fclose(udict_f);
480 fclose(upost_f);
481 fclose(ufield_f);
482 fclose(uword_f);
483 fclose(lpost_f);
484 fclose(lfield_f);
485 fclose(lword_f);
486 close(dbinfo_fd);
487 close(udict_fd);
488 close(upost_fd);
489 close(ufield_fd);
490 close(uword_fd);
491 close(lpost_fd);
492 close(lfield_fd);
493 close(lword_fd);
494
495 /* reopen and truncate upost */
496 etymon_db_construct_path(ETYMON_DBF_UPOST, opt->dbname, fn);
497 upost_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
498 close(upost_fd);
499 /* reopen and truncate ufield */
500 etymon_db_construct_path(ETYMON_DBF_UFIELD, opt->dbname, fn);
501 ufield_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
502 close(ufield_fd);
503 /* reopen and truncate uword */
504 etymon_db_construct_path(ETYMON_DBF_UWORD, opt->dbname, fn);
505 uword_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
506 close(uword_fd);
507
508 /* unlock the database */
509 etymon_db_unlock(opt->dbname, &(opt->log));
510
511 return 0;
512
513 } /* optimize_new() */
514
515 #endif
516
517 /* this was written before the advent of ETYMON_INDEX_PAGE_L.post_n[],
518 ETYMON_INDEX_UPOST.fields_n, and ETYMON_INDEX_UPOST.word_numbers_n
519 in the first unoptimized pass; so it explicitly counts these values
520 while building the optimized structures */
etymon_index_optimize_old(ETYMON_INDEX_OPTIONS * opt)521 int etymon_index_optimize_old(ETYMON_INDEX_OPTIONS* opt) {
522 int dbinfo_fd, udict_fd, upost_fd, ufield_fd, uword_fd, lpost_fd, lfield_fd, lword_fd;
523 int x;
524 etymon_af_off_t udict_size, upost_isize, ufield_isize, uword_isize, lpost_isize, lfield_isize, lword_isize;
525 Uint4 magic;
526 ETYMON_DB_INFO dbinfo;
527 char fn[ETYMON_MAX_PATH_SIZE];
528 ETYMON_AF_STAT st;
529 ssize_t nbytes;
530 Uint4 udict_p, upost_p, lpost_p_save;
531 Uint4 ufield_p, lfield_p_save, field_count;
532 Uint4 uword_p, lword_p_save, word_count;
533 ETYMON_INDEX_PAGE_L page_l;
534 ETYMON_INDEX_PAGE_NL page_nl;
535 Uint1 leaf_flag;
536 ETYMON_INDEX_UPOST upost;
537 ETYMON_INDEX_LPOST lpost;
538 ETYMON_INDEX_UFIELD ufield;
539 ETYMON_INDEX_LFIELD lfield;
540 ETYMON_INDEX_UWORD uword;
541 ETYMON_INDEX_LWORD lword;
542
543 /* make sure database is ready */
544 if (etymon_db_ready(opt->dbname) == 0) {
545 int e;
546 char s[ETYMON_MAX_MSG_SIZE];
547 sprintf(s, "%s: Database not ready", opt->dbname);
548 e = opt->log.error(s, 1);
549 if (e != 0) {
550 exit(e);
551 }
552 return -1;
553 }
554
555 /* lock the database */
556 etymon_db_lock(opt->dbname, &(opt->log));
557
558 /* open db info file for read/write */
559 etymon_db_construct_path(ETYMON_DBF_INFO, opt->dbname, fn);
560 dbinfo_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE);
561 if (dbinfo_fd == -1) {
562 int e;
563 char s[ETYMON_MAX_MSG_SIZE];
564 sprintf(s, "%s: Unable to open database", opt->dbname);
565 e = opt->log.error(s, 1);
566 etymon_db_unlock(opt->dbname);
567 if (e != 0) {
568 exit(e);
569 }
570 return -1;
571 }
572 nbytes = read(dbinfo_fd, &magic, sizeof(Uint4));
573 if (nbytes != sizeof(Uint4)) {
574 /* ERROR */
575 printf("unable to read %s\n", fn);
576 exit(1);
577 }
578 if (magic != ETYMON_INDEX_MAGIC) {
579 int e;
580 char s[ETYMON_MAX_MSG_SIZE];
581 sprintf(s, "%s: Database created by incompatible version", opt->dbname);
582 e = opt->log.error(s, 1);
583 close(dbinfo_fd);
584 etymon_db_unlock(opt->dbname);
585 if (e != 0) {
586 exit(e);
587 }
588 return -1;
589 }
590 nbytes = read(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
591 if (nbytes != sizeof(ETYMON_DB_INFO)) {
592 /* ERROR */
593 printf("unable to read %s\n", fn);
594 exit(1);
595 }
596
597 /* make sure the database is not already optimized */
598 if (dbinfo.optimized == 1) {
599 int e;
600 char s[ETYMON_MAX_MSG_SIZE];
601 sprintf(s, "%s: Database is already linearized", opt->dbname);
602 e = opt->log.error(s, 1);
603 close(dbinfo_fd);
604 etymon_db_unlock(opt->dbname); /* unlock the database */
605 if (e != 0) {
606 exit(e);
607 }
608 return -1;
609 }
610
611 /* open files */
612
613 /* open udict for read/write */
614 etymon_db_construct_path(ETYMON_DBF_UDICT, opt->dbname, fn);
615 udict_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
616 if (udict_fd == -1) {
617 /* ERROR */
618 printf("unable to open %s for read/write\n", fn);
619 exit(1);
620 }
621 /* stat udict to get size */
622 if (etymon_af_fstat(udict_fd, &st) == -1) {
623 perror("index_optimize():fstat()");
624 }
625 udict_size = st.st_size;
626
627 /* open upost for read */
628 etymon_db_construct_path(ETYMON_DBF_UPOST, opt->dbname, fn);
629 upost_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
630 if (upost_fd == -1) {
631 /* ERROR */
632 printf("unable to open %s for read\n", fn);
633 exit(1);
634 }
635 /* stat upost to get size */
636 if (etymon_af_fstat(upost_fd, &st) == -1) {
637 perror("index_optimize():fstat()");
638 }
639 upost_isize = st.st_size / sizeof(ETYMON_INDEX_UPOST);
640
641 /* open ufield for read */
642 etymon_db_construct_path(ETYMON_DBF_UFIELD, opt->dbname, fn);
643 ufield_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
644 if (ufield_fd == -1) {
645 /* ERROR */
646 printf("unable to open %s for read\n", fn);
647 exit(1);
648 }
649 /* stat ufield to get size */
650 if (etymon_af_fstat(ufield_fd, &st) == -1) {
651 perror("index_optimize():fstat()");
652 }
653 ufield_isize = st.st_size / sizeof(ETYMON_INDEX_UFIELD);
654
655 /* open uword for read */
656 etymon_db_construct_path(ETYMON_DBF_UWORD, opt->dbname, fn);
657 uword_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
658 if (uword_fd == -1) {
659 /* ERROR */
660 printf("unable to open %s for read\n", fn);
661 exit(1);
662 }
663 /* stat uword to get size */
664 if (etymon_af_fstat(uword_fd, &st) == -1) {
665 perror("index_optimize():fstat()");
666 }
667 uword_isize = st.st_size / sizeof(ETYMON_INDEX_UWORD);
668
669 /* open lpost for append */
670 etymon_db_construct_path(ETYMON_DBF_LPOST, opt->dbname, fn);
671 lpost_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
672 if (lpost_fd == -1) {
673 /* ERROR */
674 printf("unable to open %s for append\n", fn);
675 exit(1);
676 }
677 /* stat lpost to get size */
678 if (etymon_af_fstat(lpost_fd, &st) == -1) {
679 perror("index_optimize():fstat()");
680 }
681 lpost_isize = st.st_size / sizeof(ETYMON_INDEX_LPOST);
682
683 /* open lfield for append */
684 etymon_db_construct_path(ETYMON_DBF_LFIELD, opt->dbname, fn);
685 lfield_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
686 if (lfield_fd == -1) {
687 /* ERROR */
688 printf("unable to open %s for append\n", fn);
689 exit(1);
690 }
691 /* stat lfield to get size */
692 if (etymon_af_fstat(lfield_fd, &st) == -1) {
693 perror("index_optimize():fstat()");
694 }
695 lfield_isize = st.st_size / sizeof(ETYMON_INDEX_LFIELD);
696
697 /* open lword for append */
698 etymon_db_construct_path(ETYMON_DBF_LWORD, opt->dbname, fn);
699 lword_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
700 if (lword_fd == -1) {
701 /* ERROR */
702 printf("unable to open %s for append\n", fn);
703 exit(1);
704 }
705 /* stat lword to get size */
706 if (etymon_af_fstat(lword_fd, &st) == -1) {
707 perror("index_optimize():fstat()");
708 }
709 lword_isize = st.st_size / sizeof(ETYMON_INDEX_LWORD);
710
711 /* optimize! */
712
713 if (opt->verbose >= 2) {
714 printf("Linearizing (old)\n");
715 }
716
717 /* first descend to the left-most leaf page */
718 udict_p = dbinfo.udict_root;
719 do {
720 if (etymon_af_lseek(udict_fd, (etymon_af_off_t)udict_p, SEEK_SET) == -1) {
721 perror("index_optimize():lseek()");
722 }
723 if (read(udict_fd, &(leaf_flag), 1) == -1) {
724 perror("index_optimize():read()");
725 }
726 if (leaf_flag == 0) {
727 if (read(udict_fd, &page_nl, sizeof(ETYMON_INDEX_PAGE_NL)) == -1) {
728 perror("index_optimize():read()");
729 }
730 udict_p = page_nl.p[0];
731 }
732 } while (leaf_flag == 0);
733
734 /* now go through all leaf pages */
735
736 do {
737
738 if (etymon_af_lseek(udict_fd, ((etymon_af_off_t)(udict_p + 1)), SEEK_SET) == -1) {
739 perror("index_optimize():lseek()");
740 }
741 if (read(udict_fd, &page_l, sizeof(ETYMON_INDEX_PAGE_L)) == -1) {
742 perror("index_optimize():read()");
743 }
744
745 /* examine each key and optimize associated posting, field data, and word number data */
746
747 for (x = 0; x < page_l.n; x++) {
748
749 page_l.post_n[x] = 0;
750
751 /* run through postings, assume matching doc_id's are consecutive */
752 lpost_p_save = lpost_isize + 1;
753 upost_p = page_l.post[x];
754 lpost.doc_id = 0;
755 while (upost_p != 0) {
756 /* read a upost node */
757 if (etymon_af_lseek(upost_fd,
758 (etymon_af_off_t)( ((etymon_af_off_t)(upost_p - 1)) * ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UPOST))) ),
759 SEEK_SET) == -1) {
760 perror("index_optimize():lseek()");
761 }
762 if (read(upost_fd, &upost, sizeof(ETYMON_INDEX_UPOST)) == -1) {
763 perror("index_optimize():read()");
764 }
765
766 /* optimize fields */
767 /* DO WE NEED TO LOOK FOR DUPLICATES? */
768 lfield_p_save = lfield_isize + 1;
769 field_count = 0;
770 ufield_p = upost.fields;
771 while (ufield_p != 0) {
772 field_count++;
773 if (etymon_af_lseek(ufield_fd,
774 (etymon_af_off_t)( ((etymon_af_off_t)(ufield_p - 1)) *
775 ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UFIELD))) ),
776 SEEK_SET) == -1) {
777 perror("index_optimize():lseek()");
778 }
779 if (read(ufield_fd, &ufield, sizeof(ETYMON_INDEX_UFIELD)) == -1) {
780 perror("index_optimize():read()");
781 }
782 memcpy(lfield.fields, ufield.fields, ETYMON_MAX_FIELD_NEST * 2);
783 if (write(lfield_fd, &lfield, sizeof(ETYMON_INDEX_LFIELD)) == -1) {
784 perror("index_optimize():write()");
785 }
786 lfield_isize++;
787 ufield_p = ufield.next;
788 }
789
790 /* optimize word numbers */
791 lword_p_save = lword_isize + 1;
792 word_count = 0;
793 uword_p = upost.word_numbers;
794 while (uword_p != 0) {
795 word_count++;
796 if (etymon_af_lseek(uword_fd,
797 (etymon_af_off_t)( ((etymon_af_off_t)(uword_p - 1)) *
798 ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UWORD))) ),
799 SEEK_SET) == -1) {
800 perror("index_optimize():lseek()");
801 }
802 if (read(uword_fd, &uword, sizeof(ETYMON_INDEX_UWORD)) == -1) {
803 perror("index_optimize():read()");
804 }
805 lword.wn = uword.wn;
806 if (write(lword_fd, &lword, sizeof(ETYMON_INDEX_LWORD)) == -1) {
807 perror("index_optimize():write()");
808 }
809 lword_isize++;
810 uword_p = uword.next;
811 }
812
813 /* compare the doc_id with our cached lpost */
814 if (upost.doc_id == lpost.doc_id) {
815 /* increment the frequency and field count */
816 lpost.freq += upost.freq;
817 lpost.fields_n += field_count;
818 lpost.word_numbers_n += word_count;
819 } else {
820 /* flush lpost */
821 if (lpost.doc_id != 0) { /* only flush if lpost contains something */
822 if (write(lpost_fd, &lpost, sizeof(ETYMON_INDEX_LPOST)) == -1) {
823 perror("index_optimize():write()");
824 }
825 lpost_isize++;
826 page_l.post_n[x]++;
827 }
828 /* replace lpost with upost */
829 lpost.doc_id = upost.doc_id;
830 lpost.freq = upost.freq;
831 lpost.fields_n = field_count;
832 lpost.word_numbers_n = word_count;
833 /* set field pointer */
834 lpost.fields = lfield_p_save;
835 lpost.word_numbers = lword_p_save;
836 }
837 upost_p = upost.next;
838 } /* while */
839 /* flush lpost */
840 if (lpost.doc_id != 0) { /* only flush if lpost contains something */
841 if (write(lpost_fd, &lpost, sizeof(ETYMON_INDEX_LPOST)) == -1) {
842 perror("index_optimize():write()");
843 }
844 lpost_isize++;
845 page_l.post_n[x]++;
846 }
847 page_l.post[x] = lpost_p_save;
848
849 } /* for */
850
851 /* write out updated leaf page */
852 if (etymon_af_lseek(udict_fd, ((etymon_af_off_t)(udict_p + 1)), SEEK_SET) == -1) {
853 perror("index_optimize():lseek()");
854 }
855 if (write(udict_fd, &page_l, sizeof(ETYMON_INDEX_PAGE_L)) == -1) {
856 perror("index_optimize():write()");
857 }
858
859 udict_p = page_l.next;
860
861 } while (udict_p != 0);
862
863 /* update dbinfo */
864 if (etymon_af_lseek(dbinfo_fd, (etymon_af_off_t)0, SEEK_SET) == -1) {
865 perror("index_optimize():lseek()");
866 }
867 magic = ETYMON_INDEX_MAGIC;
868 nbytes = write(dbinfo_fd, &magic, sizeof(Uint4));
869 if (nbytes != sizeof(Uint4)) {
870 /* ERROR */
871 printf("unable to write MN\n");
872 exit(1);
873 }
874 dbinfo.optimized = 1;
875 nbytes = write(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
876 if (nbytes != sizeof(ETYMON_DB_INFO)) {
877 /* ERROR */
878 printf("unable to write DBI\n");
879 exit(1);
880 }
881 close(dbinfo_fd);
882
883 /* clean up */
884 close(udict_fd);
885 close(upost_fd);
886 close(ufield_fd);
887 close(uword_fd);
888 close(lpost_fd);
889 close(lfield_fd);
890 close(lword_fd);
891
892 /* reopen and truncate upost */
893 etymon_db_construct_path(ETYMON_DBF_UPOST, opt->dbname, fn);
894 upost_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
895 close(upost_fd);
896 /* reopen and truncate ufield */
897 etymon_db_construct_path(ETYMON_DBF_UFIELD, opt->dbname, fn);
898 ufield_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
899 close(ufield_fd);
900 /* reopen and truncate uword */
901 etymon_db_construct_path(ETYMON_DBF_UWORD, opt->dbname, fn);
902 uword_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
903 close(uword_fd);
904
905 /* unlock the database */
906 etymon_db_unlock(opt->dbname);
907
908 return 0;
909 }
910
911 #endif
912
/*
 * Stores a non-leaf page in filedes at the given offset: a single
 * flag byte of 0 is written first, immediately followed by the page
 * image.  A failed seek or write is reported with perror() and the
 * remaining steps are still attempted.
 */
void etymon_index_write_nl(int filedes, etymon_af_off_t offset, ETYMON_INDEX_PAGE_NL* page) {
	const Uint1 nonleaf_marker = 0;

	if (etymon_af_lseek(filedes, offset, SEEK_SET) == -1)
		perror("index_write_nl():lseek()");

	/* flag byte that precedes the page */
	if (write(filedes, &nonleaf_marker, 1) == -1)
		perror("index_write_nl():write()");

	/* the page itself */
	if (write(filedes, page, sizeof(ETYMON_INDEX_PAGE_NL)) == -1)
		perror("index_write_nl():write()");
}
925
926
/*
 * Stores a leaf page in filedes at the given offset: a single flag
 * byte of 1 is written first, immediately followed by the page image.
 * A failed seek or write is reported with perror() and the remaining
 * steps are still attempted.
 */
void etymon_index_write_l(int filedes, etymon_af_off_t offset, ETYMON_INDEX_PAGE_L* page) {
	const Uint1 leaf_marker = 1;

	if (etymon_af_lseek(filedes, offset, SEEK_SET) == -1)
		perror("index_write_l():lseek()");

	/* flag byte that precedes the page */
	if (write(filedes, &leaf_marker, 1) == -1)
		perror("index_write_l():write()");

	/* the page itself */
	if (write(filedes, page, sizeof(ETYMON_INDEX_PAGE_L)) == -1)
		perror("index_write_l():write()");
}
939
940
941 /* returns 1 if flush was performed */
etymon_index_flush_l(ETYMON_INDEX_INDEXING_STATE * state)942 int etymon_index_flush_l(ETYMON_INDEX_INDEXING_STATE* state) {
943 if (state->pcache_l_write != 0) {
944 etymon_index_write_l(state->udict_fd, state->pcache_l_write, &(state->pcache_l));
945 state->pcache_l_write = 0;
946 return 1;
947 } else {
948 return 0;
949 }
950 }
951
952
/*
 * Binary search for word among the keys of a non-leaf page.
 *
 * Key i occupies page->keys[page->offset[i] .. page->offset[i + 1]).
 * word is compared with strncmp() over the key's length; when that
 * prefix is equal, the difference between word_len and the key length
 * decides the order.
 *
 * Returns the index of the child pointer to follow: one past the
 * matching key on an exact match, otherwise the position at which the
 * search range closed.  An empty page yields 0.
 */
int etymon_index_search_keys_nl(unsigned char* word, size_t word_len, ETYMON_INDEX_PAGE_NL* page) {
	int lo, hi;

	if (page->n == 0)
		return 0;

	lo = 0;
	hi = page->n - 1;
	while (lo <= hi) {
		int mid = (lo + hi) / 2;
		int key_len = page->offset[mid + 1] - page->offset[mid];
		int order = strncmp((char*)word, (char*)(page->keys + page->offset[mid]), key_len);

		/* equal prefix: the shorter string sorts first */
		if (order == 0)
			order = word_len - key_len;

		if (order == 0)
			return mid + 1; /* different from a leaf search - match gives right side pointer */

		if (order < 0)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return lo;
}
981
982
/*
 * Binary search for word among the keys of a leaf page.
 *
 * Key i occupies page->keys[page->offset[i] .. page->offset[i + 1]).
 * word is compared with strncmp() over the key's length; when that
 * prefix is equal, the difference between word_len and the key length
 * decides the order.
 *
 * On an exact match *match is set to 1 and the key's index is
 * returned.  Otherwise *match is set to 0 and the position at which
 * the search range closed is returned (0 for an empty page).
 */
int etymon_index_search_keys_l(unsigned char* word, size_t word_len, ETYMON_INDEX_PAGE_L* page, int* match) {
	int lo, hi;

	*match = 0;
	if (page->n == 0)
		return 0;

	lo = 0;
	hi = page->n - 1;
	while (lo <= hi) {
		int mid = (lo + hi) / 2;
		int key_len = page->offset[mid + 1] - page->offset[mid];
		int order = strncmp((char*)word, (char*)(page->keys + page->offset[mid]), key_len);

		/* equal prefix: the shorter string sorts first */
		if (order == 0)
			order = word_len - key_len;

		if (order == 0) {
			*match = 1;
			return mid; /* different from non-leaf search - match gives left side pointer */
		}

		if (order < 0)
			hi = mid - 1;
		else
			lo = mid + 1;
	}
	return lo;
}
1014
1015
/*
 * Writes the posting for word-cache entry wcache_p and links it in
 * front of the posting chain of key ins on page_l.
 *
 * The entry's cached field records are written to state->ufield_fd
 * and, when state->number_words is set, its cached word numbers to
 * state->uword_fd.  Each record written stores in .next the 1-based
 * number of the record written just before it (0 for the first), and
 * the posting stores the number of the last record written, or 0 if
 * none were.  The posting itself goes to state->upost_fd with .next
 * set to the previous value of page_l->post[ins]; page_l->post[ins]
 * then becomes the new posting's 1-based number and
 * page_l->post_n[ins] is incremented.
 *
 * Failed writes are reported with perror() and otherwise ignored.
 */
void etymon_index_write_upost(ETYMON_INDEX_INDEXING_STATE* state, int wcache_p, ETYMON_INDEX_PAGE_L* page_l, int ins) {
	int node;
	Uint4 chain_head;

	state->upost.doc_id = state->wcache[wcache_p].doc_id;
	state->upost.freq = state->wcache[wcache_p].freq;
	state->upost.next = page_l->post[ins];

	/* write out fields; a cache index of -1 ends the chain, so an
	   entry without fields leaves both the pointer and the count at 0 */
	chain_head = 0;
	state->upost.fields_n = 0;
	for (node = state->wcache[wcache_p].fields; node != -1; node = state->fcache[node].next) {
		memcpy(state->ufield.fields, state->fcache[node].f, ETYMON_MAX_FIELD_NEST * 2);
		state->ufield.next = chain_head;
		if (write(state->ufield_fd, &(state->ufield), sizeof(ETYMON_INDEX_UFIELD)) == -1)
			perror("index_write_upost():write()");
		state->ufield_isize++;
		chain_head = state->ufield_isize;
		state->upost.fields_n++;
	}
	state->upost.fields = chain_head;

	/* write out word numbers, but only when they are being recorded */
	chain_head = 0;
	state->upost.word_numbers_n = 0;
	if (state->number_words) {
		for (node = state->wcache[wcache_p].word_numbers_head; node != -1; node = state->wncache[node].next) {
			state->uword.wn = state->wncache[node].wn;
			state->uword.next = chain_head;
			if (write(state->uword_fd, &(state->uword), sizeof(ETYMON_INDEX_UWORD)) == -1)
				perror("index_write_upost():write()");
			state->uword_isize++;
			chain_head = state->uword_isize;
			state->upost.word_numbers_n++;
		}
	}
	state->upost.word_numbers = chain_head;

	/* the key now points at the posting about to be written */
	page_l->post[ins] = state->upost_isize + 1;
	page_l->post_n[ins]++;

	/* now write out the new upost */
	if (write(state->upost_fd, &(state->upost), sizeof(ETYMON_INDEX_UPOST)) == -1)
		perror("index_write_upost():write()");
	state->upost_isize++;
}
1079
1080
/*
 * Opens a slot for a new key at index ins of a leaf page and copies word
 * (word_len bytes, no terminator) into it.
 *
 * The key bytes, the post / post_n entries and the offset directory of
 * every key from ins onward are shifted up by one slot, and page->n is
 * incremented.  post[ins] and post_n[ins] are NOT initialised here; they
 * keep whatever was there before and must be set by the caller.
 * No check is made that the page has room for the key.
 */
void etymon_index_insert_key_l(ETYMON_INDEX_PAGE_L* page, int ins, unsigned char* word, size_t word_len) {
	int slot;
	unsigned char* gap = page->keys + page->offset[ins];

	/* make room in the key buffer, unless we are appending at its end */
	if (ins < page->n) {
		memmove(gap + word_len, gap, page->offset[page->n] - page->offset[ins]);
	}
	memcpy(gap, word, word_len);

	/* shift the posting data one slot to the right */
	memmove(page->post + ins + 1, page->post + ins, (page->n - ins) * sizeof(Uint4));
	memmove(page->post_n + ins + 1, page->post_n + ins, (page->n - ins) * sizeof(Uint4));

	/* shift the offsets directory, adding word_len to each moved offset */
	for (slot = page->n; slot >= ins; slot--) {
		page->offset[slot + 1] = page->offset[slot] + word_len;
	}
	page->n++;
}
1099
1100
/*
 * Opens a slot for a new key at index ins of a non-leaf page and copies
 * word (word_len bytes, no terminator) into it.
 *
 * The key bytes and the offset directory of every key from ins onward are
 * shifted up by one slot, the page pointers p[ins + 1] and above move up
 * by one, and page->n is incremented.  p[ins + 1] keeps its old value and
 * must be set by the caller to the page on the right of the new key.
 * No check is made that the page has room for the key.
 */
void etymon_index_insert_key_nl(ETYMON_INDEX_PAGE_NL* page, int ins, unsigned char* word, size_t word_len) {
	int slot;
	unsigned char* gap = page->keys + page->offset[ins];

	/* make room in the key buffer, unless we are appending at its end */
	if (ins < page->n) {
		memmove(gap + word_len, gap, page->offset[page->n] - page->offset[ins]);
	}
	memcpy(gap, word, word_len);

	/* shift the page pointers to the right of the new key */
	memmove(page->p + ins + 2, page->p + ins + 1, (page->n - ins) * sizeof(Uint4));

	/* shift the offsets directory, adding word_len to each moved offset */
	for (slot = page->n; slot >= ins; slot--) {
		page->offset[slot + 1] = page->offset[slot] + word_len;
	}
	page->n++;
}
1118
1119
1120 /* fills in word with the shortest separator between the two pages, and returns the word length */
etymon_index_shortest_sep_l(ETYMON_INDEX_PAGE_L * left,ETYMON_INDEX_PAGE_L * right,unsigned char * word)1121 int etymon_index_shortest_sep_l(ETYMON_INDEX_PAGE_L* left, ETYMON_INDEX_PAGE_L* right, unsigned char* word) {
1122 static int p;
1123 static int max;
1124 static unsigned char* left_word;
1125 static int left_word_len;
1126 max = right->offset[1];
1127 left_word = left->keys + left->offset[left->n - 1];
1128 left_word_len = left->offset[left->n] - left->offset[left->n - 1];
1129 p = 0;
1130 do {
1131 word[p] = right->keys[p];
1132 p++;
1133 } while ( (p < max) && (left_word[p - 1] >= word[p - 1]) );
1134 word[p] = '\0';
1135 return p;
1136 }
1137
1138
1139 /* fills in word with the shortest separator between the two pages, and returns the word length */
etymon_index_shortest_sep_nl(ETYMON_INDEX_PAGE_NL * left,ETYMON_INDEX_PAGE_NL * right,unsigned char * word)1140 int etymon_index_shortest_sep_nl(ETYMON_INDEX_PAGE_NL* left, ETYMON_INDEX_PAGE_NL* right, unsigned char* word) {
1141 static int p;
1142 static int max;
1143 static unsigned char* left_word;
1144 static int left_word_len;
1145 max = right->offset[1];
1146 left_word = left->keys + left->offset[left->n - 1];
1147 left_word_len = left->offset[left->n] - left->offset[left->n - 1];
1148 p = 0;
1149 do {
1150 word[p] = right->keys[p];
1151 p++;
1152 } while ( (p < max) && (left_word[p - 1] >= word[p - 1]) );
1153 word[p] = '\0';
1154 return p;
1155 }
1156
1157
/*
 * Inserts the separator key word, together with the position of the page
 * on its right (child), into the cached non-leaf page at the given level
 * of the current descent path (state->pcache_nl[level]).
 *
 * - level < 0 means the page that was just split was the root: a new root
 *   holding only this key is built in cache slot 0, written at the end of
 *   the dictionary file, and the rest of the page cache is invalidated.
 * - If the page is full it is split: the upper half of its keys moves to
 *   a new page appended to the dictionary file, the median key is removed
 *   and passed up to level - 1 (recursively), and the new key goes into
 *   whichever half it belongs to.
 * - Otherwise the key is simply inserted and the page written back.
 *
 * word must be NUL-terminated at word[word_len]; the callers visible in
 * this file pass such a buffer, and the strncmp()-based key comparisons
 * rely on it.
 */
void etymon_index_parent_add_key(ETYMON_INDEX_INDEXING_STATE* state, int level, unsigned char* word, size_t word_len,
				 Uint4 child) {
	int x, y, ins, comp;
	unsigned char new_word[ETYMON_MAX_WORD_SIZE];
	size_t new_word_len;
	Uint4 overflow_pos;

	/* first check if we have ascended above the root of the tree */
	if (level < 0) {
		/* if so, we create a new root page */
		/* we can do it in place of the old root position in the pcache, and invalidate the rest of the pcache */
		state->pcache_nl[0].nl.n = 1;
		state->pcache_nl[0].nl.p[0] = state->pcache_nl[0].pos; /* grab the pos from the now former root page */
		state->pcache_nl[0].nl.p[1] = child;
		state->pcache_nl[0].nl.offset[0] = 0;
		state->pcache_nl[0].nl.offset[1] = word_len;
		memcpy(state->pcache_nl[0].nl.keys, word, word_len);
		state->pcache_nl[0].pos = state->udict_size;
		state->pcache_nl[0].is_nl = 1;
		state->pcache_count = 1;
		/* now write out the new root page */
		etymon_index_write_nl(state->udict_fd, state->pcache_nl[0].pos, &(state->pcache_nl[0].nl));
		state->udict_size += sizeof(Uint1) + sizeof(ETYMON_INDEX_PAGE_NL);
		/* set new root pointer */
		state->udict_root = state->pcache_nl[0].pos;
		return;
	}

	/* check if page is full */
	if ( (state->pcache_nl[level].nl.n >= ETYMON_MAX_KEYS_NL) ||
	     ((ETYMON_MAX_KEY_AREA_NL - state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n]) /* empty key space */
	      < (int)word_len) ) {

		/* split the page */

		/* allocate new non-leaf page for split overflow */
		/* we move half of the keys into the overflow page */
		state->overflow_nl.n = state->pcache_nl[level].nl.n / 2;
		/* move the offsets - by hand, rebased so the overflow keys start at 0 */
		y = state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - state->overflow_nl.n];
		for (x = 0; x <= state->overflow_nl.n; x++) {
			state->overflow_nl.offset[x] =
				state->pcache_nl[level].nl.offset[x + state->pcache_nl[level].nl.n -
								 state->overflow_nl.n] - y;
		}

		/* move the keys */
		memcpy(state->overflow_nl.keys,
		       state->pcache_nl[level].nl.keys +
		       state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - state->overflow_nl.n],
		       state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n] -
		       state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - state->overflow_nl.n]);

		/* move the page pointers (one more pointer than keys) */
		memcpy(state->overflow_nl.p,
		       state->pcache_nl[level].nl.p +
		       state->pcache_nl[level].nl.n - state->overflow_nl.n,
		       (state->overflow_nl.n + 1) * sizeof(Uint4));

		state->pcache_nl[level].nl.n -= state->overflow_nl.n;

		/* remove the median key (now at the end of the old page),
		   which we remember and will insert into the parent */
		new_word_len = state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n] -
			state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - 1];
		memcpy(new_word, state->pcache_nl[level].nl.keys +
		       state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - 1], new_word_len);
		new_word[new_word_len] = '\0';
		state->pcache_nl[level].nl.n--;

		/* insert the new key into either the old or new nl page */
		/* search in the new page */
		ins = etymon_index_search_keys_nl(word, word_len, &(state->overflow_nl));
		/* a result of 0 only says that the key sorts before every
		   key of the new page; whether it belongs in the old page
		   or at the front of the new one depends on which side of
		   the median it falls */
		comp = 0;
		if (ins == 0) {
			comp = strncmp((char*)word, (char*)new_word, new_word_len);
			if (comp == 0) {
				comp = (int)word_len - (int)new_word_len;
			}
		}
		if ( (ins == 0) && (comp < 0) ) {
			/* below the median: it goes in the old page, at its
			   sorted position, with child on its right */
			ins = etymon_index_search_keys_nl(word, word_len, &(state->pcache_nl[level].nl));
			etymon_index_insert_key_nl(&(state->pcache_nl[level].nl), ins, word, word_len);

			state->pcache_nl[level].nl.p[ins + 1] = child;
		} else {
			/* it goes in the new page */
			etymon_index_insert_key_nl(&(state->overflow_nl), ins, word, word_len);

			state->overflow_nl.p[ins + 1] = child;
		}

		/* write out old nl page */
		etymon_index_write_nl(state->udict_fd, state->pcache_nl[level].pos, &(state->pcache_nl[level].nl));
		/* write out new overflow page */
		overflow_pos = state->udict_size;
		etymon_index_write_nl(state->udict_fd, overflow_pos, &(state->overflow_nl));
		/* update size of dictionary file */
		state->udict_size += sizeof(Uint1) + sizeof(ETYMON_INDEX_PAGE_NL);

		/* recursively update parent page, splitting if necessary */
		etymon_index_parent_add_key(state, level - 1, new_word, new_word_len, overflow_pos);

	} else {
		/* if not full, simply insert the new key */
		ins = etymon_index_search_keys_nl(word, word_len, &(state->pcache_nl[level].nl));
		etymon_index_insert_key_nl(&(state->pcache_nl[level].nl), ins, word, word_len);
		state->pcache_nl[level].nl.p[ins + 1] = child;
		/* write out the updated page */
		etymon_index_write_nl(state->udict_fd, state->pcache_nl[level].pos, &(state->pcache_nl[level].nl));
	}

}
1266
1267
1268 /* returns 0 if everything was OK */
etymon_index_index_word(ETYMON_INDEX_INDEXING_STATE * state,int wcache_p)1269 int etymon_index_index_word(ETYMON_INDEX_INDEXING_STATE* state, int wcache_p) {
1270 static int x, y, match, ins;
1271 static Uint4 p;
1272 static ssize_t nbytes;
1273 static Uint1 leaf_flag;
1274 static unsigned char* word;
1275 static unsigned char new_word[ETYMON_MAX_WORD_SIZE];
1276 static size_t word_len, new_word_len;
1277 static int level;
1278
1279 word = state->wcache[wcache_p].word;
1280
1281 /* search for the right page */
1282
1283 /* start at the root page */
1284 p = state->udict_root;
1285 level = 0;
1286
1287 /* search the tree by descent */
1288 do {
1289
1290 /* get the page from the cache if available */
1291
1292 if ( (level < state->pcache_count) &&
1293 (state->pcache_nl[level].pos == p) ) {
1294 /* yes, it is in the cache */
1295 /* check whether it is a leaf page */
1296 if (state->pcache_nl[level].is_nl == 1) {
1297 leaf_flag = 0;
1298 } else {
1299 leaf_flag = 1;
1300 }
1301 } else {
1302 /* not in the cache, so read it from disk */
1303 if (etymon_af_lseek(state->udict_fd, (etymon_af_off_t)p, SEEK_SET) == -1) {
1304 perror("index_index_word():lseek()");
1305 }
1306 /* check whether it is a leaf page */
1307 nbytes = read(state->udict_fd, &(leaf_flag), sizeof(Uint1));
1308 if (nbytes != sizeof(Uint1)) {
1309 /* ERROR */
1310 printf("error reading from index (LP)\n");
1311 exit(1);
1312 }
1313 if (leaf_flag == 0) {
1314 /* read the non-leaf page into the right cache position */
1315 state->pcache_nl[level].pos = p;
1316 state->pcache_nl[level].is_nl = 1;
1317 state->pcache_count = level + 1;
1318 nbytes = read(state->udict_fd, &(state->pcache_nl[level].nl),
1319 sizeof(ETYMON_INDEX_PAGE_NL));
1320 if (nbytes != sizeof(ETYMON_INDEX_PAGE_NL)) {
1321 /* ERROR */
1322 printf("error reading from index (NL)\n");
1323 exit(1);
1324 }
1325 } else {
1326
1327 /* read the leaf page into the leaf page cache */
1328
1329 if (etymon_index_flush_l(state) == 1) { /* flush leaf write cache first */
1330 if (etymon_af_lseek(state->udict_fd, (etymon_af_off_t)(p + 1), SEEK_SET) == -1) {
1331 perror("index_index_word():lseek()");
1332 }
1333 }
1334
1335 state->pcache_nl[level].pos = p;
1336 state->pcache_nl[level].is_nl = 0;
1337 state->pcache_count = level + 1;
1338 nbytes = read(state->udict_fd, &(state->pcache_l), sizeof(ETYMON_INDEX_PAGE_L));
1339 if (nbytes != sizeof(ETYMON_INDEX_PAGE_L)) {
1340 /* ERROR */
1341 printf("error reading from index (L)\n");
1342 exit(1);
1343 }
1344 }
1345 }
1346
1347 /* if it is not a leaf page, determine next seek */
1348 if (leaf_flag == 0) {
1349 ins = etymon_index_search_keys_nl(word, word_len,
1350 &(state->pcache_nl[level].nl));
1351 p = state->pcache_nl[level].nl.p[ins];
1352 level++;
1353 /* Internal overflow (level) */
1354 if (level >= ETYMON_MAX_PAGE_DEPTH)
1355 return aferr(AFEUNKNOWN);
1356 }
1357
1358 } while (leaf_flag == 0);
1359 /* we have reached a leaf page */
1360
1361 word_len = strlen((char*)word);
1362
1363 /* determine position in key list to insert new key */
1364 ins = etymon_index_search_keys_l(word, word_len,
1365 &(state->pcache_l), &match);
1366
1367 /* if we found a perfect match, then no need to insert the key; just add it to the postings */
1368 if (match) {
1369
1370 etymon_index_write_upost(state, wcache_p, &(state->pcache_l), ins);
1371
1372 /* tag leaf for write cache */
1373 state->pcache_l_write = p;
1374
1375 } else {
1376 /* insert the key */
1377 /* check if page is full */
1378 if ( (state->pcache_l.n >= ETYMON_MAX_KEYS_L) ||
1379 ((ETYMON_MAX_KEY_AREA_L - state->pcache_l.offset[state->pcache_l.n]) /* empty key space */
1380 < (int)word_len) ) {
1381
1382 /* if so, we split the page */
1383
1384 /* allocate new leaf page for split overflow */
1385 /* we move half of the keys into the overflow leaf page */
1386 state->overflow_l.n = state->pcache_l.n / 2;
1387 state->overflow_l.prev = p;
1388 state->overflow_l.next = state->pcache_l.next;
1389 /* move the posting data */
1390 memcpy(state->overflow_l.post,
1391 state->pcache_l.post +
1392 state->pcache_l.n - state->overflow_l.n,
1393 state->overflow_l.n * sizeof(Uint4));
1394 memcpy(state->overflow_l.post_n,
1395 state->pcache_l.post_n +
1396 state->pcache_l.n - state->overflow_l.n,
1397 state->overflow_l.n * sizeof(Uint4));
1398 /* move the offsets - by hand */
1399 y = state->pcache_l.offset[state->pcache_l.n - state->overflow_l.n];
1400 for (x = 0; x <= state->overflow_l.n; x++) {
1401 state->overflow_l.offset[x] =
1402 state->pcache_l.offset[x + state->pcache_l.n - state->overflow_l.n] - y;
1403 }
1404 /* move the keys */
1405 memcpy(state->overflow_l.keys,
1406 state->pcache_l.keys +
1407 state->pcache_l.offset[state->pcache_l.n - state->overflow_l.n],
1408 state->pcache_l.offset[state->pcache_l.n] -
1409 state->pcache_l.offset[state->pcache_l.n - state->overflow_l.n]);
1410 state->pcache_l.n -= state->overflow_l.n;
1411 state->pcache_l.next = state->udict_size;
1412
1413 /* insert the new key into either the old or new leaf page */
1414 if (ins <= state->pcache_l.n) {
1415 /* it goes in the old page */
1416 etymon_index_insert_key_l(&(state->pcache_l), ins, word, word_len);
1417
1418 state->pcache_l.post[ins] = 0;
1419 state->pcache_l.post_n[ins] = 0;
1420 etymon_index_write_upost(state, wcache_p, &(state->pcache_l), ins);
1421 } else {
1422 /* it goes in the new page */
1423
1424 x = etymon_index_search_keys_l(word, word_len,
1425 &(state->overflow_l), &match);
1426
1427 etymon_index_insert_key_l(&(state->overflow_l), x, word, word_len);
1428
1429 state->overflow_l.post[x] = 0;
1430 state->overflow_l.post_n[x] = 0;
1431 etymon_index_write_upost(state, wcache_p, &(state->overflow_l), x);
1432 }
1433
1434 /* tag old leaf for write caching */
1435 state->pcache_l_write = p;
1436 /* write out new overflow page */
1437 etymon_index_write_l(state->udict_fd, state->udict_size, &(state->overflow_l));
1438 y = state->udict_size; /**/
1439 /* update size of dictionary file */
1440 state->udict_size += sizeof(Uint1) + sizeof(ETYMON_INDEX_PAGE_L);
1441
1442 /* update prev pointer in far-right leaf to point to overflow_l */
1443 if (state->overflow_l.next != 0) {
1444 if (etymon_af_lseek(state->udict_fd,
1445 (etymon_af_off_t)(state->overflow_l.next + 1), SEEK_SET) == -1) {
1446 perror("index_index_word():lseek()");
1447 }
1448 nbytes = read(state->udict_fd, &(state->extra_l), sizeof(ETYMON_INDEX_PAGE_L));
1449 if (nbytes == -1) {
1450 perror("index_index_word():read()");
1451 }
1452 state->extra_l.prev = state->pcache_l.next;
1453 if (etymon_af_lseek(state->udict_fd, (etymon_af_off_t)(state->overflow_l.next + 1), SEEK_SET) == -1) {
1454 perror("index_index_word():lseek()");
1455 }
1456 nbytes = write(state->udict_fd, &(state->extra_l), sizeof(ETYMON_INDEX_PAGE_L));
1457 if (nbytes == -1) {
1458 perror("index_index_word():write()");
1459 }
1460 }
1461
1462 /* we want to insert a new key in the parent to fork the split */
1463
1464 new_word_len = etymon_index_shortest_sep_l(&(state->pcache_l), &(state->overflow_l),
1465 new_word);
1466
1467 /* recursively update parent page, splitting if necessary */
1468 etymon_index_parent_add_key(state, level - 1, new_word, new_word_len,
1469 state->pcache_l.next);
1470
1471 } else {
1472
1473 /* if not, simply scoot keys over and insert new key */
1474 etymon_index_insert_key_l(&(state->pcache_l), ins, word, word_len);
1475 state->pcache_l.post[ins] = 0;
1476 state->pcache_l.post_n[ins] = 0;
1477
1478 /* next add postings for the new key */
1479
1480 etymon_index_write_upost(state, wcache_p, &(state->pcache_l), ins);
1481
1482 /* tag leaf for write cache */
1483 state->pcache_l_write = p;
1484 }
1485
1486 }
1487
1488 return 0;
1489 }
1490
1491
1492 /* returns 0 if everything was OK */
etymon_index_traverse_wcache(ETYMON_INDEX_INDEXING_STATE * state,int p)1493 int etymon_index_traverse_wcache(ETYMON_INDEX_INDEXING_STATE* state, int p) {
1494 int c;
1495 int start;
1496 if (state->wcache[p].left != -1) {
1497 if (etymon_index_traverse_wcache(state, state->wcache[p].left) == -1) {
1498 return -1;
1499 }
1500 }
1501 start = state->wcache[p].next;
1502 c = start;
1503 do {
1504 if (etymon_index_index_word(state, c) == -1) {
1505 return -1;
1506 }
1507 c = state->wcache[c].next;
1508 } while (c != start);
1509 if (state->wcache[p].right != -1) {
1510 if (etymon_index_traverse_wcache(state, state->wcache[p].right) == -1) {
1511 return -1;
1512 }
1513 }
1514 return 0;
1515 }
1516
1517
1518 /* returns 0 if everything was OK */
etymon_index_dclass_index(ETYMON_INDEX_INDEXING_STATE * state)1519 int etymon_index_dclass_index(ETYMON_INDEX_INDEXING_STATE* state) {
1520 static Uint1 leaf_flag;
1521 static ssize_t nbytes;
1522
1523 if (!state->flushmsg) {
1524 state->flushmsg = 1;
1525 afprintv(state->verbose, 2, "Flushing index buffers");
1526 }
1527 /* make sure there is at least one page (root) */
1528 if (state->udict_root == 0) {
1529 /* seek to offset 0 and write one zero byte (unused) */
1530 if (etymon_af_lseek(state->udict_fd, (etymon_af_off_t)0, SEEK_SET) == -1) {
1531 perror("index_dclass_index():lseek()");
1532 }
1533 leaf_flag = 0; /* we'll use the leaf_flag variable, but this isn't really a leaf flag byte */
1534 nbytes = write(state->udict_fd, &(leaf_flag), 1);
1535 if (nbytes == -1) {
1536 perror("index_dclass_index():write()");
1537 }
1538 /* write an empty root page now at offset 1 */
1539 state->pcache_l.n = 0;
1540 state->pcache_l.prev = 0;
1541 state->pcache_l.next = 0;
1542 state->pcache_l.offset[0] = 0;
1543 leaf_flag = 1;
1544 nbytes = write(state->udict_fd, &(leaf_flag), sizeof(Uint1));
1545 if (nbytes != sizeof(Uint1)) {
1546 /* ERROR */
1547 printf("error writing to index\n");
1548 exit(1);
1549 }
1550 nbytes = write(state->udict_fd, &(state->pcache_l), sizeof(ETYMON_INDEX_PAGE_L));
1551 if (nbytes != sizeof(ETYMON_INDEX_PAGE_L)) {
1552 /* ERROR */
1553 printf("error writing to index\n");
1554 exit(1);
1555 }
1556 /* update size of dictionary file */
1557 state->udict_size += 2 + sizeof(ETYMON_INDEX_PAGE_L);
1558 /* set root pointer */
1559 state->udict_root = 1;
1560 /* root page is now cached and it is a leaf */
1561 state->pcache_nl[0].pos = 1;
1562 state->pcache_nl[0].is_nl = 0;
1563 state->pcache_count = 1;
1564 }
1565
1566 if (state->wcache_count > 0) {
1567 if (etymon_index_traverse_wcache(state, state->wcache_root) == -1) {
1568 return -1;
1569 }
1570 }
1571 return 0;
1572 }
1573
1574
1575 /* return 0 if everything went well */
etymon_index_dclass_finish(ETYMON_INDEX_INDEXING_STATE * state)1576 int etymon_index_dclass_finish(ETYMON_INDEX_INDEXING_STATE* state) {
1577 /* perform last indexing pass */
1578 if (etymon_index_dclass_index(state) == -1) {
1579 return -1;
1580 }
1581 /* flush write cached leaf node */
1582 etymon_index_flush_l(state);
1583
1584 return 0;
1585 }
1586
1587
etymon_af_index_get_split_list(ETYMON_AF_DC_INDEX * dc_index,char * split)1588 ETYMON_AF_DC_SPLIT* etymon_af_index_get_split_list(ETYMON_AF_DC_INDEX*
1589 dc_index, char* split) {
1590 ETYMON_AF_DC_SPLIT* split_list;
1591 ETYMON_AF_DC_SPLIT* split_p;
1592 ETYMON_DOCBUF* docbuf = dc_index->docbuf;
1593 int split_len = strlen(split);
1594 int split_match = 0; /* number of characters matched with
1595 delimiter string */
1596 unsigned char ch;
1597 Uint4 offset = 0;
1598
1599 /* initialize split list */
1600 split_list =
1601 (ETYMON_AF_DC_SPLIT*)(malloc(
1602 sizeof(ETYMON_AF_DC_SPLIT)));
1603 split_p = split_list;
1604
1605 /* return if the document size is 0 */
1606 if (docbuf->data_len == 0) {
1607 split_list->end = 0;
1608 split_list->next = NULL;
1609 return split_list;
1610 }
1611
1612 /* skip first character to avoid a 0 length document resulting
1613 from an immediate match */
1614 etymon_docbuf_next_char(docbuf);
1615 offset++;
1616
1617 /* find matches to the delimiter string */
1618 while ( ! docbuf->eof ) {
1619 ch = etymon_docbuf_next_char(docbuf);
1620 offset++;
1621 if (ch == split[split_match]) {
1622 split_match++;
1623 if (split_match == split_len) {
1624 split_match = 0;
1625 split_p->end = offset - split_len;
1626 split_p->next = (ETYMON_AF_DC_SPLIT*)(malloc(
1627 sizeof(ETYMON_AF_DC_SPLIT)));
1628 split_p = split_p->next;
1629 }
1630 } else {
1631 if (split_match != 0) {
1632 split_match = 0;
1633 }
1634 }
1635 }
1636
1637 split_p->end = docbuf->st.st_size;
1638 split_p->next = NULL;
1639
1640 return split_list;
1641 }
1642
1643
1644 /* returns 0 if everything went OK */
etymon_index_add_files(Afindex * opt)1645 int etymon_index_add_files(Afindex *opt) {
1646 ETYMON_DOCBUF* docbuf;
1647 ETYMON_INDEX_INDEXING_STATE* state;
1648 char s_file[ETYMON_MAX_PATH_SIZE];
1649 char* source_file;
1650 char fn[ETYMON_MAX_PATH_SIZE];
1651 char cwd[ETYMON_MAX_PATH_SIZE];
1652 ETYMON_AF_STAT st;
1653 ssize_t nbytes;
1654 size_t maxmem, memleft;
1655 /* int dbinfo_fd; */
1656 int x_file;
1657 Uint4 magic;
1658 ETYMON_DB_INFO *dbinfo;
1659 int dclass_id;
1660 int result;
1661 int fdef_fd;
1662 int done_files;
1663 int file_good;
1664 int x;
1665 size_t wcache_alloc, fcache_alloc, wncache_alloc;
1666 ETYMON_AF_DC_INDEX dc_index;
1667 ETYMON_AF_DC_INIT dc_init;
1668 ETYMON_AF_DC_SPLIT* split_list = NULL;
1669 ETYMON_AF_DC_SPLIT* split_p;
1670 int use_docbuf; /* 1: use docbuf; 0: don't use it */
1671 char *dbname;
1672
1673 dbname = etymon_af_state[opt->dbid]->dbname;
1674
1675 maxmem = ((size_t) opt->memory) * 1048576 - 1315000;
1676
1677 /* make sure database is ready */
1678 if (etymon_db_ready(dbname) == 0)
1679 return aferr(AFEDBLOCK);
1680
1681 /* lock the database */
1682 etymon_db_lock(dbname, NULL);
1683
1684 /* open db info file for read/write */
1685 /*
1686 etymon_db_construct_path(ETYMON_DBF_INFO, dbname, fn);
1687 dbinfo_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE);
1688 if (dbinfo_fd == -1) {
1689 etymon_db_unlock(dbname);
1690 return aferr(AFEDBIO);
1691 }
1692 nbytes = read(dbinfo_fd, &magic, sizeof(Uint4));
1693 if (nbytes != sizeof(Uint4)) {
1694 printf("unable to read %s\n", fn);
1695 exit(1);
1696 }
1697 if (magic != ETYMON_INDEX_MAGIC) {
1698 close(dbinfo_fd);
1699 etymon_db_unlock(dbname);
1700 return aferr(AFEVERSION);
1701 }
1702 nbytes = read(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
1703 if (nbytes != sizeof(ETYMON_DB_INFO)) {
1704 printf("unable to read %s\n", fn);
1705 exit(1);
1706 }
1707 */
1708 dbinfo = &(etymon_af_state[opt->dbid]->info);
1709
1710 if (dbinfo->stemming && !af_stem_available()) {
1711 etymon_db_unlock(dbname);
1712 return aferr(AFENOSTEM);
1713 }
1714
1715 /* we can only add files if the database is not optimized */
1716 if (dbinfo->optimized == 1) {
1717 etymon_db_unlock(dbname);
1718 return aferr(AFELINEAR);
1719 }
1720
1721 /* set up state information for indexing */
1722
1723 state = (ETYMON_INDEX_INDEXING_STATE*)(malloc(sizeof(ETYMON_INDEX_INDEXING_STATE)));
1724
1725 state->udict_root = dbinfo->udict_root;
1726 state->doc_n = dbinfo->doc_n;
1727 state->dbname = dbname;
1728 state->verbose = opt->verbose;
1729 state->long_words = opt->_longwords;
1730
1731 /* open doctable for append */
1732 etymon_db_construct_path(ETYMON_DBF_DOCTABLE, dbname, fn);
1733 state->doctable_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1734 if (state->doctable_fd == -1) {
1735 /* ERROR */
1736 printf("unable to open %s for append\n", fn);
1737 exit(1);
1738 }
1739 /* stat doctable to get size */
1740 if (etymon_af_fstat(state->doctable_fd, &st) == -1) {
1741 perror("index_add_files():fstat()");
1742 }
1743 state->doctable_next_id = st.st_size / sizeof(ETYMON_DOCTABLE) + 1;
1744
1745 /* open udict for read/write */
1746 etymon_db_construct_path(ETYMON_DBF_UDICT, dbname, fn);
1747 state->udict_fd = open(fn, O_RDWR | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1748 if (state->udict_fd == -1) {
1749 /* ERROR */
1750 printf("unable to open %s for read/write\n", fn);
1751 exit(1);
1752 }
1753 /* stat udict to get size */
1754 if (etymon_af_fstat(state->udict_fd, &st) == -1) {
1755 perror("index_add_files():fstat()");
1756 }
1757 state->udict_size = st.st_size;
1758
1759 /* open upost for append */
1760 etymon_db_construct_path(ETYMON_DBF_UPOST, dbname, fn);
1761 state->upost_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1762 if (state->upost_fd == -1) {
1763 /* ERROR */
1764 printf("unable to open %s for append\n", fn);
1765 exit(1);
1766 }
1767 /* stat upost to get size */
1768 if (etymon_af_fstat(state->upost_fd, &st) == -1) {
1769 perror("index_add_files():fstat()");
1770 }
1771 state->upost_isize = st.st_size / sizeof(ETYMON_INDEX_UPOST);
1772
1773 /* open ufield for append */
1774 etymon_db_construct_path(ETYMON_DBF_UFIELD, dbname, fn);
1775 state->ufield_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1776 if (state->ufield_fd == -1) {
1777 /* ERROR */
1778 printf("unable to open %s for append\n", fn);
1779 exit(1);
1780 }
1781 /* stat ufield to get size */
1782 if (etymon_af_fstat(state->ufield_fd, &st) == -1) {
1783 perror("index_add_files():fstat()");
1784 }
1785 state->ufield_isize = st.st_size / sizeof(ETYMON_INDEX_UFIELD);
1786
1787 /* open uword for append */
1788 etymon_db_construct_path(ETYMON_DBF_UWORD, dbname, fn);
1789 state->uword_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1790 if (state->uword_fd == -1) {
1791 /* ERROR */
1792 printf("unable to open %s for append\n", fn);
1793 exit(1);
1794 }
1795 /* stat uword to get size */
1796 if (etymon_af_fstat(state->uword_fd, &st) == -1) {
1797 perror("index_add_files():fstat()");
1798 }
1799 state->uword_isize = st.st_size / sizeof(ETYMON_INDEX_UWORD);
1800
1801 /* allocate memory for page cache */
1802 state->pcache_nl_size = ETYMON_MAX_PAGE_DEPTH;
1803 memleft = maxmem - ((size_t) (sizeof(ETYMON_INDEX_PCACHE_NODE) * ETYMON_MAX_PAGE_DEPTH));
1804 if (memleft < 1048576) {
1805 memleft = 1048576;
1806 }
1807 state->pcache_nl = (ETYMON_INDEX_PCACHE_NODE*)(malloc(sizeof(ETYMON_INDEX_PCACHE_NODE) * ETYMON_MAX_PAGE_DEPTH));
1808 if (state->pcache_nl == NULL) {
1809 /* ERROR */
1810 printf("unable to allocate memory for cache\n");
1811 exit(1);
1812 }
1813 state->pcache_count = 0;
1814 state->pcache_nl[0].pos = 0;
1815
1816 /* initialize the write cached leaf page */
1817 state->pcache_l_write = 0;
1818
1819 /* turn on word numbering */
1820 state->phrase = dbinfo->phrase;
1821 state->word_proximity = dbinfo->word_proximity;
1822 state->stemming = dbinfo->stemming;
1823 if ( (dbinfo->phrase) || (dbinfo->word_proximity) ) {
1824 state->number_words = 1;
1825 } else {
1826 state->number_words = 0;
1827 }
1828
1829 /* calculate cache memory allocation based on memleft */
1830 if (state->number_words) {
1831 wcache_alloc = (size_t) (memleft * .4);
1832 fcache_alloc = (size_t) (memleft * .3);
1833 wncache_alloc = (size_t) (memleft * .3);
1834 } else {
1835 wcache_alloc = (size_t) (memleft * .5);
1836 fcache_alloc = (size_t) (memleft * .5);
1837 wncache_alloc = 0;
1838 }
1839 /*
1840 if (wcache_alloc > 2000000000)
1841 wcache_alloc = 2000000000;
1842 if (fcache_alloc > 2000000000)
1843 fcache_alloc = 2000000000;
1844 if (wncache_alloc > 2000000000)
1845 wncache_alloc = 2000000000;
1846 */
1847
1848 /* allocate memory for word cache */
1849 state->wcache_size = wcache_alloc / ((size_t) sizeof(ETYMON_INDEX_WCACHE_NODE));
1850 state->wcache = (ETYMON_INDEX_WCACHE_NODE*)(malloc(sizeof(ETYMON_INDEX_WCACHE_NODE) * state->wcache_size));
1851 if (state->wcache == NULL) {
1852 /* ERROR */
1853 printf("unable to allocate memory for cache\n");
1854 exit(1);
1855 }
1856 state->wcache_count = 0;
1857 state->wcache_root = -1;
1858
1859 /* allocate memory for field cache */
1860 state->fcache_size = fcache_alloc / ((size_t) sizeof(ETYMON_INDEX_FCACHE_NODE));
1861 state->fcache = (ETYMON_INDEX_FCACHE_NODE*)(malloc(sizeof(ETYMON_INDEX_FCACHE_NODE) * state->fcache_size));
1862 if (state->fcache == NULL) {
1863 /* ERROR */
1864 printf("unable to allocate memory for cache\n");
1865 exit(1);
1866 }
1867 state->fcache_count = 0;
1868
1869 /* allocate memory for word number cache */
1870 if (state->number_words) {
1871 state->wncache_size = wncache_alloc / ((size_t) sizeof(ETYMON_INDEX_WNCACHE_NODE));
1872 state->wncache = (ETYMON_INDEX_WNCACHE_NODE*)(malloc(sizeof(ETYMON_INDEX_WNCACHE_NODE) *
1873 state->wncache_size));
1874 if (state->wncache == NULL) {
1875 /* ERROR */
1876 printf("unable to allocate memory for cache\n");
1877 exit(1);
1878 }
1879 state->wncache_count = 0;
1880 }
1881
1882 /* load field definitions into a binary tree */
1883 /* open fdef for read/write */
1884 etymon_db_construct_path(ETYMON_DBF_FDEF, dbname, fn);
1885 fdef_fd = open(fn, O_RDWR | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1886 if (fdef_fd == -1) {
1887 /* ERROR */
1888 printf("unable to open %s for read/write\n", fn);
1889 exit(1);
1890 }
1891 state->fdef_count = etymon_af_fdef_read_mem(fdef_fd, &(state->fdef_root), &(state->fdef_tail));
1892
1893 /* set dclass_id based on dclass string */
1894 if (strcmp(opt->doctype, "xml") == 0) {
1895 dclass_id = 1;
1896 }
1897 else if (strcmp(opt->doctype, "xml_test") == 0) {
1898 dclass_id = 2;
1899 }
1900 else if (strcmp(opt->doctype, "erc") == 0) {
1901 dclass_id = 100;
1902 } else {
1903 /* need to print an error here if the input is unknown
1904 - right now it's being handled in af.cc which
1905 is the wrong place */
1906 dclass_id = 0;
1907 }
1908
1909 /* set up parameters to pass in to document class init
1910 function */
1911 dc_init.use_docbuf = 1;
1912 dc_init.dc_state = NULL;
1913
1914 /* for now we hard code calls */
1915 switch (dclass_id) {
1916 case 2:
1917 dc_index.dclass_id = 2;
1918 result = dc_xml_test_init(&dc_init);
1919 break;
1920 case 100:
1921 dc_index.dclass_id = 100;
1922 result = dc_erc_init(&dc_init);
1923 break;
1924 case 1:
1925 #ifdef ETYMON_AF_XML
1926 dc_index.dclass_id = 1;
1927 result = dc_xml_init(&dc_init);
1928 break;
1929 #endif
1930 default:
1931 dc_index.dclass_id = 0;
1932 result = dc_text_init(&dc_init);
1933 }
1934 if (result == -1) {
1935 free(state->wcache);
1936 free(state->fcache);
1937 if (state->number_words) {
1938 free(state->wncache);
1939 }
1940 free(state->pcache_nl);
1941 close(state->doctable_fd);
1942 close(state->udict_fd);
1943 close(state->upost_fd);
1944 close(state->ufield_fd);
1945 close(fdef_fd);
1946 etymon_af_fdef_free_mem(state->fdef_root);
1947 free(state);
1948 return -1;
1949 }
1950
1951 use_docbuf = dc_init.use_docbuf;
1952
1953 if (use_docbuf) {
1954 /* set up buffering for input documents */
1955 docbuf = (ETYMON_DOCBUF*)(malloc(sizeof(ETYMON_DOCBUF)));
1956 docbuf->buf = NULL; /* first time it will be NULL,
1957 because we need to get
1958 st_blksize from stat */
1959 } else {
1960 docbuf = NULL;
1961 }
1962
1963 /* set up parameters to pass in to document class index function */
1964 dc_index.docbuf = docbuf;
1965 dc_index.filename = fn;
1966 dc_index.split_list = NULL;
1967 dc_index.dlevel = opt->dlevel;
1968 dc_index.state = state;
1969 dc_index.dc_state = dc_init.dc_state;
1970
1971 /* get cwd */
1972 getcwd(cwd, ETYMON_MAX_PATH_SIZE);
1973
1974 /* loop through and index each file */
1975 x_file = 0;
1976 done_files = 0;
1977 do {
1978
1979 file_good = 1;
1980
1981 /* load file into buffer */
1982 if (opt->_stdin) {
1983 if (fgets(s_file, ETYMON_MAX_PATH_SIZE, stdin) == NULL) {
1984 /* need to check fgets more correctly for errors */
1985 done_files = 1;
1986 break;
1987 } else {
1988 /* remove '\n' at end */
1989 x = strlen(s_file);
1990 if ( (x > 1) && (s_file[x - 1] == '\n') ) {
1991 s_file[x - 1] = '\0';
1992 }
1993 source_file = s_file;
1994 }
1995 } else {
1996 if (x_file == opt->sourcen) {
1997 done_files = 1;
1998 break;
1999 } else {
2000 source_file = opt->source[x_file];
2001 }
2002 }
2003
2004 if (done_files) {
2005 break;
2006 }
2007
2008 etymon_index_expand_path(source_file, fn, cwd);
2009
2010 if (use_docbuf) {
2011 docbuf->fn = fn;
2012 docbuf->filedes = open(docbuf->fn, O_RDONLY | ETYMON_AF_O_LARGEFILE);
2013 if (docbuf->filedes == -1) {
2014 /*
2015 int e;
2016 char s[ETYMON_MAX_MSG_SIZE];
2017 sprintf(s, "%s: No such file or directory", docbuf->fn);
2018 file_good = 0;
2019 e = opt->log.error(s, 0);
2020 if (e != 0) {
2021 exit(e);
2022 }
2023 */
2024 } else {
2025
2026 /* stat the file */
2027 if (etymon_af_fstat(docbuf->filedes,
2028 &(docbuf->st)) == -1) {
2029 perror("index_add_files():fstat()");
2030 }
2031 /* make sure it is a regular file */
2032 if (S_ISREG(docbuf->st.st_mode) == 0) {
2033 int e;
2034 char s[ETYMON_MAX_MSG_SIZE];
2035 sprintf(s,
2036 "%s: file not recognized: File format not recognized",
2037 docbuf->fn);
2038 file_good = 0;
2039 close(docbuf->filedes);
2040 /*
2041 e = opt->log.error(s, 0);
2042 if (e != 0) {
2043 exit(e);
2044 }
2045 */
2046 }
2047
2048 }
2049 }
2050
2051 if (file_good) {
2052
2053 state->flushmsg = 0;
2054
2055 if (opt->verbose >= 1) {
2056 if (opt->verbose >= 2) {
2057 printf("Indexing ");
2058 }
2059 printf("%s\n", fn);
2060 }
2061
2062 if (use_docbuf) {
2063 /* initialize the buffer if it hasn't been done */
2064 if (docbuf->buf == NULL) {
2065 docbuf->buf_size = docbuf->st.st_blksize;
2066 docbuf->buf = (unsigned char*)(malloc(docbuf->buf_size));
2067 }
2068 /* continue setting up to load the file */
2069 docbuf->eof = 0;
2070 /* read the first page from disk */
2071 etymon_docbuf_load_page(docbuf);
2072
2073 /* ok, the docbuf page has been
2074 prepared */
2075 /* check if we need to split the file
2076 into multiple documents */
2077 if (*(opt->split) != '\0') {
2078 split_list =
2079 etymon_af_index_get_split_list(
2080 &dc_index,
2081 (char *) opt->split);
2082 /* reset the docbuf page */
2083 if (etymon_af_lseek(docbuf->filedes,
2084 (etymon_af_off_t)0, SEEK_SET) == -1) {
2085 perror("index_add_files():lseek()");
2086 exit(-1);
2087 }
2088 docbuf->eof = 0;
2089 etymon_docbuf_load_page(docbuf);
2090 } else {
2091 /* otherwise set up a single
2092 node split list, marking
2093 the end of the file */
2094 split_list =
2095 (ETYMON_AF_DC_SPLIT*)(malloc(
2096 sizeof(ETYMON_AF_DC_SPLIT)));
2097 split_list->end = docbuf->st.st_size;
2098 split_list->next = NULL;
2099 }
2100 dc_index.split_list = split_list;
2101 }
2102
2103 /* here we must call the indexing function in the doctype,
2104 handing it a pointer to a struct of call back functions */
2105 /* for now we hard code calls */
2106 switch (dclass_id) {
2107 case 2:
2108 dc_index.dclass_id = 2;
2109 result = dc_xml_test_index(&dc_index);
2110 break;
2111 case 100:
2112 dc_index.dclass_id = 100;
2113 result = dc_erc_index(&dc_index);
2114 break;
2115 case 1:
2116 #ifdef ETYMON_AF_XML
2117 dc_index.dclass_id = 1;
2118 result = dc_xml_index(&dc_index);
2119 break;
2120 #endif
2121 default:
2122 dc_index.dclass_id = 0;
2123 result = dc_text_index(&dc_index);
2124 }
2125 /* free split list */
2126 while (split_list) {
2127 split_p = split_list;
2128 split_list = split_list->next;
2129 free(split_p);
2130 }
2131 /* check result from document class */
2132 if (result == -1) {
2133 free(state->wcache);
2134 free(state->fcache);
2135 if (state->number_words) {
2136 free(state->wncache);
2137 }
2138 free(state->pcache_nl);
2139 if (use_docbuf) {
2140 if (docbuf->buf != NULL) {
2141 free(docbuf->buf);
2142 }
2143 free(docbuf);
2144 }
2145 close(state->doctable_fd);
2146 close(state->udict_fd);
2147 close(state->upost_fd);
2148 close(state->ufield_fd);
2149 close(fdef_fd);
2150 etymon_af_fdef_free_mem(state->fdef_root);
2151 free(state);
2152 return -1;
2153 }
2154
2155 if (use_docbuf) {
2156 /* close the document file */
2157 close(docbuf->filedes);
2158 }
2159
2160 } /* if (file_good) */
2161
2162 x_file++;
2163
2164 } while (done_files == 0);
2165
2166 if (etymon_index_dclass_finish(state) == -1) {
2167 free(state->wcache);
2168 free(state->fcache);
2169 if (state->number_words) {
2170 free(state->wncache);
2171 }
2172 free(state->pcache_nl);
2173 if (use_docbuf) {
2174 if (docbuf->buf != NULL) {
2175 free(docbuf->buf);
2176 }
2177 free(docbuf);
2178 }
2179 close(state->doctable_fd);
2180 close(state->udict_fd);
2181 close(state->upost_fd);
2182 close(state->ufield_fd);
2183 close(fdef_fd);
2184 etymon_af_fdef_free_mem(state->fdef_root);
2185 free(state);
2186 return -1;
2187 }
2188
2189 /* write out fdef file */
2190 /* re-open fdef and overwrite */
2191 close(fdef_fd);
2192 etymon_db_construct_path(ETYMON_DBF_FDEF, dbname, fn);
2193 fdef_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
2194 if (fdef_fd == -1) {
2195 /* ERROR */
2196 printf("unable to open %s for read/write\n", fn);
2197 exit(1);
2198 }
2199 etymon_af_fdef_write_mem(fdef_fd, state->fdef_root);
2200
2201 /* update dbinfo */
2202 /*
2203 if (etymon_af_lseek(dbinfo_fd, (etymon_af_off_t)0, SEEK_SET) == -1) {
2204 perror("index_add_files():lseek()");
2205 }
2206 magic = ETYMON_INDEX_MAGIC;
2207 nbytes = write(dbinfo_fd, &magic, sizeof(Uint4));
2208 if (nbytes != sizeof(Uint4)) {
2209 printf("unable to write MN\n");
2210 exit(1);
2211 }
2212 dbinfo.udict_root = state->udict_root;
2213 dbinfo.doc_n = state->doc_n;
2214 nbytes = write(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
2215 if (nbytes != sizeof(ETYMON_DB_INFO)) {
2216 printf("unable to write DBI\n");
2217 exit(1);
2218 }
2219 close(dbinfo_fd);
2220 */
2221 dbinfo->udict_root = state->udict_root;
2222 dbinfo->doc_n = state->doc_n;
2223
2224 /* clean up */
2225 free(state->wcache);
2226 free(state->fcache);
2227 if (state->number_words) {
2228 free(state->wncache);
2229 }
2230 free(state->pcache_nl);
2231 if (use_docbuf) {
2232 if (docbuf->buf != NULL) {
2233 free(docbuf->buf);
2234 }
2235 free(docbuf);
2236 }
2237 close(state->doctable_fd);
2238 close(state->udict_fd);
2239 close(state->upost_fd);
2240 close(state->ufield_fd);
2241 close(fdef_fd);
2242 etymon_af_fdef_free_mem(state->fdef_root);
2243 free(state);
2244
2245 /* unlock the database */
2246 etymon_db_unlock(dbname);
2247
2248 return 0;
2249 }
2250
2251
/*
 * Records one occurrence of opt->word, seen in document opt->doc_id, in
 * the in-memory indexing caches held by opt->state.
 *
 * Steps, in order:
 *   - when state->verbose >= 5, print the word being added;
 *   - when state->stemming is set, pass opt->word to af_stem();
 *   - if any cache in use is full, call etymon_index_dclass_index() and
 *     reset the cache counters and the word tree root;
 *   - look the word up in the binary tree stored in state->wcache and
 *     either create a new node or update the matching one, recording
 *     opt->fields (when opt->fields[0] != 0) in state->fcache and
 *     opt->word_number (when state->number_words is set) in
 *     state->wncache.
 *
 * Indices into the caches are used as links; -1 means "no link".
 *
 * Returns 0 on success, or -1 if etymon_index_dclass_index() returns -1.
 */
int etymon_af_index_add_word(ETYMON_AF_INDEX_ADD_WORD* opt) {
	ETYMON_INDEX_INDEXING_STATE* state = opt->state;
	int tree_p, comp, field_p;
	int* tree_link;
	int full;

	if (state->verbose >= 5) {
		afprintvp(state->verbose, 5);
		printf("Adding word: \"%s\"\n", (const char *) opt->word);
	}

	if (state->stemming) {
		af_stem(opt->word);
	}

	/* if any cache in use is full, index this block and clear the
	   caches; the word number cache is only considered when words
	   are being numbered */
	full = (state->wcache_count == state->wcache_size) ||
		(state->fcache_count == state->fcache_size);
	if (state->number_words) {
		full = full || (state->wncache_count == state->wncache_size);
	}
	if (full) {
		if (etymon_index_dclass_index(state) == -1) {
			return -1;
		}
		state->wcache_count = 0;
		state->wcache_root = -1;
		state->fcache_count = 0;
		state->wncache_count = 0;
	}

	/* add the new word to the cache */

	/* search the binary tree for a matching word
	   - tree_p will end up the index of the matching node, or -1 if
	     no match exists
	   - tree_link will end up pointing to the link that led to tree_p
	     (a parent's left/right field, or the root pointer) */

	tree_link = &(state->wcache_root);
	tree_p = state->wcache_root;
	comp = -1;
	while ( (comp != 0) && (tree_p != -1) ) {
		comp = strcmp((char*)(opt->word), (char*)(state->wcache[tree_p].word));
		if (comp < 0) {
			tree_link = &(state->wcache[tree_p].left);
			tree_p = *tree_link;
		}
		else if (comp > 0) {
			tree_link = &(state->wcache[tree_p].right);
			tree_p = *tree_link;
		}
	}

	if (tree_p == -1) {
		/* if there was no match, we create a new leaf node */
		memcpy(state->wcache[state->wcache_count].word, opt->word, ETYMON_MAX_WORD_SIZE);
		state->wcache[state->wcache_count].left = -1;
		state->wcache[state->wcache_count].right = -1;
		state->wcache[state->wcache_count].next = state->wcache_count; /* here next points to the tail */
		state->wcache[state->wcache_count].freq = 1;
		state->wcache[state->wcache_count].doc_id = opt->doc_id;
		/* add new node to field cache */
		if (opt->fields[0] != 0) {
			memcpy(state->fcache[state->fcache_count].f, opt->fields, ETYMON_MAX_FIELD_NEST * 2);
			state->fcache[state->fcache_count].next = -1;
			state->wcache[state->wcache_count].fields = state->fcache_count;
			state->fcache_count++;
		} else {
			state->wcache[state->wcache_count].fields = -1;
		}
		/* add new node to word number cache */
		if (state->number_words) {
			state->wncache[state->wncache_count].wn = opt->word_number;
			state->wncache[state->wncache_count].next = -1;
			state->wcache[state->wcache_count].word_numbers_head = state->wncache_count;
			state->wcache[state->wcache_count].word_numbers_tail = state->wncache_count;
			state->wncache_count++;
		} else {
			state->wcache[state->wcache_count].word_numbers_head = -1;
			state->wcache[state->wcache_count].word_numbers_tail = -1;
		}
		/* hook the new node into the tree; tree_link always points
		   at the root pointer or at a node's left/right field */
		*tree_link = state->wcache_count;
		state->wcache_count++;
	} else {
		/* there was a word match, so now we check if the doc_id's match */
		if (opt->doc_id == state->wcache[tree_p].doc_id) {
			/* doc_id's match, so we simply increment the frequency */
			state->wcache[tree_p].freq++;
			/* now add new node to field cache if there is no matching field */
			if (opt->fields[0] != 0) {
				/* walk the node's field list until an identical
				   field path is found or the list ends */
				field_p = state->wcache[tree_p].fields;
				while ( (field_p != -1) &&
					(memcmp(state->fcache[field_p].f, opt->fields,
						ETYMON_MAX_FIELD_NEST * 2) != 0) ) {
					field_p = state->fcache[field_p].next;
				}
				if (field_p == -1) {
					/* no match, so push a new field node on the
					   front of the list */
					memcpy(state->fcache[state->fcache_count].f, opt->fields,
					       ETYMON_MAX_FIELD_NEST * 2);
					state->fcache[state->fcache_count].next = state->wcache[tree_p].fields;
					state->wcache[tree_p].fields = state->fcache_count;
					state->fcache_count++;
				}
			}
			/* now add new node to word number cache at end of list;
			   the stored tail index makes this a direct append */
			if (state->number_words) {
				state->wncache[state->wncache_count].wn = opt->word_number;
				state->wncache[state->wcache[tree_p].word_numbers_tail].next = state->wncache_count;
				state->wncache[state->wncache_count].next = -1;
				state->wcache[tree_p].word_numbers_tail = state->wncache_count;
				state->wncache_count++;
			}
		} else {
			/* doc_id's don't match, so we create a new node in the
			   binary tree; it takes over the matched node's
			   left/right links and the incoming link is redirected
			   to it, so it stands in the matched node's place */
			memcpy(state->wcache[state->wcache_count].word, opt->word, ETYMON_MAX_WORD_SIZE);
			state->wcache[state->wcache_count].left = state->wcache[tree_p].left;
			state->wcache[state->wcache_count].right = state->wcache[tree_p].right;
			state->wcache[state->wcache_count].next = state->wcache[tree_p].next; /* tail */
			state->wcache[tree_p].next = state->wcache_count; /* to point back to the new node */
			state->wcache[state->wcache_count].freq = 1;
			state->wcache[state->wcache_count].doc_id = opt->doc_id;
			/* add new node to field cache */
			if (opt->fields[0] != 0) {
				memcpy(state->fcache[state->fcache_count].f, opt->fields, ETYMON_MAX_FIELD_NEST * 2);
				state->fcache[state->fcache_count].next = -1;
				state->wcache[state->wcache_count].fields = state->fcache_count;
				state->fcache_count++;
			} else {
				state->wcache[state->wcache_count].fields = -1;
			}
			/* add new node to word number cache */
			if (state->number_words) {
				state->wncache[state->wncache_count].wn = opt->word_number;
				state->wncache[state->wncache_count].next = -1;
				state->wcache[state->wcache_count].word_numbers_head = state->wncache_count;
				state->wcache[state->wcache_count].word_numbers_tail = state->wncache_count;
				state->wncache_count++;
			} else {
				/* both ends of the (empty) word number list must
				   be cleared; the tail was previously left
				   uninitialised here */
				state->wcache[state->wcache_count].word_numbers_head = -1;
				state->wcache[state->wcache_count].word_numbers_tail = -1;
			}
			/* update parent node in binary tree */
			*tree_link = state->wcache_count;
			state->wcache_count++;
		}
	}

	return 0;
}
2427
2428
etymon_af_index_add_doc(ETYMON_AF_INDEX_ADD_DOC * opt)2429 Uint4 etymon_af_index_add_doc(ETYMON_AF_INDEX_ADD_DOC* opt) {
2430 ETYMON_INDEX_INDEXING_STATE* state = opt->state;
2431 ssize_t nbytes;
2432
2433 /* fill in doctable entry with new data */
2434 if (opt->key == NULL) {
2435 /* fill in default key, based on doctable id */
2436 /*
2437 snprintf((char*)(state->doctable.key), ETYMON_MAX_KEY_SIZE, "%ld",
2438 (unsigned long)(state->doctable_next_id));
2439 */
2440 state->doctable.key[0] = '\0';
2441 } else {
2442 strncpy((char*)(state->doctable.key), (char*)(opt->key), ETYMON_MAX_KEY_SIZE - 1);
2443 state->doctable.key[ETYMON_MAX_KEY_SIZE - 1] = '\0';
2444 }
2445 strncpy(state->doctable.filename, opt->filename, ETYMON_MAX_PATH_SIZE - 1);
2446 state->doctable.filename[ETYMON_MAX_PATH_SIZE - 1] = '\0';
2447 state->doctable.begin = opt->begin;
2448 state->doctable.end = opt->end;
2449 state->doctable.parent = opt->parent;
2450 state->doctable.dclass_id = opt->dclass_id;
2451 state->doctable.deleted = 0;
2452 /* write out doctable entry */
2453 nbytes = write(state->doctable_fd, &(state->doctable), sizeof(ETYMON_DOCTABLE));
2454 if (nbytes != sizeof(ETYMON_DOCTABLE)) {
2455 /* ERROR */
2456 printf("error writing to file in etymon_index_dclass_add_doc\n");
2457 exit(1);
2458 }
2459 /* increment count of total number of (non-deleted) documents in database */
2460 state->doc_n++;
2461 /* increment next counter */
2462 return state->doctable_next_id++;
2463 }
2464
2465
2466 /* need to change this function prototype to conform to other document
2467 class call-backs */
etymon_index_dclass_get_next_doc_id(ETYMON_INDEX_INDEXING_STATE * state)2468 Uint4 etymon_index_dclass_get_next_doc_id(ETYMON_INDEX_INDEXING_STATE* state) {
2469 return state->doctable_next_id;
2470 }
2471