1 /*
2  *  Copyright (C) 1999-2004 Etymon Systems, Inc.
3  *
4  *  Authors:  Nassib Nassar
5  */
6 
7 #include <stdlib.h>
8 #include <stdio.h>
9 #include "index.h"
10 #include "lock.h"
11 #include "util.h"
12 #include "fdef.h"
13 #include "stem.h"
14 #include "info.h"
15 #include "linear.h"
16 
17 #include "text.h"
18 #include "xml.h"
19 #include "xml_test.h"
20 #include "erc.h"
21 
22 #include "open.h"
23 extern ETYMON_AF_STATE *etymon_af_state[];
24 
25 /* assumes that the buffer, absolute_path, is of size
26    ETYMON_MAX_PATH_SIZE, that relative_path contains a valid
27    null-terminated string, that cwd has been filled in using getcwd(),
28    and that relative_path != absolute_path */
etymon_index_expand_path(char * relative_path,char * absolute_path,char * cwd)29 void etymon_index_expand_path(char* relative_path, char* absolute_path, char* cwd) {
30 	/* check if it's already an absolute path */
31 	if (*relative_path == '/') {
32 		strncpy(absolute_path, relative_path, ETYMON_MAX_PATH_SIZE - 1);
33 		absolute_path[ETYMON_MAX_PATH_SIZE - 1] = '\0';
34 		return;
35 	} else {
36 		char* r_p;
37 		char* slash_p;
38 		char* p;
39 		int x, y, seg_len;
40 
41 		/* otherwise combine cwd and relative_path to get the absolute path */
42 
43 		/* start off with the cwd */
44 		strcpy(absolute_path, cwd);
45 
46 		/* now take apart relative_path */
47 		r_p = relative_path;
48 		while ( (slash_p = strchr(r_p, '/')) != NULL ) {
49 			/* now r_p points to the beginning and slash_p to the next '/' */
50 			seg_len = slash_p - r_p;
51 			/* check if it's "../" */
52 			if ( (seg_len == 2) && (strncmp(r_p, "..", 2) == 0) ) {
53 				/* remove the last segment from absolute_path */
54 				p = strrchr(absolute_path, '/');
55 				if (p == NULL) {
56 					/* ERROR and return */
57 				}
58 				*p = '\0';
59 			}
60 			/* check if it's "./" */
61 			else if ( (seg_len == 1) && (*r_p == '.') ) {
62 				/* do nothing */
63 			}
64 			else {
65 				/* append the segment to the end of absolute_path */
66 				x = strlen(absolute_path);
67 				if ((ETYMON_MAX_PATH_SIZE - x) >= (seg_len + 2)) {
68 					absolute_path[x++] = '/';
69 					memcpy(absolute_path + x, r_p, seg_len);
70 					absolute_path[x + seg_len] = '\0';
71 				}
72 			}
73 			r_p = slash_p + 1;
74 		}
75 		x = strlen(absolute_path);
76 		y = strlen(r_p);
77 		if ((ETYMON_MAX_PATH_SIZE - x) >= (y + 2)) {
78 			absolute_path[x] = '/';
79 			memcpy(absolute_path + x + 1, r_p, y + 1);
80 		}
81 	}
82 }
83 
84 #ifdef ZZZZZ
85 
86 /*#define OPT_STDIO*/
87 
88 #ifdef OPT_STDIO
89 
/* this was written before the advent of ETYMON_INDEX_PAGE_L.post_n[],
   ETYMON_INDEX_UPOST.fields_n, and ETYMON_INDEX_UPOST.word_numbers_n
   in the first unoptimized pass; so it explicitly counts these values
   while building the optimized structures */
/* Linearizes the database named by opt->dbname, using buffered stdio
   streams layered (via fdopen()) on the opened file descriptors.

   The function walks every leaf page of the udict file.  For each key
   it follows the linked list of upost records (and, per upost, the
   linked lists of ufield and uword records) and appends equivalent
   lpost / lfield / lword records to the corresponding "l" files, so
   that the records belonging to one key become consecutive.  upost
   nodes with the same doc_id as the previous node are merged into a
   single lpost.  Each leaf page is rewritten with post[] pointing at
   the first lpost of the key and post_n[] holding the lpost count.
   Finally dbinfo.optimized is set to 1, dbinfo is written back, and
   the upost/ufield/uword files are truncated.

   Record "pointers" are 1-based record numbers; 0 terminates a list.

   Returns 0 on success, or -1 after reporting through opt->log.error()
   (database not ready, cannot be opened, wrong magic number, already
   linearized).  Several other failures call exit() directly.

   NOTE(review): this code is only compiled when both ZZZZZ and
   OPT_STDIO are defined. */
int etymon_index_optimize_old_stdio(ETYMON_INDEX_OPTIONS* opt) {
	int dbinfo_fd, udict_fd, upost_fd, ufield_fd, uword_fd, lpost_fd, lfield_fd, lword_fd;
	FILE* dbinfo_f;
	FILE* udict_f;
	FILE* upost_f;
	FILE* ufield_f;
	FILE* uword_f;
	FILE* lpost_f;
	FILE* lfield_f;
	FILE* lword_f;
	int x;
	/* the *_isize values are record counts (file size / record size);
	   udict_size is a byte count.  Only the lpost/lfield/lword counts
	   are used after being computed. */
	etymon_af_off_t udict_size, upost_isize, ufield_isize, uword_isize, lpost_isize, lfield_isize, lword_isize;
	Uint4 magic;
	ETYMON_DB_INFO dbinfo;
	char fn[ETYMON_MAX_PATH_SIZE];
	ETYMON_AF_STAT st;
	ssize_t nbytes;
	Uint4 udict_p, upost_p, lpost_p_save;
	Uint4 ufield_p, lfield_p_save, field_count;
	Uint4 uword_p, lword_p_save, word_count;
	ETYMON_INDEX_PAGE_L page_l;
	ETYMON_INDEX_PAGE_NL page_nl;
	Uint1 leaf_flag;
	ETYMON_INDEX_UPOST upost;
	ETYMON_INDEX_LPOST lpost;
	ETYMON_INDEX_UFIELD ufield;
	ETYMON_INDEX_LFIELD lfield;
	ETYMON_INDEX_UWORD uword;
	ETYMON_INDEX_LWORD lword;

	/* make sure database is ready */
	if (etymon_db_ready(opt->dbname, &(opt->log)) == 0) {
		int e;
		char s[ETYMON_MAX_MSG_SIZE];
		/* NOTE(review): unbounded sprintf; presumably dbname is short
		   enough for ETYMON_MAX_MSG_SIZE -- confirm */
		sprintf(s, "%s: Database not ready", opt->dbname);
		e = opt->log.error(s, 1);
		if (e != 0) {
			exit(e);
		}
		return -1;
	}

	/* lock the database */
	etymon_db_lock(opt->dbname, &(opt->log));

	/* open db info file for read/write */
	etymon_db_construct_path(ETYMON_DBF_INFO, opt->dbname, fn);
	dbinfo_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE);
	if (dbinfo_fd == -1) {
		int e;
		char s[ETYMON_MAX_MSG_SIZE];
		sprintf(s, "%s: Unable to open database", opt->dbname);
		e = opt->log.error(s, 1);
		etymon_db_unlock(opt->dbname, &(opt->log));
		if (e != 0) {
			exit(e);
		}
		return -1;
	}
	/* the info file starts with a magic number followed by the
	   ETYMON_DB_INFO structure */
	nbytes = read(dbinfo_fd, &magic, sizeof(Uint4));
	if (nbytes != sizeof(Uint4)) {
		/* ERROR */
		/* NOTE(review): this and the other exit(1) paths below leave
		   the database locked and the descriptors open */
		printf("unable to read %s\n", fn);
		exit(1);
	}
	if (magic != ETYMON_INDEX_MAGIC) {
		int e;
		char s[ETYMON_MAX_MSG_SIZE];
		sprintf(s, "%s: Database created by incompatible version", opt->dbname);
		e = opt->log.error(s, 1);
		close(dbinfo_fd);
		etymon_db_unlock(opt->dbname, &(opt->log));
		if (e != 0) {
			exit(e);
		}
		return -1;
	}
	nbytes = read(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
	if (nbytes != sizeof(ETYMON_DB_INFO)) {
		/* ERROR */
		printf("unable to read %s\n", fn);
		exit(1);
	}
	/* NOTE(review): none of the fdopen() results in this function are
	   checked for NULL */
	dbinfo_f = fdopen(dbinfo_fd, "r+b");

	/* make sure the database is not already optimized */
	if (dbinfo.optimized == 1) {
		int e;
		char s[ETYMON_MAX_MSG_SIZE];
		sprintf(s, "%s: Database is already linearized", opt->dbname);
		e = opt->log.error(s, 1);
		/* NOTE(review): fclose() on an fdopen()ed stream also closes
		   the underlying descriptor, so the close() that follows
		   operates on an already-closed descriptor -- confirm and
		   drop one of the two */
		fclose(dbinfo_f);
		close(dbinfo_fd);
		etymon_db_unlock(opt->dbname, &(opt->log)); /* unlock the database */
		if (e != 0) {
			exit(e);
		}
		return -1;
	}

	/* open files */

	/* open udict for read/write */
	etymon_db_construct_path(ETYMON_DBF_UDICT, opt->dbname, fn);
	udict_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (udict_fd == -1) {
		/* ERROR */
		printf("unable to open %s for read/write\n", fn);
		exit(1);
	}
	/* stat udict to get size */
	/* NOTE(review): here and below, a failed fstat is only reported;
	   st.st_size is still read afterwards */
	if (etymon_af_fstat(udict_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	udict_size = st.st_size;
	udict_f = fdopen(udict_fd, "r+b");

	/* open upost for read */
	etymon_db_construct_path(ETYMON_DBF_UPOST, opt->dbname, fn);
	upost_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (upost_fd == -1) {
		/* ERROR */
		printf("unable to open %s for read\n", fn);
		exit(1);
	}
	/* stat upost to get size */
	if (etymon_af_fstat(upost_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	upost_isize = st.st_size / sizeof(ETYMON_INDEX_UPOST);
	upost_f = fdopen(upost_fd, "rb");

	/* open ufield for read */
	etymon_db_construct_path(ETYMON_DBF_UFIELD, opt->dbname, fn);
	ufield_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (ufield_fd == -1) {
		/* ERROR */
		printf("unable to open %s for read\n", fn);
		exit(1);
	}
	/* stat ufield to get size */
	if (etymon_af_fstat(ufield_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	ufield_isize = st.st_size / sizeof(ETYMON_INDEX_UFIELD);
	ufield_f = fdopen(ufield_fd, "rb");

	/* open uword for read */
	etymon_db_construct_path(ETYMON_DBF_UWORD, opt->dbname, fn);
	uword_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (uword_fd == -1) {
		/* ERROR */
		printf("unable to open %s for read\n", fn);
		exit(1);
	}
	/* stat uword to get size */
	if (etymon_af_fstat(uword_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	uword_isize = st.st_size / sizeof(ETYMON_INDEX_UWORD);
	uword_f = fdopen(uword_fd, "rb");

	/* open lpost for append */
	etymon_db_construct_path(ETYMON_DBF_LPOST, opt->dbname, fn);
	lpost_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (lpost_fd == -1) {
		/* ERROR */
		printf("unable to open %s for append\n", fn);
		exit(1);
	}
	/* stat lpost to get size */
	if (etymon_af_fstat(lpost_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	/* number of lpost records already in the file; new records are
	   numbered from lpost_isize + 1 */
	lpost_isize = st.st_size / sizeof(ETYMON_INDEX_LPOST);
	lpost_f = fdopen(lpost_fd, "ab");

	/* open lfield for append */
	etymon_db_construct_path(ETYMON_DBF_LFIELD, opt->dbname, fn);
	lfield_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (lfield_fd == -1) {
		/* ERROR */
		printf("unable to open %s for append\n", fn);
		exit(1);
	}
	/* stat lfield to get size */
	if (etymon_af_fstat(lfield_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	lfield_isize = st.st_size / sizeof(ETYMON_INDEX_LFIELD);
	lfield_f = fdopen(lfield_fd, "ab");

	/* open lword for append */
	etymon_db_construct_path(ETYMON_DBF_LWORD, opt->dbname, fn);
	lword_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (lword_fd == -1) {
		/* ERROR */
		printf("unable to open %s for append\n", fn);
		exit(1);
	}
	/* stat lword to get size */
	if (etymon_af_fstat(lword_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	lword_isize = st.st_size / sizeof(ETYMON_INDEX_LWORD);
	lword_f = fdopen(lword_fd, "ab");

	/* optimize! */

	/* the verbosity check is commented out, so this message is always
	   printed */
/*	if (opt->verbose >= 2) {*/
		printf("Linearizing (new)\n");
/*	}*/

	/* first descend to the left-most leaf page */
	/* each page in udict is stored as a one-byte leaf flag followed by
	   the page structure; a flag of 0 marks a non-leaf page, whose
	   first child pointer p[0] is followed */
	udict_p = dbinfo.udict_root;
	do {
		if (fseeko(udict_f, (etymon_af_off_t)udict_p, SEEK_SET) == -1) {
			perror("index_optimize():fseeko()");
		}
		if (fread(&(leaf_flag), 1, 1, udict_f) < 1) {
			perror("index_optimize():fread()");
		}
		if (leaf_flag == 0) {
			if (fread(&page_nl, 1, sizeof(ETYMON_INDEX_PAGE_NL), udict_f) < sizeof(ETYMON_INDEX_PAGE_NL)) {
				perror("index_optimize():fread()");
			}
			udict_p = page_nl.p[0];
		}
	} while (leaf_flag == 0);

	/* now go through all leaf pages */
	/* leaf pages are chained through page_l.next; 0 ends the chain */

	do {

		/* udict_p + 1 skips the leaf flag byte */
		if (fseeko(udict_f, ((etymon_af_off_t)(udict_p + 1)), SEEK_SET) == -1) {
			perror("index_optimize():fseeko()");
		}
		if (fread(&page_l, 1, sizeof(ETYMON_INDEX_PAGE_L), udict_f) < sizeof(ETYMON_INDEX_PAGE_L)) {
			perror("index_optimize():fread()");
		}

		/* examine each key and optimize associated posting, field data, and word number data */

		for (x = 0; x < page_l.n; x++) {

			page_l.post_n[x] = 0;

			/* run through postings, assume matching doc_id's are consecutive */
			/* remember the record number the first lpost of this key
			   will receive */
			lpost_p_save = lpost_isize + 1;
			upost_p = page_l.post[x];
			/* doc_id 0 marks the cached lpost as empty */
			lpost.doc_id = 0;
			while (upost_p != 0) {
				/* read a upost node */
				if (fseeko(upost_f,
					  (etymon_af_off_t)( ((etymon_af_off_t)(upost_p - 1)) * ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UPOST))) ),
					  SEEK_SET) == -1) {
					perror("index_optimize():fseeko()");
				}
				if (fread(&upost, 1, sizeof(ETYMON_INDEX_UPOST), upost_f) < sizeof(ETYMON_INDEX_UPOST)) {
					perror("index_optimize():fread()");
				}

				/* optimize fields */
				/* DO WE NEED TO LOOK FOR DUPLICATES? */
				/* copy this posting's ufield list to the end of lfield,
				   counting the records as we go */
				lfield_p_save = lfield_isize + 1;
				field_count = 0;
				ufield_p = upost.fields;
				while (ufield_p != 0) {
					field_count++;
					if (fseeko(ufield_f,
						  (etymon_af_off_t)( ((etymon_af_off_t)(ufield_p - 1)) *
							   ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UFIELD))) ),
						  SEEK_SET) == -1) {
						perror("index_optimize():fseeko()");
					}
					if (fread(&ufield, 1, sizeof(ETYMON_INDEX_UFIELD), ufield_f) <
					    sizeof(ETYMON_INDEX_UFIELD)) {
						perror("index_optimize():fread()");
					}
					/* NOTE(review): the "* 2" presumably reflects
					   two-byte field entries -- confirm against the
					   structure definitions */
					memcpy(lfield.fields, ufield.fields, ETYMON_MAX_FIELD_NEST * 2);
					if (fwrite(&lfield, 1, sizeof(ETYMON_INDEX_LFIELD), lfield_f) <
					    sizeof(ETYMON_INDEX_LFIELD)) {
						perror("index_optimize():fwrite()");
					}
					lfield_isize++;
					ufield_p = ufield.next;
				}

				/* optimize word numbers */
				/* same procedure for this posting's uword list */
				lword_p_save = lword_isize + 1;
				word_count = 0;
				uword_p = upost.word_numbers;
				while (uword_p != 0) {
					word_count++;
					if (fseeko(uword_f,
						  (etymon_af_off_t)( ((etymon_af_off_t)(uword_p - 1)) *
							   ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UWORD))) ),
						  SEEK_SET) == -1) {
						perror("index_optimize():fseeko()");
					}
					if (fread(&uword, 1, sizeof(ETYMON_INDEX_UWORD), uword_f) <
					    sizeof(ETYMON_INDEX_UWORD)) {
						perror("index_optimize():fread()");
					}
					lword.wn = uword.wn;
					if (fwrite(&lword, 1, sizeof(ETYMON_INDEX_LWORD), lword_f) <
					    sizeof(ETYMON_INDEX_LWORD)) {
						perror("index_optimize():fwrite()");
					}
					lword_isize++;
					uword_p = uword.next;
				}

				/* compare the doc_id with our cached lpost */
				if (upost.doc_id == lpost.doc_id) {
					/* increment the frequency and field count */
					/* the fields / word_numbers pointers set for the
					   first node of this doc_id are kept; the records
					   just appended follow them directly */
					lpost.freq += upost.freq;
					lpost.fields_n += field_count;
					lpost.word_numbers_n += word_count;
				} else {
					/* flush lpost */
					if (lpost.doc_id != 0) { /* only flush if lpost contains something */
						if (fwrite(&lpost, 1, sizeof(ETYMON_INDEX_LPOST), lpost_f) <
						    sizeof(ETYMON_INDEX_LPOST)) {
							perror("index_optimize():fwrite()");
						}
						lpost_isize++;
						page_l.post_n[x]++;
					}
					/* replace lpost with upost */
					lpost.doc_id = upost.doc_id;
					lpost.freq = upost.freq;
					lpost.fields_n = field_count;
					lpost.word_numbers_n = word_count;
					/* set field pointer */
					lpost.fields = lfield_p_save;
					lpost.word_numbers = lword_p_save;
				}
				upost_p = upost.next;
			} /* while */
			/* flush lpost */
			if (lpost.doc_id != 0) { /* only flush if lpost contains something */
				if (fwrite(&lpost, 1, sizeof(ETYMON_INDEX_LPOST), lpost_f) < sizeof(ETYMON_INDEX_LPOST)) {
					perror("index_optimize():fwrite()");
				}
				lpost_isize++;
				page_l.post_n[x]++;
			}
			/* the key now points at its first lpost record */
			page_l.post[x] = lpost_p_save;

		} /* for */

		/* write out updated leaf page */
		if (fseeko(udict_f, ((etymon_af_off_t)(udict_p + 1)), SEEK_SET) == -1) {
			perror("index_optimize():fseeko()");
		}
		if (fwrite(&page_l, 1, sizeof(ETYMON_INDEX_PAGE_L), udict_f) < sizeof(ETYMON_INDEX_PAGE_L)) {
			perror("index_optimize():fwrite()");
		}

		udict_p = page_l.next;

	} while (udict_p != 0);

	/* update dbinfo */
	/* rewrite the info file from the start: magic number, then dbinfo
	   with the optimized flag set */
	if (fseeko(dbinfo_f, (etymon_af_off_t)0, SEEK_SET) == -1) {
		perror("index_optimize():fseeko()");
	}
	magic = ETYMON_INDEX_MAGIC;
	nbytes = fwrite(&magic, 1, sizeof(Uint4), dbinfo_f);
	if (nbytes != sizeof(Uint4)) {
		/* ERROR */
		printf("unable to write MN\n");
		exit(1);
	}
	dbinfo.optimized = 1;
	nbytes = fwrite(&dbinfo, 1, sizeof(ETYMON_DB_INFO), dbinfo_f);
	if (nbytes != sizeof(ETYMON_DB_INFO)) {
		/* ERROR */
		printf("unable to write DBI\n");
		exit(1);
	}

	/* clean up */
	/* NOTE(review): the fclose() results are not checked, so a failed
	   flush of buffered output would go unnoticed; the close() calls
	   that follow act on descriptors the fclose() calls have
	   presumably already closed -- confirm */
	fclose(dbinfo_f);
	fclose(udict_f);
	fclose(upost_f);
	fclose(ufield_f);
	fclose(uword_f);
	fclose(lpost_f);
	fclose(lfield_f);
	fclose(lword_f);
	close(dbinfo_fd);
	close(udict_fd);
	close(upost_fd);
	close(ufield_fd);
	close(uword_fd);
	close(lpost_fd);
	close(lfield_fd);
	close(lword_fd);

	/* reopen and truncate upost */
	/* NOTE(review): the open() results below are passed to close()
	   without being checked */
	etymon_db_construct_path(ETYMON_DBF_UPOST, opt->dbname, fn);
	upost_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	close(upost_fd);
	/* reopen and truncate ufield */
	etymon_db_construct_path(ETYMON_DBF_UFIELD, opt->dbname, fn);
	ufield_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	close(ufield_fd);
	/* reopen and truncate uword */
	etymon_db_construct_path(ETYMON_DBF_UWORD, opt->dbname, fn);
	uword_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	close(uword_fd);

	/* unlock the database */
	etymon_db_unlock(opt->dbname, &(opt->log));

	return 0;

} /* optimize_new() */
514 
515 #endif
516 
/* this was written before the advent of ETYMON_INDEX_PAGE_L.post_n[],
   ETYMON_INDEX_UPOST.fields_n, and ETYMON_INDEX_UPOST.word_numbers_n
   in the first unoptimized pass; so it explicitly counts these values
   while building the optimized structures */
/* Linearizes the database named by opt->dbname, using unbuffered
   descriptor I/O (etymon_af_lseek / read / write).

   The function walks every leaf page of the udict file.  For each key
   it follows the linked list of upost records (and, per upost, the
   linked lists of ufield and uword records) and appends equivalent
   lpost / lfield / lword records to the corresponding "l" files, so
   that the records belonging to one key become consecutive.  upost
   nodes with the same doc_id as the previous node are merged into a
   single lpost.  Each leaf page is rewritten with post[] pointing at
   the first lpost of the key and post_n[] holding the lpost count.
   Finally dbinfo.optimized is set to 1, dbinfo is written back, and
   the upost/ufield/uword files are truncated.

   Record "pointers" are 1-based record numbers; 0 terminates a list.

   Returns 0 on success, or -1 after reporting through opt->log.error()
   (database not ready, cannot be opened, wrong magic number, already
   linearized).  Several other failures call exit() directly.

   NOTE(review): this code is only compiled when ZZZZZ is defined. */
int etymon_index_optimize_old(ETYMON_INDEX_OPTIONS* opt) {
	int dbinfo_fd, udict_fd, upost_fd, ufield_fd, uword_fd, lpost_fd, lfield_fd, lword_fd;
	int x;
	/* the *_isize values are record counts (file size / record size);
	   udict_size is a byte count.  Only the lpost/lfield/lword counts
	   are used after being computed. */
	etymon_af_off_t udict_size, upost_isize, ufield_isize, uword_isize, lpost_isize, lfield_isize, lword_isize;
	Uint4 magic;
	ETYMON_DB_INFO dbinfo;
	char fn[ETYMON_MAX_PATH_SIZE];
	ETYMON_AF_STAT st;
	ssize_t nbytes;
	Uint4 udict_p, upost_p, lpost_p_save;
	Uint4 ufield_p, lfield_p_save, field_count;
	Uint4 uword_p, lword_p_save, word_count;
	ETYMON_INDEX_PAGE_L page_l;
	ETYMON_INDEX_PAGE_NL page_nl;
	Uint1 leaf_flag;
	ETYMON_INDEX_UPOST upost;
	ETYMON_INDEX_LPOST lpost;
	ETYMON_INDEX_UFIELD ufield;
	ETYMON_INDEX_LFIELD lfield;
	ETYMON_INDEX_UWORD uword;
	ETYMON_INDEX_LWORD lword;

	/* make sure database is ready */
	/* NOTE(review): etymon_db_ready() and etymon_db_unlock() are called
	   with one argument in this function while etymon_db_lock() is
	   passed &(opt->log) as well -- confirm against the current
	   prototypes */
	if (etymon_db_ready(opt->dbname) == 0) {
		int e;
		char s[ETYMON_MAX_MSG_SIZE];
		/* NOTE(review): unbounded sprintf; presumably dbname is short
		   enough for ETYMON_MAX_MSG_SIZE -- confirm */
		sprintf(s, "%s: Database not ready", opt->dbname);
		e = opt->log.error(s, 1);
		if (e != 0) {
			exit(e);
		}
		return -1;
	}

	/* lock the database */
	etymon_db_lock(opt->dbname, &(opt->log));

	/* open db info file for read/write */
	etymon_db_construct_path(ETYMON_DBF_INFO, opt->dbname, fn);
	dbinfo_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE);
	if (dbinfo_fd == -1) {
		int e;
		char s[ETYMON_MAX_MSG_SIZE];
		sprintf(s, "%s: Unable to open database", opt->dbname);
		e = opt->log.error(s, 1);
		etymon_db_unlock(opt->dbname);
		if (e != 0) {
			exit(e);
		}
		return -1;
	}
	/* the info file starts with a magic number followed by the
	   ETYMON_DB_INFO structure */
	nbytes = read(dbinfo_fd, &magic, sizeof(Uint4));
	if (nbytes != sizeof(Uint4)) {
		/* ERROR */
		/* NOTE(review): this and the other exit(1) paths below leave
		   the database locked and the descriptors open */
		printf("unable to read %s\n", fn);
		exit(1);
	}
	if (magic != ETYMON_INDEX_MAGIC) {
		int e;
		char s[ETYMON_MAX_MSG_SIZE];
		sprintf(s, "%s: Database created by incompatible version", opt->dbname);
		e = opt->log.error(s, 1);
		close(dbinfo_fd);
		etymon_db_unlock(opt->dbname);
		if (e != 0) {
			exit(e);
		}
		return -1;
	}
	nbytes = read(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
	if (nbytes != sizeof(ETYMON_DB_INFO)) {
		/* ERROR */
		printf("unable to read %s\n", fn);
		exit(1);
	}

	/* make sure the database is not already optimized */
	if (dbinfo.optimized == 1) {
		int e;
		char s[ETYMON_MAX_MSG_SIZE];
		sprintf(s, "%s: Database is already linearized", opt->dbname);
		e = opt->log.error(s, 1);
		close(dbinfo_fd);
		etymon_db_unlock(opt->dbname); /* unlock the database */
		if (e != 0) {
			exit(e);
		}
		return -1;
	}

	/* open files */

	/* open udict for read/write */
	etymon_db_construct_path(ETYMON_DBF_UDICT, opt->dbname, fn);
	udict_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (udict_fd == -1) {
		/* ERROR */
		printf("unable to open %s for read/write\n", fn);
		exit(1);
	}
	/* stat udict to get size */
	/* NOTE(review): here and below, a failed fstat is only reported;
	   st.st_size is still read afterwards */
	if (etymon_af_fstat(udict_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	udict_size = st.st_size;

	/* open upost for read */
	etymon_db_construct_path(ETYMON_DBF_UPOST, opt->dbname, fn);
	upost_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (upost_fd == -1) {
		/* ERROR */
		printf("unable to open %s for read\n", fn);
		exit(1);
	}
	/* stat upost to get size */
	if (etymon_af_fstat(upost_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	upost_isize = st.st_size / sizeof(ETYMON_INDEX_UPOST);

	/* open ufield for read */
	etymon_db_construct_path(ETYMON_DBF_UFIELD, opt->dbname, fn);
	ufield_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (ufield_fd == -1) {
		/* ERROR */
		printf("unable to open %s for read\n", fn);
		exit(1);
	}
	/* stat ufield to get size */
	if (etymon_af_fstat(ufield_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	ufield_isize = st.st_size / sizeof(ETYMON_INDEX_UFIELD);

	/* open uword for read */
	etymon_db_construct_path(ETYMON_DBF_UWORD, opt->dbname, fn);
	uword_fd = open(fn, O_RDONLY | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (uword_fd == -1) {
		/* ERROR */
		printf("unable to open %s for read\n", fn);
		exit(1);
	}
	/* stat uword to get size */
	if (etymon_af_fstat(uword_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	uword_isize = st.st_size / sizeof(ETYMON_INDEX_UWORD);

	/* open lpost for append */
	etymon_db_construct_path(ETYMON_DBF_LPOST, opt->dbname, fn);
	lpost_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (lpost_fd == -1) {
		/* ERROR */
		printf("unable to open %s for append\n", fn);
		exit(1);
	}
	/* stat lpost to get size */
	if (etymon_af_fstat(lpost_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	/* number of lpost records already in the file; new records are
	   numbered from lpost_isize + 1 */
	lpost_isize = st.st_size / sizeof(ETYMON_INDEX_LPOST);

	/* open lfield for append */
	etymon_db_construct_path(ETYMON_DBF_LFIELD, opt->dbname, fn);
	lfield_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (lfield_fd == -1) {
		/* ERROR */
		printf("unable to open %s for append\n", fn);
		exit(1);
	}
	/* stat lfield to get size */
	if (etymon_af_fstat(lfield_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	lfield_isize = st.st_size / sizeof(ETYMON_INDEX_LFIELD);

	/* open lword for append */
	etymon_db_construct_path(ETYMON_DBF_LWORD, opt->dbname, fn);
	lword_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	if (lword_fd == -1) {
		/* ERROR */
		printf("unable to open %s for append\n", fn);
		exit(1);
	}
	/* stat lword to get size */
	if (etymon_af_fstat(lword_fd, &st) == -1) {
		perror("index_optimize():fstat()");
	}
	lword_isize = st.st_size / sizeof(ETYMON_INDEX_LWORD);

	/* optimize! */

	if (opt->verbose >= 2) {
		printf("Linearizing (old)\n");
	}

	/* first descend to the left-most leaf page */
	/* each page in udict is stored as a one-byte leaf flag followed by
	   the page structure; a flag of 0 marks a non-leaf page, whose
	   first child pointer p[0] is followed */
	/* NOTE(review): every read()/write() in the loops below is checked
	   only against -1, so short reads and short writes go undetected */
	udict_p = dbinfo.udict_root;
	do {
		if (etymon_af_lseek(udict_fd, (etymon_af_off_t)udict_p, SEEK_SET) == -1) {
			perror("index_optimize():lseek()");
		}
		if (read(udict_fd, &(leaf_flag), 1) == -1) {
			perror("index_optimize():read()");
		}
		if (leaf_flag == 0) {
			if (read(udict_fd, &page_nl, sizeof(ETYMON_INDEX_PAGE_NL)) == -1) {
				perror("index_optimize():read()");
			}
			udict_p = page_nl.p[0];
		}
	} while (leaf_flag == 0);

	/* now go through all leaf pages */
	/* leaf pages are chained through page_l.next; 0 ends the chain */

	do {

		/* udict_p + 1 skips the leaf flag byte */
		if (etymon_af_lseek(udict_fd, ((etymon_af_off_t)(udict_p + 1)), SEEK_SET) == -1) {
			perror("index_optimize():lseek()");
		}
		if (read(udict_fd, &page_l, sizeof(ETYMON_INDEX_PAGE_L)) == -1) {
			perror("index_optimize():read()");
		}

		/* examine each key and optimize associated posting, field data, and word number data */

		for (x = 0; x < page_l.n; x++) {

			page_l.post_n[x] = 0;

			/* run through postings, assume matching doc_id's are consecutive */
			/* remember the record number the first lpost of this key
			   will receive */
			lpost_p_save = lpost_isize + 1;
			upost_p = page_l.post[x];
			/* doc_id 0 marks the cached lpost as empty */
			lpost.doc_id = 0;
			while (upost_p != 0) {
				/* read a upost node */
				if (etymon_af_lseek(upost_fd,
					  (etymon_af_off_t)( ((etymon_af_off_t)(upost_p - 1)) * ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UPOST))) ),
					  SEEK_SET) == -1) {
					perror("index_optimize():lseek()");
				}
				if (read(upost_fd, &upost, sizeof(ETYMON_INDEX_UPOST)) == -1) {
					perror("index_optimize():read()");
				}

				/* optimize fields */
				/* DO WE NEED TO LOOK FOR DUPLICATES? */
				/* copy this posting's ufield list to the end of lfield,
				   counting the records as we go */
				lfield_p_save = lfield_isize + 1;
				field_count = 0;
				ufield_p = upost.fields;
				while (ufield_p != 0) {
					field_count++;
					if (etymon_af_lseek(ufield_fd,
						  (etymon_af_off_t)( ((etymon_af_off_t)(ufield_p - 1)) *
							   ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UFIELD))) ),
						  SEEK_SET) == -1) {
						perror("index_optimize():lseek()");
					}
					if (read(ufield_fd, &ufield, sizeof(ETYMON_INDEX_UFIELD)) == -1) {
						perror("index_optimize():read()");
					}
					/* NOTE(review): the "* 2" presumably reflects
					   two-byte field entries -- confirm against the
					   structure definitions */
					memcpy(lfield.fields, ufield.fields, ETYMON_MAX_FIELD_NEST * 2);
					if (write(lfield_fd, &lfield, sizeof(ETYMON_INDEX_LFIELD)) == -1) {
						perror("index_optimize():write()");
					}
					lfield_isize++;
					ufield_p = ufield.next;
				}

				/* optimize word numbers */
				/* same procedure for this posting's uword list */
				lword_p_save = lword_isize + 1;
				word_count = 0;
				uword_p = upost.word_numbers;
				while (uword_p != 0) {
					word_count++;
					if (etymon_af_lseek(uword_fd,
						  (etymon_af_off_t)( ((etymon_af_off_t)(uword_p - 1)) *
							   ((etymon_af_off_t)(sizeof(ETYMON_INDEX_UWORD))) ),
						  SEEK_SET) == -1) {
						perror("index_optimize():lseek()");
					}
					if (read(uword_fd, &uword, sizeof(ETYMON_INDEX_UWORD)) == -1) {
						perror("index_optimize():read()");
					}
					lword.wn = uword.wn;
					if (write(lword_fd, &lword, sizeof(ETYMON_INDEX_LWORD)) == -1) {
						perror("index_optimize():write()");
					}
					lword_isize++;
					uword_p = uword.next;
				}

				/* compare the doc_id with our cached lpost */
				if (upost.doc_id == lpost.doc_id) {
					/* increment the frequency and field count */
					/* the fields / word_numbers pointers set for the
					   first node of this doc_id are kept; the records
					   just appended follow them directly */
					lpost.freq += upost.freq;
					lpost.fields_n += field_count;
					lpost.word_numbers_n += word_count;
				} else {
					/* flush lpost */
					if (lpost.doc_id != 0) { /* only flush if lpost contains something */
						if (write(lpost_fd, &lpost, sizeof(ETYMON_INDEX_LPOST)) == -1) {
							perror("index_optimize():write()");
						}
						lpost_isize++;
						page_l.post_n[x]++;
					}
					/* replace lpost with upost */
					lpost.doc_id = upost.doc_id;
					lpost.freq = upost.freq;
					lpost.fields_n = field_count;
					lpost.word_numbers_n = word_count;
					/* set field pointer */
					lpost.fields = lfield_p_save;
					lpost.word_numbers = lword_p_save;
				}
				upost_p = upost.next;
			} /* while */
			/* flush lpost */
			if (lpost.doc_id != 0) { /* only flush if lpost contains something */
				if (write(lpost_fd, &lpost, sizeof(ETYMON_INDEX_LPOST)) == -1) {
					perror("index_optimize():write()");
				}
				lpost_isize++;
				page_l.post_n[x]++;
			}
			/* the key now points at its first lpost record */
			page_l.post[x] = lpost_p_save;

		} /* for */

		/* write out updated leaf page */
		if (etymon_af_lseek(udict_fd, ((etymon_af_off_t)(udict_p + 1)), SEEK_SET) == -1) {
			perror("index_optimize():lseek()");
		}
		if (write(udict_fd, &page_l, sizeof(ETYMON_INDEX_PAGE_L)) == -1) {
			perror("index_optimize():write()");
		}

		udict_p = page_l.next;

	} while (udict_p != 0);

	/* update dbinfo */
	/* rewrite the info file from the start: magic number, then dbinfo
	   with the optimized flag set */
	if (etymon_af_lseek(dbinfo_fd, (etymon_af_off_t)0, SEEK_SET) == -1) {
		perror("index_optimize():lseek()");
	}
	magic = ETYMON_INDEX_MAGIC;
	nbytes = write(dbinfo_fd, &magic, sizeof(Uint4));
	if (nbytes != sizeof(Uint4)) {
		/* ERROR */
		printf("unable to write MN\n");
		exit(1);
	}
	dbinfo.optimized = 1;
	nbytes = write(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
	if (nbytes != sizeof(ETYMON_DB_INFO)) {
		/* ERROR */
		printf("unable to write DBI\n");
		exit(1);
	}
	close(dbinfo_fd);

	/* clean up */
	close(udict_fd);
	close(upost_fd);
	close(ufield_fd);
	close(uword_fd);
	close(lpost_fd);
	close(lfield_fd);
	close(lword_fd);

	/* reopen and truncate upost */
	/* NOTE(review): the open() results below are passed to close()
	   without being checked */
	etymon_db_construct_path(ETYMON_DBF_UPOST, opt->dbname, fn);
	upost_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	close(upost_fd);
	/* reopen and truncate ufield */
	etymon_db_construct_path(ETYMON_DBF_UFIELD, opt->dbname, fn);
	ufield_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	close(ufield_fd);
	/* reopen and truncate uword */
	etymon_db_construct_path(ETYMON_DBF_UWORD, opt->dbname, fn);
	uword_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
	close(uword_fd);

	/* unlock the database */
	etymon_db_unlock(opt->dbname);

	return 0;
}
910 
911 #endif
912 
/* Writes a non-leaf page to filedes at the given byte offset.

   The on-disk form is a one-byte leaf flag (0 for a non-leaf page)
   immediately followed by the ETYMON_INDEX_PAGE_NL structure.

   Errors are reported on stderr.  If the seek fails, or the flag byte
   cannot be written, nothing further is written: continuing would
   place the page at whatever position the descriptor happens to be
   at, or store the page without its flag byte. */
void etymon_index_write_nl(int filedes, etymon_af_off_t offset, ETYMON_INDEX_PAGE_NL* page) {
	static const Uint1 leaf_flag = 0;
	long nwritten;

	if (etymon_af_lseek(filedes, (etymon_af_off_t)offset, SEEK_SET) == -1) {
		perror("index_write_nl():lseek()");
		return;
	}

	nwritten = (long)write(filedes, &leaf_flag, 1);
	if (nwritten != 1) {
		if (nwritten == -1) {
			perror("index_write_nl():write()");
		} else {
			/* errno is not meaningful for a short write */
			fprintf(stderr, "index_write_nl():write(): short write\n");
		}
		return;
	}

	nwritten = (long)write(filedes, page, sizeof(ETYMON_INDEX_PAGE_NL));
	if (nwritten == -1) {
		perror("index_write_nl():write()");
	} else if ((size_t)nwritten != sizeof(ETYMON_INDEX_PAGE_NL)) {
		fprintf(stderr, "index_write_nl():write(): short write\n");
	}
}
925 
926 
/* Writes a leaf page to filedes at the given byte offset.

   The on-disk form is a one-byte leaf flag (1 for a leaf page)
   immediately followed by the ETYMON_INDEX_PAGE_L structure.

   Errors are reported on stderr.  If the seek fails, or the flag byte
   cannot be written, nothing further is written: continuing would
   place the page at whatever position the descriptor happens to be
   at, or store the page without its flag byte. */
void etymon_index_write_l(int filedes, etymon_af_off_t offset, ETYMON_INDEX_PAGE_L* page) {
	static const Uint1 leaf_flag = 1;
	long nwritten;

	if (etymon_af_lseek(filedes, (etymon_af_off_t)offset, SEEK_SET) == -1) {
		perror("index_write_l():lseek()");
		return;
	}

	nwritten = (long)write(filedes, &leaf_flag, 1);
	if (nwritten != 1) {
		if (nwritten == -1) {
			perror("index_write_l():write()");
		} else {
			/* errno is not meaningful for a short write */
			fprintf(stderr, "index_write_l():write(): short write\n");
		}
		return;
	}

	nwritten = (long)write(filedes, page, sizeof(ETYMON_INDEX_PAGE_L));
	if (nwritten == -1) {
		perror("index_write_l():write()");
	} else if ((size_t)nwritten != sizeof(ETYMON_INDEX_PAGE_L)) {
		fprintf(stderr, "index_write_l():write(): short write\n");
	}
}
939 
940 
941 /* returns 1 if flush was performed */
etymon_index_flush_l(ETYMON_INDEX_INDEXING_STATE * state)942 int etymon_index_flush_l(ETYMON_INDEX_INDEXING_STATE* state) {
943 	if (state->pcache_l_write != 0) {
944 		etymon_index_write_l(state->udict_fd, state->pcache_l_write, &(state->pcache_l));
945 		state->pcache_l_write = 0;
946 		return 1;
947 	} else {
948 		return 0;
949 	}
950 }
951 
952 
/* Binary search over the keys of a non-leaf page.

   Key i occupies bytes offset[i] .. offset[i + 1] - 1 of page->keys.
   word is compared against a key byte-wise over the key's length; if
   those bytes are equal, the shorter of the two sorts first.

   Returns the index of the child pointer to follow: the slot to the
   right of the key on an exact match, otherwise the position at which
   word would be inserted. */
int etymon_index_search_keys_nl(unsigned char* word, size_t word_len, ETYMON_INDEX_PAGE_NL* page) {
	int lo, hi;

	if (page->n == 0) {
		return 0;
	}

	lo = 0;
	hi = page->n - 1;
	while (lo <= hi) {
		int mid = lo + (hi - lo) / 2;
		int key_len = page->offset[mid + 1] - page->offset[mid];
		int order = strncmp((char*)word, (char*)(page->keys + page->offset[mid]), key_len);

		if (order == 0) {
			/* equal over the key's length: decide by length */
			order = word_len - key_len;
		}
		if (order == 0) {
			/* different from a leaf search - match gives right side pointer */
			return mid + 1;
		}
		if (order > 0) {
			lo = mid + 1;
		} else {
			hi = mid - 1;
		}
	}
	return lo;
}
981 
982 
/*
 * Binary search over the keys of a leaf page.
 *
 * Keys are stored back to back in page->keys; key m occupies the bytes
 * from page->offset[m] up to (not including) page->offset[m + 1].  word is
 * compared byte-wise against each probed key and, when the key's bytes all
 * match, by length (word_len against the key length).
 *
 * On an exact match *match is set to 1 and the index of the matching key
 * is returned.  Otherwise *match is set to 0 and the index at which word
 * would have to be inserted is returned.
 */
int etymon_index_search_keys_l(unsigned char* word, size_t word_len, ETYMON_INDEX_PAGE_L* page, int* match) {
	int lo, hi;

	*match = 0;
	if (page->n == 0) {
		return 0;
	}

	lo = 0;
	hi = page->n - 1;
	while (lo <= hi) {
		int mid = (lo + hi) / 2;
		int key_len = page->offset[mid + 1] - page->offset[mid];
		int order = strncmp( (char*)word,
				     (char*)(page->keys + page->offset[mid]),
				     key_len );

		/* equal over the key's bytes: the shorter string sorts first */
		if (order == 0) {
			order = word_len - key_len;
		}

		if (order == 0) {
			/* unlike the non-leaf search, a match returns the key's own index */
			*match = 1;
			return mid;
		}
		if (order < 0) {
			hi = mid - 1;
		} else {
			lo = mid + 1;
		}
	}
	return lo;
}
1014 
1015 
/*
 * Appends one posting for the cached word state->wcache[wcache_p] to the
 * upost file and links it at the head of the posting chain of key ins in
 * page_l.
 *
 * The word's cached field records (and, when state->number_words is set,
 * its cached word numbers) are written first to the ufield/uword files.
 * Each record written stores in its next member the 1-based number of the
 * record written before it for this posting (0 for the first), and the
 * posting stores the number of the last record written, or 0 if none.
 *
 * Write failures are reported with perror() and otherwise ignored.
 */
void etymon_index_write_upost(ETYMON_INDEX_INDEXING_STATE* state, int wcache_p, ETYMON_INDEX_PAGE_L* page_l, int ins) {
	int node;    /* current index into fcache / wncache, -1 ends the list */
	Uint4 chain; /* 1-based number of the record written last, 0 if none */

	state->upost.doc_id = state->wcache[wcache_p].doc_id;
	state->upost.freq = state->wcache[wcache_p].freq;
	/* the new posting goes in front of the key's existing chain */
	state->upost.next = page_l->post[ins];

	/* write out fields if there are any (an empty list leaves both members 0) */
	chain = 0;
	state->upost.fields_n = 0;
	for (node = state->wcache[wcache_p].fields; node != -1; node = state->fcache[node].next) {
		memcpy(state->ufield.fields, state->fcache[node].f, ETYMON_MAX_FIELD_NEST * 2);
		state->ufield.next = chain;
		if (write(state->ufield_fd, &(state->ufield), sizeof(ETYMON_INDEX_UFIELD)) == -1) {
			perror("index_write_upost():write()");
		}
		state->ufield_isize++;
		chain = state->ufield_isize;
		state->upost.fields_n++;
	}
	state->upost.fields = chain;

	/* write out word number data if word numbering is enabled */
	if (state->number_words) {
		chain = 0;
		state->upost.word_numbers_n = 0;
		for (node = state->wcache[wcache_p].word_numbers_head; node != -1; node = state->wncache[node].next) {
			state->uword.wn = state->wncache[node].wn;
			state->uword.next = chain;
			if (write(state->uword_fd, &(state->uword), sizeof(ETYMON_INDEX_UWORD)) == -1) {
				perror("index_write_upost():write()");
			}
			state->uword_isize++;
			chain = state->uword_isize;
			state->upost.word_numbers_n++;
		}
		state->upost.word_numbers = chain;
	} else {
		state->upost.word_numbers_n = 0;
		state->upost.word_numbers = 0;
	}

	/* the key now points at the posting about to be written (1-based) */
	page_l->post[ins] = state->upost_isize + 1;
	page_l->post_n[ins]++;

	/* now write out the new upost */
	if (write(state->upost_fd, &(state->upost), sizeof(ETYMON_INDEX_UPOST)) == -1) {
		perror("index_write_upost():write()");
	}
	state->upost_isize++;
}
1079 
1080 
/*
 * Inserts word (word_len bytes) as key number ins of a leaf page.
 *
 * Keys from ins onward are moved up by word_len bytes in page->keys, the
 * post and post_n entries from ins onward are moved up by one slot, and
 * the offset directory is rebuilt so that every offset after ins grows by
 * word_len.  page->n is incremented.  The post/post_n slot at ins keeps
 * its old contents; the caller is expected to set it.  No capacity check
 * is made here.
 */
void etymon_index_insert_key_l(ETYMON_INDEX_PAGE_L* page, int ins, unsigned char* word, size_t word_len) {
	int slot;

	/* open a slot in the posting arrays */
	memmove(page->post + ins + 1, page->post + ins, (page->n - ins) * sizeof(Uint4));
	memmove(page->post_n + ins + 1, page->post_n + ins, (page->n - ins) * sizeof(Uint4));

	/* make room in the key area, unless the new key goes at the very end */
	if (ins < page->n) {
		memmove(page->keys + page->offset[ins] + word_len,
			page->keys + page->offset[ins],
			page->offset[page->n] - page->offset[ins]);
	}
	memcpy(page->keys + page->offset[ins], word, word_len);

	/* shift the offset directory, working downwards so that each source
	   entry is still unmodified when it is read */
	page->n++;
	slot = page->n;
	while (slot > ins) {
		page->offset[slot] = page->offset[slot - 1] + word_len;
		slot--;
	}
}
1099 
1100 
/*
 * Inserts word (word_len bytes) as key number ins of a non-leaf page.
 *
 * Keys from ins onward are moved up by word_len bytes in page->keys, the
 * child pointers from p[ins + 1] onward are moved up by one slot, and the
 * offset directory is rebuilt so that every offset after ins grows by
 * word_len.  page->n is incremented.  p[ins + 1] keeps its old contents;
 * the caller is expected to store the new child pointer there.  No
 * capacity check is made here.
 */
void etymon_index_insert_key_nl(ETYMON_INDEX_PAGE_NL* page, int ins, unsigned char* word, size_t word_len) {
	int slot;

	/* open a slot for the pointer to the right of the new key */
	memmove(page->p + ins + 2, page->p + ins + 1, (page->n - ins) * sizeof(Uint4));

	/* make room in the key area, unless the new key goes at the very end */
	if (ins < page->n) {
		memmove(page->keys + page->offset[ins] + word_len,
			page->keys + page->offset[ins],
			page->offset[page->n] - page->offset[ins]);
	}
	memcpy(page->keys + page->offset[ins], word, word_len);

	/* shift the offset directory, working downwards so that each source
	   entry is still unmodified when it is read */
	page->n++;
	slot = page->n;
	while (slot > ins) {
		page->offset[slot] = page->offset[slot - 1] + word_len;
		slot--;
	}
}
1118 
1119 
1120 /* fills in word with the shortest separator between the two pages, and returns the word length */
etymon_index_shortest_sep_l(ETYMON_INDEX_PAGE_L * left,ETYMON_INDEX_PAGE_L * right,unsigned char * word)1121 int etymon_index_shortest_sep_l(ETYMON_INDEX_PAGE_L* left, ETYMON_INDEX_PAGE_L* right, unsigned char* word) {
1122 	static int p;
1123 	static int max;
1124 	static unsigned char* left_word;
1125 	static int left_word_len;
1126 	max = right->offset[1];
1127 	left_word = left->keys + left->offset[left->n - 1];
1128 	left_word_len = left->offset[left->n] - left->offset[left->n - 1];
1129 	p = 0;
1130 	do {
1131 		word[p] = right->keys[p];
1132 		p++;
1133 	} while ( (p < max) && (left_word[p - 1] >= word[p - 1]) );
1134 	word[p] = '\0';
1135 	return p;
1136 }
1137 
1138 
1139 /* fills in word with the shortest separator between the two pages, and returns the word length */
etymon_index_shortest_sep_nl(ETYMON_INDEX_PAGE_NL * left,ETYMON_INDEX_PAGE_NL * right,unsigned char * word)1140 int etymon_index_shortest_sep_nl(ETYMON_INDEX_PAGE_NL* left, ETYMON_INDEX_PAGE_NL* right, unsigned char* word) {
1141 	static int p;
1142 	static int max;
1143 	static unsigned char* left_word;
1144 	static int left_word_len;
1145 	max = right->offset[1];
1146 	left_word = left->keys + left->offset[left->n - 1];
1147 	left_word_len = left->offset[left->n] - left->offset[left->n - 1];
1148 	p = 0;
1149 	do {
1150 		word[p] = right->keys[p];
1151 		p++;
1152 	} while ( (p < max) && (left_word[p - 1] >= word[p - 1]) );
1153 	word[p] = '\0';
1154 	return p;
1155 }
1156 
1157 
/*
 * Adds a separator key, and the file position of the page to its right,
 * to the non-leaf page cached at state->pcache_nl[level], i.e. the parent
 * of a page that has just been split.
 *
 *   state    - indexing state; pcache_nl[0..level] hold the current
 *              descent path
 *   level    - cache level of the page to update; a negative value means
 *              the page that was split was the root, and a new root page
 *              is created
 *   word     - the separator key (word_len bytes, null-terminated)
 *   child    - file position of the page to the right of the separator
 *
 * If the target page is full it is split: the upper half of its keys moves
 * to a new page appended to the dictionary file, the median key is removed
 * and passed up to the next level by a recursive call, and the new key is
 * inserted at its sorted position in whichever half it belongs to.
 * Updated pages are written to disk immediately.
 */
void etymon_index_parent_add_key(ETYMON_INDEX_INDEXING_STATE* state, int level, unsigned char* word, size_t word_len,
				 Uint4 child) {
	int x, y, ins, comp;
	unsigned char new_word[ETYMON_MAX_WORD_SIZE];
	size_t new_word_len;
	Uint4 overflow_pos;

	/* first check if we have ascended above the root of the tree */
	if (level < 0) {
		/* if so, we create a new root page */
		/* we can do it in place of the old root position in the pcache, and invalidate the rest of the pcache */
		state->pcache_nl[0].nl.n = 1;
		state->pcache_nl[0].nl.p[0] = state->pcache_nl[0].pos; /* grab the pos from the now former root page */
		state->pcache_nl[0].nl.p[1] = child;
		state->pcache_nl[0].nl.offset[0] = 0;
		state->pcache_nl[0].nl.offset[1] = word_len;
		memcpy(state->pcache_nl[0].nl.keys, word, word_len);
		state->pcache_nl[0].pos = state->udict_size;
		state->pcache_nl[0].is_nl = 1;
		state->pcache_count = 1;
		/* now write out the new root page */
		etymon_index_write_nl(state->udict_fd, state->pcache_nl[0].pos, &(state->pcache_nl[0].nl));
		state->udict_size += sizeof(Uint1) + sizeof(ETYMON_INDEX_PAGE_NL);
		/* set new root pointer */
		state->udict_root = state->pcache_nl[0].pos;
		return;
	}

	/* check if page is full */
	if ( (state->pcache_nl[level].nl.n >= ETYMON_MAX_KEYS_NL) ||
	     ((ETYMON_MAX_KEY_AREA_NL - state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n]) /* empty key space */
	      < (int)word_len) ) {

		/* split the page */

		/* allocate new non-leaf page for split overflow */
		/* we move half of the keys into the overflow page */
		state->overflow_nl.n = state->pcache_nl[level].nl.n / 2;
		/* move the offsets - by hand, rebased so the overflow page starts at 0 */
		y = state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - state->overflow_nl.n];
		for (x = 0; x <= state->overflow_nl.n; x++) {
			state->overflow_nl.offset[x] =
				state->pcache_nl[level].nl.offset[x + state->pcache_nl[level].nl.n -
								 state->overflow_nl.n] - y;
		}

		/* move the keys */
		memcpy(state->overflow_nl.keys,
		       state->pcache_nl[level].nl.keys +
		       state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - state->overflow_nl.n],
		       state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n] -
		       state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - state->overflow_nl.n]);

		/* move the page pointers (one more pointer than keys) */
		memcpy(state->overflow_nl.p,
		       state->pcache_nl[level].nl.p +
		       state->pcache_nl[level].nl.n - state->overflow_nl.n,
		       (state->overflow_nl.n + 1) * sizeof(Uint4));

		state->pcache_nl[level].nl.n -= state->overflow_nl.n;

		/* remove the median key (now at the end of the old page),
		   which we remember and will insert into the parent */
		new_word_len = state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n] -
			state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - 1];
		memcpy(new_word, state->pcache_nl[level].nl.keys +
		       state->pcache_nl[level].nl.offset[state->pcache_nl[level].nl.n - 1], new_word_len);
		new_word[new_word_len] = '\0';
		state->pcache_nl[level].nl.n--;

		/* insert the new key into either the old or new nl page: keys
		   below the median stay in the old page, all others belong in
		   the overflow page; the comparison mirrors the one used by
		   etymon_index_search_keys_nl() */
		comp = strncmp((char*)word, (char*)new_word, new_word_len);
		if (comp == 0) {
			comp = (word_len > new_word_len) - (word_len < new_word_len);
		}
		if (comp < 0) {
			/* it goes in the old page, at its sorted position */
			ins = etymon_index_search_keys_nl(word, word_len, &(state->pcache_nl[level].nl));
			etymon_index_insert_key_nl(&(state->pcache_nl[level].nl), ins, word, word_len);

			state->pcache_nl[level].nl.p[ins + 1] = child;
		} else {
			/* it goes in the new page, at its sorted position (possibly 0) */
			ins = etymon_index_search_keys_nl(word, word_len, &(state->overflow_nl));
			etymon_index_insert_key_nl(&(state->overflow_nl), ins, word, word_len);

			state->overflow_nl.p[ins + 1] = child;
		}

		/* write out old nl page */
		etymon_index_write_nl(state->udict_fd, state->pcache_nl[level].pos, &(state->pcache_nl[level].nl));
		/* write out new overflow page, appended to the dictionary file */
		overflow_pos = state->udict_size;
		etymon_index_write_nl(state->udict_fd, overflow_pos, &(state->overflow_nl));
		/* update size of dictionary file */
		state->udict_size += sizeof(Uint1) + sizeof(ETYMON_INDEX_PAGE_NL);

		/* recursively update parent page, splitting if necessary */
		etymon_index_parent_add_key(state, level - 1, new_word, new_word_len, overflow_pos);

	} else {
		/* if not full, simply insert the new key */
		ins = etymon_index_search_keys_nl(word, word_len, &(state->pcache_nl[level].nl));
		etymon_index_insert_key_nl(&(state->pcache_nl[level].nl), ins, word, word_len);
		state->pcache_nl[level].nl.p[ins + 1] = child;
		/* write out the updated page */
		etymon_index_write_nl(state->udict_fd, state->pcache_nl[level].pos, &(state->pcache_nl[level].nl));
	}

}
1266 
1267 
1268 /* returns 0 if everything was OK */
etymon_index_index_word(ETYMON_INDEX_INDEXING_STATE * state,int wcache_p)1269 int etymon_index_index_word(ETYMON_INDEX_INDEXING_STATE* state, int wcache_p) {
1270 	static int x, y, match, ins;
1271 	static Uint4 p;
1272 	static ssize_t nbytes;
1273 	static Uint1 leaf_flag;
1274 	static unsigned char* word;
1275 	static unsigned char new_word[ETYMON_MAX_WORD_SIZE];
1276 	static size_t word_len, new_word_len;
1277 	static int level;
1278 
1279 	word = state->wcache[wcache_p].word;
1280 
1281 	/* search for the right page */
1282 
1283 	/* start at the root page */
1284 	p = state->udict_root;
1285 	level = 0;
1286 
1287 	/* search the tree by descent */
1288 	do {
1289 
1290 		/* get the page from the cache if available */
1291 
1292 		if ( (level < state->pcache_count) &&
1293 		     (state->pcache_nl[level].pos == p) ) {
1294 			/* yes, it is in the cache */
1295 			/* check whether it is a leaf page */
1296 			if (state->pcache_nl[level].is_nl == 1) {
1297 				leaf_flag = 0;
1298 			} else {
1299 				leaf_flag = 1;
1300 			}
1301 		} else {
1302 			/* not in the cache, so read it from disk */
1303 			if (etymon_af_lseek(state->udict_fd, (etymon_af_off_t)p, SEEK_SET) == -1) {
1304 				perror("index_index_word():lseek()");
1305 			}
1306 			/* check whether it is a leaf page */
1307 			nbytes = read(state->udict_fd, &(leaf_flag), sizeof(Uint1));
1308 			if (nbytes != sizeof(Uint1)) {
1309 				/* ERROR */
1310 				printf("error reading from index (LP)\n");
1311 				exit(1);
1312 			}
1313 			if (leaf_flag == 0) {
1314 				/* read the non-leaf page into the right cache position */
1315 				state->pcache_nl[level].pos = p;
1316 				state->pcache_nl[level].is_nl = 1;
1317 				state->pcache_count = level + 1;
1318 				nbytes = read(state->udict_fd, &(state->pcache_nl[level].nl),
1319 					      sizeof(ETYMON_INDEX_PAGE_NL));
1320 				if (nbytes != sizeof(ETYMON_INDEX_PAGE_NL)) {
1321 					/* ERROR */
1322 					printf("error reading from index (NL)\n");
1323 					exit(1);
1324 				}
1325 			} else {
1326 
1327 				/* read the leaf page into the leaf page cache */
1328 
1329 				if (etymon_index_flush_l(state) == 1) { /* flush leaf write cache first */
1330 					if (etymon_af_lseek(state->udict_fd, (etymon_af_off_t)(p + 1), SEEK_SET) == -1) {
1331 						perror("index_index_word():lseek()");
1332 					}
1333 				}
1334 
1335 				state->pcache_nl[level].pos = p;
1336 				state->pcache_nl[level].is_nl = 0;
1337 				state->pcache_count = level + 1;
1338 				nbytes = read(state->udict_fd, &(state->pcache_l), sizeof(ETYMON_INDEX_PAGE_L));
1339 				if (nbytes != sizeof(ETYMON_INDEX_PAGE_L)) {
1340 					/* ERROR */
1341 					printf("error reading from index (L)\n");
1342 					exit(1);
1343 				}
1344 			}
1345 		}
1346 
1347 		/* if it is not a leaf page, determine next seek */
1348 		if (leaf_flag == 0) {
1349 			ins = etymon_index_search_keys_nl(word, word_len,
1350 							  &(state->pcache_nl[level].nl));
1351 			p = state->pcache_nl[level].nl.p[ins];
1352 			level++;
1353 			/* Internal overflow (level) */
1354 			if (level >= ETYMON_MAX_PAGE_DEPTH)
1355 				return aferr(AFEUNKNOWN);
1356 		}
1357 
1358 	} while (leaf_flag == 0);
1359 	/* we have reached a leaf page */
1360 
1361 	word_len = strlen((char*)word);
1362 
1363 	/* determine position in key list to insert new key */
1364 	ins = etymon_index_search_keys_l(word, word_len,
1365 					 &(state->pcache_l), &match);
1366 
1367 	/* if we found a perfect match, then no need to insert the key; just add it to the postings */
1368 	if (match) {
1369 
1370 		etymon_index_write_upost(state, wcache_p, &(state->pcache_l), ins);
1371 
1372 		/* tag leaf for write cache */
1373 		state->pcache_l_write = p;
1374 
1375 	} else {
1376 		/* insert the key */
1377 		/* check if page is full */
1378 		if ( (state->pcache_l.n >= ETYMON_MAX_KEYS_L) ||
1379 		     ((ETYMON_MAX_KEY_AREA_L - state->pcache_l.offset[state->pcache_l.n]) /* empty key space */
1380 		      < (int)word_len) ) {
1381 
1382 			/* if so, we split the page */
1383 
1384 			/* allocate new leaf page for split overflow */
1385 			/* we move half of the keys into the overflow leaf page */
1386 			state->overflow_l.n = state->pcache_l.n / 2;
1387 			state->overflow_l.prev = p;
1388 			state->overflow_l.next = state->pcache_l.next;
1389 			/* move the posting data */
1390 			memcpy(state->overflow_l.post,
1391 			       state->pcache_l.post +
1392 			       state->pcache_l.n - state->overflow_l.n,
1393 			       state->overflow_l.n * sizeof(Uint4));
1394 			memcpy(state->overflow_l.post_n,
1395 			       state->pcache_l.post_n +
1396 			       state->pcache_l.n - state->overflow_l.n,
1397 			       state->overflow_l.n * sizeof(Uint4));
1398 			/* move the offsets - by hand */
1399 			y = state->pcache_l.offset[state->pcache_l.n - state->overflow_l.n];
1400 			for (x = 0; x <= state->overflow_l.n; x++) {
1401 				state->overflow_l.offset[x] =
1402 					state->pcache_l.offset[x + state->pcache_l.n - state->overflow_l.n] - y;
1403 			}
1404 			/* move the keys */
1405 			memcpy(state->overflow_l.keys,
1406 			       state->pcache_l.keys +
1407 			       state->pcache_l.offset[state->pcache_l.n - state->overflow_l.n],
1408 			       state->pcache_l.offset[state->pcache_l.n] -
1409 			       state->pcache_l.offset[state->pcache_l.n - state->overflow_l.n]);
1410 			state->pcache_l.n -= state->overflow_l.n;
1411 			state->pcache_l.next = state->udict_size;
1412 
1413 			/* insert the new key into either the old or new leaf page */
1414 			if (ins <= state->pcache_l.n) {
1415 				/* it goes in the old page */
1416 				etymon_index_insert_key_l(&(state->pcache_l), ins, word, word_len);
1417 
1418 				state->pcache_l.post[ins] = 0;
1419 				state->pcache_l.post_n[ins] = 0;
1420 				etymon_index_write_upost(state, wcache_p, &(state->pcache_l), ins);
1421 			} else {
1422 				/* it goes in the new page */
1423 
1424 				x = etymon_index_search_keys_l(word, word_len,
1425 							       &(state->overflow_l), &match);
1426 
1427 				etymon_index_insert_key_l(&(state->overflow_l), x, word, word_len);
1428 
1429 				state->overflow_l.post[x] = 0;
1430 				state->overflow_l.post_n[x] = 0;
1431 				etymon_index_write_upost(state, wcache_p, &(state->overflow_l), x);
1432 			}
1433 
1434 			/* tag old leaf for write caching */
1435 			state->pcache_l_write = p;
1436 			/* write out new overflow page */
1437 			etymon_index_write_l(state->udict_fd, state->udict_size, &(state->overflow_l));
1438 			y = state->udict_size; /**/
1439 			/* update size of dictionary file */
1440 			state->udict_size += sizeof(Uint1) + sizeof(ETYMON_INDEX_PAGE_L);
1441 
1442 			/* update prev pointer in far-right leaf to point to overflow_l */
1443 			if (state->overflow_l.next != 0) {
1444 				if (etymon_af_lseek(state->udict_fd,
1445 						    (etymon_af_off_t)(state->overflow_l.next + 1), SEEK_SET) == -1) {
1446 					perror("index_index_word():lseek()");
1447 				}
1448 				nbytes = read(state->udict_fd, &(state->extra_l), sizeof(ETYMON_INDEX_PAGE_L));
1449 				if (nbytes == -1) {
1450 					perror("index_index_word():read()");
1451 				}
1452 				state->extra_l.prev = state->pcache_l.next;
1453 				if (etymon_af_lseek(state->udict_fd, (etymon_af_off_t)(state->overflow_l.next + 1), SEEK_SET) == -1) {
1454 					perror("index_index_word():lseek()");
1455 				}
1456 				nbytes = write(state->udict_fd, &(state->extra_l), sizeof(ETYMON_INDEX_PAGE_L));
1457 				if (nbytes == -1) {
1458 					perror("index_index_word():write()");
1459 				}
1460 			}
1461 
1462 			/* we want to insert a new key in the parent to fork the split */
1463 
1464 			new_word_len = etymon_index_shortest_sep_l(&(state->pcache_l), &(state->overflow_l),
1465 								   new_word);
1466 
1467 			/* recursively update parent page, splitting if necessary */
1468 			etymon_index_parent_add_key(state, level - 1, new_word, new_word_len,
1469 						    state->pcache_l.next);
1470 
1471 		} else {
1472 
1473 			/* if not, simply scoot keys over and insert new key */
1474 			etymon_index_insert_key_l(&(state->pcache_l), ins, word, word_len);
1475 			state->pcache_l.post[ins] = 0;
1476 			state->pcache_l.post_n[ins] = 0;
1477 
1478 			/* next add postings for the new key */
1479 
1480 			etymon_index_write_upost(state, wcache_p, &(state->pcache_l), ins);
1481 
1482 			/* tag leaf for write cache */
1483 			state->pcache_l_write = p;
1484 		}
1485 
1486 	}
1487 
1488 	return 0;
1489 }
1490 
1491 
1492 /* returns 0 if everything was OK */
etymon_index_traverse_wcache(ETYMON_INDEX_INDEXING_STATE * state,int p)1493 int etymon_index_traverse_wcache(ETYMON_INDEX_INDEXING_STATE* state, int p) {
1494 	int c;
1495 	int start;
1496 	if (state->wcache[p].left != -1) {
1497 		if (etymon_index_traverse_wcache(state, state->wcache[p].left) == -1) {
1498 			return -1;
1499 		}
1500 	}
1501 	start = state->wcache[p].next;
1502 	c = start;
1503 	do {
1504 		if (etymon_index_index_word(state, c) == -1) {
1505 			return -1;
1506 		}
1507 		c = state->wcache[c].next;
1508 	} while (c != start);
1509 	if (state->wcache[p].right != -1) {
1510 		if (etymon_index_traverse_wcache(state, state->wcache[p].right) == -1) {
1511 			return -1;
1512 		}
1513 	}
1514 	return 0;
1515 }
1516 
1517 
1518 /* returns 0 if everything was OK */
etymon_index_dclass_index(ETYMON_INDEX_INDEXING_STATE * state)1519 int etymon_index_dclass_index(ETYMON_INDEX_INDEXING_STATE* state) {
1520 	static Uint1 leaf_flag;
1521 	static ssize_t nbytes;
1522 
1523 	if (!state->flushmsg) {
1524 		state->flushmsg = 1;
1525 		afprintv(state->verbose, 2, "Flushing index buffers");
1526 	}
1527 	/* make sure there is at least one page (root) */
1528 	if (state->udict_root == 0) {
1529 		/* seek to offset 0 and write one zero byte (unused) */
1530 		if (etymon_af_lseek(state->udict_fd, (etymon_af_off_t)0, SEEK_SET) == -1) {
1531 			perror("index_dclass_index():lseek()");
1532 		}
1533 		leaf_flag = 0; /* we'll use the leaf_flag variable, but this isn't really a leaf flag byte */
1534 		nbytes = write(state->udict_fd, &(leaf_flag), 1);
1535 		if (nbytes == -1) {
1536 			perror("index_dclass_index():write()");
1537 		}
1538 		/* write an empty root page now at offset 1 */
1539 		state->pcache_l.n = 0;
1540 		state->pcache_l.prev = 0;
1541 		state->pcache_l.next = 0;
1542 		state->pcache_l.offset[0] = 0;
1543 		leaf_flag = 1;
1544 		nbytes = write(state->udict_fd, &(leaf_flag), sizeof(Uint1));
1545 		if (nbytes != sizeof(Uint1)) {
1546 			/* ERROR */
1547 			printf("error writing to index\n");
1548 			exit(1);
1549 		}
1550 		nbytes = write(state->udict_fd, &(state->pcache_l), sizeof(ETYMON_INDEX_PAGE_L));
1551 		if (nbytes != sizeof(ETYMON_INDEX_PAGE_L)) {
1552 			/* ERROR */
1553 			printf("error writing to index\n");
1554 			exit(1);
1555 		}
1556 		/* update size of dictionary file */
1557 		state->udict_size += 2 + sizeof(ETYMON_INDEX_PAGE_L);
1558 		/* set root pointer */
1559 		state->udict_root = 1;
1560 		/* root page is now cached and it is a leaf */
1561 		state->pcache_nl[0].pos = 1;
1562 		state->pcache_nl[0].is_nl = 0;
1563 		state->pcache_count = 1;
1564 	}
1565 
1566 	if (state->wcache_count > 0) {
1567 		if (etymon_index_traverse_wcache(state, state->wcache_root) == -1) {
1568 			return -1;
1569 		}
1570 	}
1571 	return 0;
1572 }
1573 
1574 
1575 /* return 0 if everything went well */
etymon_index_dclass_finish(ETYMON_INDEX_INDEXING_STATE * state)1576 int etymon_index_dclass_finish(ETYMON_INDEX_INDEXING_STATE* state) {
1577 	/* perform last indexing pass */
1578 	if (etymon_index_dclass_index(state) == -1) {
1579 		return -1;
1580 	}
1581 	/* flush write cached leaf node */
1582 	etymon_index_flush_l(state);
1583 
1584 	return 0;
1585 }
1586 
1587 
etymon_af_index_get_split_list(ETYMON_AF_DC_INDEX * dc_index,char * split)1588 ETYMON_AF_DC_SPLIT* etymon_af_index_get_split_list(ETYMON_AF_DC_INDEX*
1589 						   dc_index, char* split) {
1590 	ETYMON_AF_DC_SPLIT* split_list;
1591 	ETYMON_AF_DC_SPLIT* split_p;
1592 	ETYMON_DOCBUF* docbuf = dc_index->docbuf;
1593 	int split_len = strlen(split);
1594 	int split_match = 0;  /* number of characters matched with
1595 				 delimiter string */
1596 	unsigned char ch;
1597 	Uint4 offset = 0;
1598 
1599 	/* initialize split list */
1600 	split_list =
1601 		(ETYMON_AF_DC_SPLIT*)(malloc(
1602 			sizeof(ETYMON_AF_DC_SPLIT)));
1603 	split_p = split_list;
1604 
1605 	/* return if the document size is 0 */
1606 	if (docbuf->data_len == 0) {
1607 		split_list->end = 0;
1608 		split_list->next = NULL;
1609 		return split_list;
1610 	}
1611 
1612 	/* skip first character to avoid a 0 length document resulting
1613 	   from an immediate match */
1614 	etymon_docbuf_next_char(docbuf);
1615 	offset++;
1616 
1617 	/* find matches to the delimiter string */
1618 	while ( ! docbuf->eof ) {
1619 		ch = etymon_docbuf_next_char(docbuf);
1620 		offset++;
1621 		if (ch == split[split_match]) {
1622 			split_match++;
1623 			if (split_match == split_len) {
1624 				split_match = 0;
1625 				split_p->end = offset - split_len;
1626 				split_p->next = (ETYMON_AF_DC_SPLIT*)(malloc(
1627 					sizeof(ETYMON_AF_DC_SPLIT)));
1628 				split_p = split_p->next;
1629 			}
1630 		} else {
1631 			if (split_match != 0) {
1632 				split_match = 0;
1633 			}
1634 		}
1635 	}
1636 
1637 	split_p->end = docbuf->st.st_size;
1638 	split_p->next = NULL;
1639 
1640 	return split_list;
1641 }
1642 
1643 
1644 /* returns 0 if everything went OK */
etymon_index_add_files(Afindex * opt)1645 int etymon_index_add_files(Afindex *opt) {
1646 	ETYMON_DOCBUF* docbuf;
1647 	ETYMON_INDEX_INDEXING_STATE* state;
1648 	char s_file[ETYMON_MAX_PATH_SIZE];
1649 	char* source_file;
1650 	char fn[ETYMON_MAX_PATH_SIZE];
1651 	char cwd[ETYMON_MAX_PATH_SIZE];
1652 	ETYMON_AF_STAT st;
1653 	ssize_t nbytes;
1654 	size_t maxmem, memleft;
1655 /*	int dbinfo_fd; */
1656 	int x_file;
1657 	Uint4 magic;
1658 	ETYMON_DB_INFO *dbinfo;
1659 	int dclass_id;
1660 	int result;
1661 	int fdef_fd;
1662 	int done_files;
1663 	int file_good;
1664 	int x;
1665 	size_t wcache_alloc, fcache_alloc, wncache_alloc;
1666 	ETYMON_AF_DC_INDEX dc_index;
1667 	ETYMON_AF_DC_INIT dc_init;
1668 	ETYMON_AF_DC_SPLIT* split_list = NULL;
1669 	ETYMON_AF_DC_SPLIT* split_p;
1670 	int use_docbuf; /* 1: use docbuf; 0: don't use it */
1671 	char *dbname;
1672 
1673 	dbname = etymon_af_state[opt->dbid]->dbname;
1674 
1675 	maxmem = ((size_t) opt->memory) * 1048576 - 1315000;
1676 
1677 	/* make sure database is ready */
1678 	if (etymon_db_ready(dbname) == 0)
1679 		return aferr(AFEDBLOCK);
1680 
1681 	/* lock the database */
1682 	etymon_db_lock(dbname, NULL);
1683 
1684 	/* open db info file for read/write */
1685 /*
1686 	etymon_db_construct_path(ETYMON_DBF_INFO, dbname, fn);
1687 	dbinfo_fd = open(fn, O_RDWR | ETYMON_AF_O_LARGEFILE);
1688 	if (dbinfo_fd == -1) {
1689 		etymon_db_unlock(dbname);
1690 		return aferr(AFEDBIO);
1691 	}
1692 	nbytes = read(dbinfo_fd, &magic, sizeof(Uint4));
1693 	if (nbytes != sizeof(Uint4)) {
1694 		printf("unable to read %s\n", fn);
1695 		exit(1);
1696 	}
1697 	if (magic != ETYMON_INDEX_MAGIC) {
1698 		close(dbinfo_fd);
1699 		etymon_db_unlock(dbname);
1700 		return aferr(AFEVERSION);
1701 	}
1702 	nbytes = read(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
1703 	if (nbytes != sizeof(ETYMON_DB_INFO)) {
1704 		printf("unable to read %s\n", fn);
1705 		exit(1);
1706 	}
1707 */
1708 	dbinfo = &(etymon_af_state[opt->dbid]->info);
1709 
1710 	if (dbinfo->stemming && !af_stem_available()) {
1711 		etymon_db_unlock(dbname);
1712 		return aferr(AFENOSTEM);
1713 	}
1714 
1715 	/* we can only add files if the database is not optimized */
1716 	if (dbinfo->optimized == 1) {
1717 		etymon_db_unlock(dbname);
1718 		return aferr(AFELINEAR);
1719 	}
1720 
1721 	/* set up state information for indexing */
1722 
1723 	state = (ETYMON_INDEX_INDEXING_STATE*)(malloc(sizeof(ETYMON_INDEX_INDEXING_STATE)));
1724 
1725 	state->udict_root = dbinfo->udict_root;
1726 	state->doc_n = dbinfo->doc_n;
1727 	state->dbname = dbname;
1728 	state->verbose = opt->verbose;
1729 	state->long_words = opt->_longwords;
1730 
1731 	/* open doctable for append */
1732 	etymon_db_construct_path(ETYMON_DBF_DOCTABLE, dbname, fn);
1733 	state->doctable_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1734 	if (state->doctable_fd == -1) {
1735 		/* ERROR */
1736 		printf("unable to open %s for append\n", fn);
1737 		exit(1);
1738 	}
1739 	/* stat doctable to get size */
1740 	if (etymon_af_fstat(state->doctable_fd, &st) == -1) {
1741 		perror("index_add_files():fstat()");
1742 	}
1743 	state->doctable_next_id = st.st_size / sizeof(ETYMON_DOCTABLE) + 1;
1744 
1745 	/* open udict for read/write */
1746 	etymon_db_construct_path(ETYMON_DBF_UDICT, dbname, fn);
1747 	state->udict_fd = open(fn, O_RDWR | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1748 	if (state->udict_fd == -1) {
1749 		/* ERROR */
1750 		printf("unable to open %s for read/write\n", fn);
1751 		exit(1);
1752 	}
1753 	/* stat udict to get size */
1754 	if (etymon_af_fstat(state->udict_fd, &st) == -1) {
1755 		perror("index_add_files():fstat()");
1756 	}
1757 	state->udict_size = st.st_size;
1758 
1759 	/* open upost for append */
1760 	etymon_db_construct_path(ETYMON_DBF_UPOST, dbname, fn);
1761 	state->upost_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1762 	if (state->upost_fd == -1) {
1763 		/* ERROR */
1764 		printf("unable to open %s for append\n", fn);
1765 		exit(1);
1766 	}
1767 	/* stat upost to get size */
1768 	if (etymon_af_fstat(state->upost_fd, &st) == -1) {
1769 		perror("index_add_files():fstat()");
1770 	}
1771 	state->upost_isize = st.st_size / sizeof(ETYMON_INDEX_UPOST);
1772 
1773 	/* open ufield for append */
1774 	etymon_db_construct_path(ETYMON_DBF_UFIELD, dbname, fn);
1775 	state->ufield_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1776 	if (state->ufield_fd == -1) {
1777 		/* ERROR */
1778 		printf("unable to open %s for append\n", fn);
1779 		exit(1);
1780 	}
1781 	/* stat ufield to get size */
1782 	if (etymon_af_fstat(state->ufield_fd, &st) == -1) {
1783 		perror("index_add_files():fstat()");
1784 	}
1785 	state->ufield_isize = st.st_size / sizeof(ETYMON_INDEX_UFIELD);
1786 
1787 	/* open uword for append */
1788 	etymon_db_construct_path(ETYMON_DBF_UWORD, dbname, fn);
1789 	state->uword_fd = open(fn, O_WRONLY | O_APPEND | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1790 	if (state->uword_fd == -1) {
1791 		/* ERROR */
1792 		printf("unable to open %s for append\n", fn);
1793 		exit(1);
1794 	}
1795 	/* stat uword to get size */
1796 	if (etymon_af_fstat(state->uword_fd, &st) == -1) {
1797 		perror("index_add_files():fstat()");
1798 	}
1799 	state->uword_isize = st.st_size / sizeof(ETYMON_INDEX_UWORD);
1800 
1801 	/* allocate memory for page cache */
1802 	state->pcache_nl_size = ETYMON_MAX_PAGE_DEPTH;
1803 	memleft = maxmem - ((size_t) (sizeof(ETYMON_INDEX_PCACHE_NODE) * ETYMON_MAX_PAGE_DEPTH));
1804 	if (memleft < 1048576) {
1805 		memleft = 1048576;
1806 	}
1807 	state->pcache_nl = (ETYMON_INDEX_PCACHE_NODE*)(malloc(sizeof(ETYMON_INDEX_PCACHE_NODE) * ETYMON_MAX_PAGE_DEPTH));
1808 	if (state->pcache_nl == NULL) {
1809 		/* ERROR */
1810 		printf("unable to allocate memory for cache\n");
1811 		exit(1);
1812 	}
1813 	state->pcache_count = 0;
1814 	state->pcache_nl[0].pos = 0;
1815 
1816 	/* initialize the write cached leaf page */
1817 	state->pcache_l_write = 0;
1818 
1819 	/* turn on word numbering */
1820 	state->phrase = dbinfo->phrase;
1821 	state->word_proximity = dbinfo->word_proximity;
1822 	state->stemming = dbinfo->stemming;
1823 	if ( (dbinfo->phrase) || (dbinfo->word_proximity) ) {
1824 		state->number_words = 1;
1825 	} else {
1826 		state->number_words = 0;
1827 	}
1828 
1829 	/* calculate cache memory allocation based on memleft */
1830 	if (state->number_words) {
1831 		wcache_alloc = (size_t) (memleft * .4);
1832 		fcache_alloc = (size_t) (memleft * .3);
1833 		wncache_alloc = (size_t) (memleft * .3);
1834 	} else {
1835 		wcache_alloc = (size_t) (memleft * .5);
1836 		fcache_alloc = (size_t) (memleft * .5);
1837 		wncache_alloc = 0;
1838 	}
1839 	/*
1840 	if (wcache_alloc > 2000000000)
1841 		wcache_alloc = 2000000000;
1842 	if (fcache_alloc > 2000000000)
1843 		fcache_alloc = 2000000000;
1844 	if (wncache_alloc > 2000000000)
1845 		wncache_alloc = 2000000000;
1846 	*/
1847 
1848 	/* allocate memory for word cache */
1849 	state->wcache_size = wcache_alloc / ((size_t) sizeof(ETYMON_INDEX_WCACHE_NODE));
1850 	state->wcache = (ETYMON_INDEX_WCACHE_NODE*)(malloc(sizeof(ETYMON_INDEX_WCACHE_NODE) * state->wcache_size));
1851 	if (state->wcache == NULL) {
1852 		/* ERROR */
1853 		printf("unable to allocate memory for cache\n");
1854 		exit(1);
1855 	}
1856 	state->wcache_count = 0;
1857 	state->wcache_root = -1;
1858 
1859 	/* allocate memory for field cache */
1860 	state->fcache_size = fcache_alloc / ((size_t) sizeof(ETYMON_INDEX_FCACHE_NODE));
1861 	state->fcache = (ETYMON_INDEX_FCACHE_NODE*)(malloc(sizeof(ETYMON_INDEX_FCACHE_NODE) * state->fcache_size));
1862 	if (state->fcache == NULL) {
1863 		/* ERROR */
1864 		printf("unable to allocate memory for cache\n");
1865 		exit(1);
1866 	}
1867 	state->fcache_count = 0;
1868 
1869 	/* allocate memory for word number cache */
1870 	if (state->number_words) {
1871 		state->wncache_size = wncache_alloc / ((size_t) sizeof(ETYMON_INDEX_WNCACHE_NODE));
1872 		state->wncache = (ETYMON_INDEX_WNCACHE_NODE*)(malloc(sizeof(ETYMON_INDEX_WNCACHE_NODE) *
1873 								     state->wncache_size));
1874 		if (state->wncache == NULL) {
1875 			/* ERROR */
1876 			printf("unable to allocate memory for cache\n");
1877 			exit(1);
1878 		}
1879 		state->wncache_count = 0;
1880 	}
1881 
1882 	/* load field definitions into a binary tree */
1883 	/* open fdef for read/write */
1884 	etymon_db_construct_path(ETYMON_DBF_FDEF, dbname, fn);
1885 	fdef_fd = open(fn, O_RDWR | O_CREAT | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
1886 	if (fdef_fd == -1) {
1887 		/* ERROR */
1888 		printf("unable to open %s for read/write\n", fn);
1889 		exit(1);
1890 	}
1891 	state->fdef_count = etymon_af_fdef_read_mem(fdef_fd, &(state->fdef_root), &(state->fdef_tail));
1892 
1893 	/* set dclass_id based on dclass string */
1894 	if (strcmp(opt->doctype, "xml") == 0) {
1895 		dclass_id = 1;
1896 	}
1897 	else if (strcmp(opt->doctype, "xml_test") == 0) {
1898 		dclass_id = 2;
1899 	}
1900 	else if (strcmp(opt->doctype, "erc") == 0) {
1901 		dclass_id = 100;
1902 	} else {
1903 		/* need to print an error here if the input is unknown
1904                    - right now it's being handled in af.cc which
1905                    is the wrong place */
1906 		dclass_id = 0;
1907 	}
1908 
1909 	/* set up parameters to pass in to document class init
1910 	   function */
1911 	dc_init.use_docbuf = 1;
1912 	dc_init.dc_state = NULL;
1913 
1914 	/* for now we hard code calls */
1915 	switch (dclass_id) {
1916 	case 2:
1917 		dc_index.dclass_id = 2;
1918 		result = dc_xml_test_init(&dc_init);
1919 		break;
1920 	case 100:
1921 		dc_index.dclass_id = 100;
1922 		result = dc_erc_init(&dc_init);
1923 		break;
1924 	case 1:
1925 #ifdef ETYMON_AF_XML
1926 		dc_index.dclass_id = 1;
1927 		result = dc_xml_init(&dc_init);
1928 		break;
1929 #endif
1930 	default:
1931 		dc_index.dclass_id = 0;
1932 		result = dc_text_init(&dc_init);
1933 	}
1934 	if (result == -1) {
1935 		free(state->wcache);
1936 		free(state->fcache);
1937 		if (state->number_words) {
1938 			free(state->wncache);
1939 		}
1940 		free(state->pcache_nl);
1941 		close(state->doctable_fd);
1942 		close(state->udict_fd);
1943 		close(state->upost_fd);
1944 		close(state->ufield_fd);
1945 		close(fdef_fd);
1946 		etymon_af_fdef_free_mem(state->fdef_root);
1947 		free(state);
1948 		return -1;
1949 	}
1950 
1951 	use_docbuf = dc_init.use_docbuf;
1952 
1953 	if (use_docbuf) {
1954 		/* set up buffering for input documents */
1955 		docbuf = (ETYMON_DOCBUF*)(malloc(sizeof(ETYMON_DOCBUF)));
1956 		docbuf->buf = NULL; /* first time it will be NULL,
1957 				       because we need to get
1958 				       st_blksize from stat */
1959 	} else {
1960 		docbuf = NULL;
1961 	}
1962 
1963 	/* set up parameters to pass in to document class index function */
1964 	dc_index.docbuf = docbuf;
1965 	dc_index.filename = fn;
1966 	dc_index.split_list = NULL;
1967 	dc_index.dlevel = opt->dlevel;
1968 	dc_index.state = state;
1969 	dc_index.dc_state = dc_init.dc_state;
1970 
1971 	/* get cwd */
1972 	getcwd(cwd, ETYMON_MAX_PATH_SIZE);
1973 
1974 	/* loop through and index each file */
1975 	x_file = 0;
1976 	done_files = 0;
1977 	do {
1978 
1979 		file_good = 1;
1980 
1981 		/* load file into buffer */
1982 		if (opt->_stdin) {
1983 			if (fgets(s_file, ETYMON_MAX_PATH_SIZE, stdin) == NULL) {
1984 				/* need to check fgets more correctly for errors */
1985 				done_files = 1;
1986 				break;
1987 			} else {
1988 				/* remove '\n' at end */
1989 				x = strlen(s_file);
1990 				if ( (x > 1) && (s_file[x - 1] == '\n') ) {
1991 					s_file[x - 1] = '\0';
1992 				}
1993 				source_file = s_file;
1994 			}
1995 		} else {
1996 			if (x_file == opt->sourcen) {
1997 				done_files = 1;
1998 				break;
1999 			} else {
2000 				source_file = opt->source[x_file];
2001 			}
2002 		}
2003 
2004 		if (done_files) {
2005 			break;
2006 		}
2007 
2008 		etymon_index_expand_path(source_file, fn, cwd);
2009 
2010 		if (use_docbuf) {
2011 			docbuf->fn = fn;
2012 			docbuf->filedes = open(docbuf->fn, O_RDONLY | ETYMON_AF_O_LARGEFILE);
2013 			if (docbuf->filedes == -1) {
2014 				/*
2015 				int e;
2016 				char s[ETYMON_MAX_MSG_SIZE];
2017 				sprintf(s, "%s: No such file or directory", docbuf->fn);
2018 				file_good = 0;
2019 				e = opt->log.error(s, 0);
2020 				if (e != 0) {
2021 					exit(e);
2022 				}
2023 				*/
2024 			} else {
2025 
2026 				/* stat the file */
2027 				if (etymon_af_fstat(docbuf->filedes,
2028 						    &(docbuf->st)) == -1) {
2029 					perror("index_add_files():fstat()");
2030 				}
2031 				/* make sure it is a regular file */
2032 				if (S_ISREG(docbuf->st.st_mode) == 0) {
2033 					int e;
2034 					char s[ETYMON_MAX_MSG_SIZE];
2035 					sprintf(s,
2036 						"%s: file not recognized: File format not recognized",
2037 						docbuf->fn);
2038 					file_good = 0;
2039 					close(docbuf->filedes);
2040 					/*
2041 					e = opt->log.error(s, 0);
2042 					if (e != 0) {
2043 						exit(e);
2044 					}
2045 					*/
2046 				}
2047 
2048 			}
2049 		}
2050 
2051 		if (file_good) {
2052 
2053 			state->flushmsg = 0;
2054 
2055 			if (opt->verbose >= 1) {
2056 				if (opt->verbose >= 2) {
2057 					printf("Indexing ");
2058 				}
2059 				printf("%s\n", fn);
2060 			}
2061 
2062 			if (use_docbuf) {
2063 				/* initialize the buffer if it hasn't been done */
2064 				if (docbuf->buf == NULL) {
2065 					docbuf->buf_size = docbuf->st.st_blksize;
2066 					docbuf->buf = (unsigned char*)(malloc(docbuf->buf_size));
2067 				}
2068 				/* continue setting up to load the file */
2069 				docbuf->eof = 0;
2070 				/* read the first page from disk */
2071 				etymon_docbuf_load_page(docbuf);
2072 
2073 				/* ok, the docbuf page has been
2074 				   prepared */
2075 				/* check if we need to split the file
2076 				   into multiple documents */
2077 				if (*(opt->split) != '\0') {
2078 					split_list =
2079 						etymon_af_index_get_split_list(
2080 							&dc_index,
2081 							(char *) opt->split);
2082 					/* reset the docbuf page */
2083 					if (etymon_af_lseek(docbuf->filedes,
2084 							    (etymon_af_off_t)0, SEEK_SET) == -1) {
2085 						perror("index_add_files():lseek()");
2086 						exit(-1);
2087 					}
2088 					docbuf->eof = 0;
2089 					etymon_docbuf_load_page(docbuf);
2090 				} else {
2091 					/* otherwise set up a single
2092 					   node split list, marking
2093 					   the end of the file */
2094 					split_list =
2095 						(ETYMON_AF_DC_SPLIT*)(malloc(
2096 									      sizeof(ETYMON_AF_DC_SPLIT)));
2097 					split_list->end = docbuf->st.st_size;
2098 					split_list->next = NULL;
2099 				}
2100 				dc_index.split_list = split_list;
2101 			}
2102 
2103 			/* here we must call the indexing function in the doctype,
2104 			   handing it a pointer to a struct of call back functions */
2105 			/* for now we hard code calls */
2106 			switch (dclass_id) {
2107 			case 2:
2108 				dc_index.dclass_id = 2;
2109 				result = dc_xml_test_index(&dc_index);
2110 				break;
2111 			case 100:
2112 				dc_index.dclass_id = 100;
2113 				result = dc_erc_index(&dc_index);
2114 				break;
2115 			case 1:
2116 #ifdef ETYMON_AF_XML
2117 				dc_index.dclass_id = 1;
2118 				result = dc_xml_index(&dc_index);
2119 				break;
2120 #endif
2121 			default:
2122 				dc_index.dclass_id = 0;
2123 				result = dc_text_index(&dc_index);
2124 			}
2125 			/* free split list */
2126 			while (split_list) {
2127 				split_p = split_list;
2128 				split_list = split_list->next;
2129 				free(split_p);
2130 			}
2131 			/* check result from document class */
2132 			if (result == -1) {
2133 				free(state->wcache);
2134 				free(state->fcache);
2135 				if (state->number_words) {
2136 					free(state->wncache);
2137 				}
2138 				free(state->pcache_nl);
2139 				if (use_docbuf) {
2140 					if (docbuf->buf != NULL) {
2141 						free(docbuf->buf);
2142 					}
2143 					free(docbuf);
2144 				}
2145 				close(state->doctable_fd);
2146 				close(state->udict_fd);
2147 				close(state->upost_fd);
2148 				close(state->ufield_fd);
2149 				close(fdef_fd);
2150 				etymon_af_fdef_free_mem(state->fdef_root);
2151 				free(state);
2152 				return -1;
2153 			}
2154 
2155 			if (use_docbuf) {
2156 				/* close the document file */
2157 				close(docbuf->filedes);
2158 			}
2159 
2160 		} /* if (file_good) */
2161 
2162 		x_file++;
2163 
2164 	} while (done_files == 0);
2165 
2166 	if (etymon_index_dclass_finish(state) == -1) {
2167 		free(state->wcache);
2168 		free(state->fcache);
2169 		if (state->number_words) {
2170 			free(state->wncache);
2171 		}
2172 		free(state->pcache_nl);
2173 		if (use_docbuf) {
2174 			if (docbuf->buf != NULL) {
2175 				free(docbuf->buf);
2176 			}
2177 			free(docbuf);
2178 		}
2179 		close(state->doctable_fd);
2180 		close(state->udict_fd);
2181 		close(state->upost_fd);
2182 		close(state->ufield_fd);
2183 		close(fdef_fd);
2184 		etymon_af_fdef_free_mem(state->fdef_root);
2185 		free(state);
2186 		return -1;
2187 	}
2188 
2189 	/* write out fdef file */
2190 	/* re-open fdef and overwrite */
2191 	close(fdef_fd);
2192 	etymon_db_construct_path(ETYMON_DBF_FDEF, dbname, fn);
2193 	fdef_fd = open(fn, O_WRONLY | O_CREAT | O_TRUNC | ETYMON_AF_O_LARGEFILE, ETYMON_DB_PERM);
2194 	if (fdef_fd == -1) {
2195 		/* ERROR */
2196 		printf("unable to open %s for read/write\n", fn);
2197 		exit(1);
2198 	}
2199 	etymon_af_fdef_write_mem(fdef_fd, state->fdef_root);
2200 
2201 	/* update dbinfo */
2202 /*
2203 	if (etymon_af_lseek(dbinfo_fd, (etymon_af_off_t)0, SEEK_SET) == -1) {
2204 		perror("index_add_files():lseek()");
2205 	}
2206 	magic = ETYMON_INDEX_MAGIC;
2207 	nbytes = write(dbinfo_fd, &magic, sizeof(Uint4));
2208 	if (nbytes != sizeof(Uint4)) {
2209 		printf("unable to write MN\n");
2210 		exit(1);
2211 	}
2212 	dbinfo.udict_root = state->udict_root;
2213 	dbinfo.doc_n = state->doc_n;
2214 	nbytes = write(dbinfo_fd, &dbinfo, sizeof(ETYMON_DB_INFO));
2215 	if (nbytes != sizeof(ETYMON_DB_INFO)) {
2216 		printf("unable to write DBI\n");
2217 		exit(1);
2218 	}
2219 	close(dbinfo_fd);
2220 */
2221 	dbinfo->udict_root = state->udict_root;
2222 	dbinfo->doc_n = state->doc_n;
2223 
2224 	/* clean up */
2225 	free(state->wcache);
2226 	free(state->fcache);
2227 	if (state->number_words) {
2228 		free(state->wncache);
2229 	}
2230 	free(state->pcache_nl);
2231 	if (use_docbuf) {
2232 		if (docbuf->buf != NULL) {
2233 			free(docbuf->buf);
2234 		}
2235 		free(docbuf);
2236 	}
2237 	close(state->doctable_fd);
2238 	close(state->udict_fd);
2239 	close(state->upost_fd);
2240 	close(state->ufield_fd);
2241 	close(fdef_fd);
2242 	etymon_af_fdef_free_mem(state->fdef_root);
2243 	free(state);
2244 
2245 	/* unlock the database */
2246 	etymon_db_unlock(dbname);
2247 
2248 	return 0;
2249 }
2250 
2251 
etymon_af_index_add_word(ETYMON_AF_INDEX_ADD_WORD * opt)2252 int etymon_af_index_add_word(ETYMON_AF_INDEX_ADD_WORD* opt) {
2253 	ETYMON_INDEX_INDEXING_STATE* state = opt->state;
2254 	int tree_p, comp, field_p, word_number_p;
2255 	int* tree_link;
2256 	int done;
2257 	int full;
2258 
2259 	if (state->verbose >= 5) {
2260 		afprintvp(state->verbose, 5);
2261 		printf("Adding word: \"%s\"\n", (const char *) opt->word);
2262 	}
2263 
2264 	/*
2265 	size_t tmp_len = strlen((const char*)opt->word);
2266 	char tmp_word[1024];
2267 	strcpy(tmp_word, (const char*)opt->word);
2268 	af_stem(opt->word);
2269 	if (strlen((const char*)opt->word) != tmp_len) {
2270 		printf("%s -> ", tmp_word);
2271 		printf("%s\n", (const char*)opt->word);
2272 	}
2273 	*/
2274 	if (state->stemming)
2275 		af_stem(opt->word);
2276 
2277 	/* if any caches are full, then index this block and clear the caches */
2278 	if (state->number_words) {
2279 		full = (state->wcache_count == state->wcache_size) ||
2280 			(state->fcache_count == state->fcache_size) ||
2281 			(state->wncache_count == state->wncache_size);
2282 	} else {
2283 		full = (state->wcache_count == state->wcache_size) ||
2284 			(state->fcache_count == state->fcache_size);
2285 	}
2286 	if (full) {
2287 		if (etymon_index_dclass_index(state) == -1) {
2288 			return -1;
2289 		}
2290 		state->wcache_count = 0;
2291 		state->wcache_root = -1;
2292 		state->fcache_count = 0;
2293 		state->wncache_count = 0;
2294 	}
2295 
2296 	/* add the new word to the cache */
2297 
2298 	/* search the binary tree for a matching word
2299 	   - tree_p will end up the index of the matching node, or -1 if no match exists
2300 	   - tree_link will end up pointing to the parent link (or the root pointer) */
2301 
2302 	tree_link = &(state->wcache_root);
2303 	tree_p = state->wcache_root;
2304 	comp = -1;
2305 	while ( (comp != 0) && (tree_p != -1) ) {
2306 		comp = strcmp((char*)(opt->word), (char*)(state->wcache[tree_p].word));
2307 		if (comp < 0) {
2308 			tree_link = &(state->wcache[tree_p].left);
2309 			tree_p = *tree_link;
2310 		}
2311 		else if (comp > 0) {
2312 			tree_link = &(state->wcache[tree_p].right);
2313 			tree_p = *tree_link;
2314 		}
2315 	}
2316 
2317 	if (tree_p == -1) {
2318 		/* if there was no match, we create a new node */
2319 		memcpy(state->wcache[state->wcache_count].word, opt->word, ETYMON_MAX_WORD_SIZE);
2320 		state->wcache[state->wcache_count].left = -1;
2321 		state->wcache[state->wcache_count].right = -1;
2322 		state->wcache[state->wcache_count].next = state->wcache_count; /* here next points to the tail */
2323 		state->wcache[state->wcache_count].freq = 1;
2324 		state->wcache[state->wcache_count].doc_id = opt->doc_id;
2325 		/* add new node to field cache */
2326 		if (opt->fields[0] != 0) {
2327 			memcpy(state->fcache[state->fcache_count].f, opt->fields, ETYMON_MAX_FIELD_NEST * 2);
2328 			state->fcache[state->fcache_count].next = -1;
2329 			state->wcache[state->wcache_count].fields = state->fcache_count;
2330 			state->fcache_count++;
2331 		} else {
2332 			state->wcache[state->wcache_count].fields = -1;
2333 		}
2334 		/* add new node to word number cache */
2335 		if (state->number_words) {
2336 			state->wncache[state->wncache_count].wn = opt->word_number;
2337 			state->wncache[state->wncache_count].next = -1;
2338 			state->wcache[state->wcache_count].word_numbers_head = state->wncache_count;
2339 			state->wcache[state->wcache_count].word_numbers_tail = state->wncache_count;
2340 			state->wncache_count++;
2341 		} else {
2342 			state->wcache[state->wcache_count].word_numbers_head = -1;
2343 			state->wcache[state->wcache_count].word_numbers_tail = -1;
2344 		}
2345 		/* update parent node in binary tree */
2346 		if (tree_link != NULL) {
2347 			*tree_link = state->wcache_count;
2348 		}
2349 		state->wcache_count++;
2350 	} else {
2351 		/* there was a word match, so now we check if the doc_id's match */
2352 		if (opt->doc_id == state->wcache[tree_p].doc_id) {
2353 			/* doc_id's match, so we simply increment the frequency */
2354 			state->wcache[tree_p].freq++;
2355 			/* now add new node to field cache if there is no matching field */
2356 			if (opt->fields[0] != 0) {
2357 				/* search for a matching field */
2358 				field_p = state->wcache[tree_p].fields;
2359 				done = 0;
2360 				while ( (done == 0) && (field_p != -1) ) {
2361 					if (memcmp(state->fcache[field_p].f, opt->fields,
2362 						   ETYMON_MAX_FIELD_NEST * 2) != 0) {
2363 						field_p = state->fcache[field_p].next;
2364 					} else {
2365 						done = 1;
2366 					}
2367 				}
2368 				if (field_p == -1) {
2369 					/* no match, so add a new field node */
2370 					memcpy(state->fcache[state->fcache_count].f, opt->fields,
2371 					       ETYMON_MAX_FIELD_NEST * 2);
2372 					state->fcache[state->fcache_count].next = state->wcache[tree_p].fields;
2373 					state->wcache[tree_p].fields = state->fcache_count;
2374 					state->fcache_count++;
2375 				}
2376 			}
2377 			/* now add new node to word number cache at end of list */
2378 			if (state->number_words) {
2379 				/* search to the end of the list */
2380 				word_number_p = state->wcache[tree_p].word_numbers_tail;
2381 				/* add a new word number node */
2382 				state->wncache[state->wncache_count].wn = opt->word_number;
2383 				state->wncache[state->wcache[tree_p].word_numbers_tail].next = state->wncache_count;
2384 				state->wncache[state->wncache_count].next = -1;
2385 				state->wcache[tree_p].word_numbers_tail = state->wncache_count;
2386 				state->wncache_count++;
2387 			}
2388 		} else {
2389 			/* doc_id's don't match, so we create a new node in the binary tree */
2390 			memcpy(state->wcache[state->wcache_count].word, opt->word, ETYMON_MAX_WORD_SIZE);
2391 			state->wcache[state->wcache_count].left = state->wcache[tree_p].left;
2392 			state->wcache[state->wcache_count].right = state->wcache[tree_p].right;
2393 			state->wcache[state->wcache_count].next = state->wcache[tree_p].next; /* tail */
2394 			state->wcache[tree_p].next = state->wcache_count; /* to point back to the new node */
2395 			state->wcache[state->wcache_count].freq = 1;
2396 			state->wcache[state->wcache_count].doc_id = opt->doc_id;
2397 			/* add new node to field cache */
2398 			if (opt->fields[0] != 0) {
2399 				memcpy(state->fcache[state->fcache_count].f, opt->fields, ETYMON_MAX_FIELD_NEST * 2);
2400 				state->fcache[state->fcache_count].next = -1;
2401 				state->wcache[state->wcache_count].fields = state->fcache_count;
2402 				state->fcache_count++;
2403 			} else {
2404 				state->wcache[state->wcache_count].fields = -1;
2405 			}
2406 			/* add new node to word number cache */
2407 			if (state->number_words) {
2408 				state->wncache[state->wncache_count].wn = opt->word_number;
2409 				state->wncache[state->wncache_count].next = -1;
2410 				state->wcache[state->wcache_count].word_numbers_head = state->wncache_count;
2411 				state->wcache[state->wcache_count].word_numbers_tail = state->wncache_count;
2412 				state->wncache_count++;
2413 			} else {
2414 				state->wcache[state->wcache_count].word_numbers_head = -1;
2415 				state->wcache[state->wcache_count].word_numbers_head = -1;
2416 			}
2417 			/* update parent node in binary tree */
2418 			if (tree_link != NULL) {
2419 				*tree_link = state->wcache_count;
2420 			}
2421 			state->wcache_count++;
2422 		}
2423 	}
2424 
2425 	return 0;
2426 }
2427 
2428 
etymon_af_index_add_doc(ETYMON_AF_INDEX_ADD_DOC * opt)2429 Uint4 etymon_af_index_add_doc(ETYMON_AF_INDEX_ADD_DOC* opt) {
2430 	ETYMON_INDEX_INDEXING_STATE* state = opt->state;
2431 	ssize_t nbytes;
2432 
2433 	/* fill in doctable entry with new data */
2434 	if (opt->key == NULL) {
2435 		/* fill in default key, based on doctable id */
2436 		/*
2437 		snprintf((char*)(state->doctable.key), ETYMON_MAX_KEY_SIZE, "%ld",
2438 			(unsigned long)(state->doctable_next_id));
2439 		*/
2440 		state->doctable.key[0] = '\0';
2441 	} else {
2442 		strncpy((char*)(state->doctable.key), (char*)(opt->key), ETYMON_MAX_KEY_SIZE - 1);
2443 		state->doctable.key[ETYMON_MAX_KEY_SIZE - 1] = '\0';
2444 	}
2445 	strncpy(state->doctable.filename, opt->filename, ETYMON_MAX_PATH_SIZE - 1);
2446 	state->doctable.filename[ETYMON_MAX_PATH_SIZE - 1] = '\0';
2447 	state->doctable.begin = opt->begin;
2448 	state->doctable.end = opt->end;
2449 	state->doctable.parent = opt->parent;
2450 	state->doctable.dclass_id = opt->dclass_id;
2451 	state->doctable.deleted = 0;
2452 	/* write out doctable entry */
2453 	nbytes = write(state->doctable_fd, &(state->doctable), sizeof(ETYMON_DOCTABLE));
2454 	if (nbytes != sizeof(ETYMON_DOCTABLE)) {
2455 		/* ERROR */
2456 		printf("error writing to file in etymon_index_dclass_add_doc\n");
2457 		exit(1);
2458 	}
2459 	/* increment count of total number of (non-deleted) documents in database */
2460 	state->doc_n++;
2461 	/* increment next counter */
2462 	return state->doctable_next_id++;
2463 }
2464 
2465 
2466 /* need to change this function prototype to conform to other document
2467    class call-backs */
etymon_index_dclass_get_next_doc_id(ETYMON_INDEX_INDEXING_STATE * state)2468 Uint4 etymon_index_dclass_get_next_doc_id(ETYMON_INDEX_INDEXING_STATE* state) {
2469 	return state->doctable_next_id;
2470 }
2471