1 /*
2  *  Copyright (C) 1999-2004 Etymon Systems, Inc.
3  *
4  *  Authors:  Nassib Nassar
5  */
6 
7 #include <stdio.h>
8 #include <string.h>
9 #include "linear.h"
10 #include "lock.h"
11 #include "util.h"
12 #include "info.h"
13 #include "linbuf.h"
14 
15 typedef struct {
16 	const Aflinear *rq;
17 	int postx;
18 	Affile f;
19 	Dbinfo info;
20 	off_t udictp;
21 	off_t udictnb;
22 	Uint4 upostp;
23 	off_t upostn;
24 	Uint4 ufieldp;
25 	off_t ufieldn;
26 	Uint4 uwordp;
27 	off_t uwordn;
28 	off_t lpostn;
29 	off_t lfieldn;
30 	off_t lwordn;
31 	Uint4 lpostpsave;
32 	Uint4 lfieldpsave;
33 	Uint4 fieldc;
34 	Uint4 lwordpsave;
35 	Uint4 wordc;
36 	ETYMON_INDEX_PAGE_L pagel;
37 	ETYMON_INDEX_UPOST upost;
38 	ETYMON_INDEX_LPOST lpost;
39 	ETYMON_INDEX_UFIELD ufield;
40 	ETYMON_INDEX_LFIELD lfield;
41 	ETYMON_INDEX_UWORD uword;
42 	ETYMON_INDEX_LWORD lword;
43 } Aflinst;
44 
getlock(const char * db)45 static int getlock(const char *db)
46 {
47 	if (!etymon_db_ready(db))
48 		return aferr(AFEDBLOCK);
49 	if (etymon_db_lock(db, NULL) < 0)
50 		return -1;
51 
52 	return 0;
53 }
54 
freelock(const char * db)55 static int freelock(const char *db)
56 {
57 	etymon_db_unlock(db);
58 
59 	return 0;
60 }
61 
openfiles(const char * db,Affile * f)62 static int openfiles(const char *db, Affile *f)
63 {
64 	memset(f, 0, sizeof f);
65 	if (!(f->info = afopendbf(db, AFFTINFO, "r+b")))
66 		return -1;
67 	if (!(f->udict = afopendbf(db, AFFTUDICT, "r+b")))
68 		return -1;
69 	if (!(f->upost = afopendbf(db, AFFTUPOST, "rb")))
70 		return -1;
71 	if (!(f->ufield = afopendbf(db, AFFTUFIELD, "rb")))
72 		return -1;
73 	if (!(f->uword = afopendbf(db, AFFTUWORD, "rb")))
74 		return -1;
75 	if (!(f->lpost = afopendbf(db, AFFTLPOST, "ab")))
76 		return -1;
77 	if (!(f->lfield = afopendbf(db, AFFTLFIELD, "ab")))
78 		return -1;
79 	if (!(f->lword = afopendbf(db, AFFTLWORD, "ab")))
80 		return -1;
81 	return 0;
82 }
83 
closefile(FILE * f)84 static int closefile(FILE *f)
85 {
86 	if (f) {
87 		if (fclose(f) != 0)
88 			return aferr(AFEDBIO);
89 	}
90 
91 	return 0;
92 }
93 
truncfile(const char * db,int type)94 static int truncfile(const char *db, int type)
95 {
96 	FILE *f;
97 
98 	if (!(f = afopendbf(db, type, "wb")))
99 		return aferr(AFEDBIO);
100 	if (fclose(f) != 0)
101 		return aferr(AFEDBIO);
102 
103 	return 0;
104 }
105 
truncufiles(const char * db)106 static int truncufiles(const char *db)
107 {
108 	if (truncfile(db, AFFTUPOST) < 0)
109 		return -1;
110 	if (truncfile(db, AFFTUFIELD) < 0)
111 		return -1;
112 	if (truncfile(db, AFFTUFIELD) < 0)
113 		return -1;
114 
115 	return 0;
116 }
117 
closefiles(Affile * f)118 static int closefiles(Affile *f)
119 {
120 	if (closefile(f->info) < 0)
121 		return -1;
122 	if (closefile(f->udict) < 0)
123 		return -1;
124 	if (closefile(f->upost) < 0)
125 		return -1;
126 	if (closefile(f->ufield) < 0)
127 		return -1;
128 	if (closefile(f->uword) < 0)
129 		return -1;
130 	if (closefile(f->lpost) < 0)
131 		return -1;
132 	if (closefile(f->lfield) < 0)
133 		return -1;
134 	if (closefile(f->lword) < 0)
135 		return -1;
136 
137 	return 0;
138 }
139 
getfsizes(Aflinst * t)140 static int getfsizes(Aflinst *t)
141 {
142 	off_t nb;
143 
144 	if (afgetfsize(t->f.udict, &t->udictnb) < 0)
145 		return -1;
146 
147 	if (afgetfsize(t->f.upost, &nb) < 0)
148 		return -1;
149 	t->upostn = nb / sizeof (ETYMON_INDEX_UPOST);
150 	if (afgetfsize(t->f.ufield, &nb) < 0)
151 		return -1;
152 	t->ufieldn = nb / sizeof (ETYMON_INDEX_UFIELD);
153 	if (afgetfsize(t->f.uword, &nb) < 0)
154 		return -1;
155 	t->uwordn = nb / sizeof (ETYMON_INDEX_UWORD);
156 	if (afgetfsize(t->f.lpost, &nb) < 0)
157 		return -1;
158 	t->lpostn = nb / sizeof (ETYMON_INDEX_LPOST);
159 	if (afgetfsize(t->f.lfield, &nb) < 0)
160 		return -1;
161 	t->lfieldn = nb / sizeof (ETYMON_INDEX_LFIELD);
162 	if (afgetfsize(t->f.lword, &nb) < 0)
163 		return -1;
164 	t->lwordn = nb / sizeof (ETYMON_INDEX_LWORD);
165 
166 	return 0;
167 }
168 
seekleftleaf(Aflinst * t)169 static int seekleftleaf(Aflinst *t)
170 {
171 	Uint1 leaf;
172 	ETYMON_INDEX_PAGE_NL pagenl;
173 
174 	t->udictp = t->info.udict_root;
175 	while (1) {
176 		if (fseeko(t->f.udict, (off_t) t->udictp, SEEK_SET) < 0)
177 			return aferr(AFEDBIO);
178 		if (fread(&leaf, 1, 1, t->f.udict) < 0)
179 			return aferr(AFEDBIO);
180 		if (leaf)
181 			return 0;
182 		if (fread(&pagenl, 1, sizeof pagenl, t->f.udict) < sizeof pagenl)
183 			return aferr(AFEDBIO);
184 		t->udictp = pagenl.p[0];
185 	}
186 }
187 
readpagel(Aflinst * t)188 static int readpagel(Aflinst *t)
189 {
190 	if (fseeko(t->f.udict, (off_t) (t->udictp + 1), SEEK_SET) < 0)
191 		return aferr(AFEDBIO);
192 	if (fread(&(t->pagel), 1, sizeof t->pagel, t->f.udict) < sizeof t->pagel)
193 		return aferr(AFEDBIO);
194 
195 	return 0;
196 }
197 
readupost(Aflinst * t)198 static int readupost(Aflinst *t)
199 {
200 	if (t->rq->nobuffer) {
201 		if (fseeko(t->f.upost,
202 			   (off_t) ( ((off_t) (t->upostp - 1)) *
203 				     ((off_t) (sizeof t->upost)) ),
204 			   SEEK_SET) < 0)
205 			return aferr(AFEDBIO);
206 		if (fread(&(t->upost), 1, sizeof t->upost, t->f.upost) < sizeof t->upost)
207 			return aferr(AFEDBIO);
208 	} else {
209 		if (aflinread(&(t->upost),
210 			      (off_t) ( ((off_t) (t->upostp - 1)) *
211 					((off_t) (sizeof t->upost)) ),
212 			      sizeof t->upost) < 0)
213 			return -1;
214 	}
215 
216 	return 0;
217 }
218 
readufield(Aflinst * t)219 static int readufield(Aflinst *t)
220 {
221 	if (fseeko(t->f.ufield,
222 		   (off_t) ( ((off_t) (t->ufieldp - 1)) *
223 			     ((off_t) (sizeof t->ufield)) ),
224 		   SEEK_SET) < 0)
225 		return aferr(AFEDBIO);
226 	if (fread(&(t->ufield), 1, sizeof t->ufield, t->f.ufield) <
227 	    sizeof t->ufield)
228 		return aferr(AFEDBIO);
229 
230 	return 0;
231 }
232 
writelfield(Aflinst * t)233 static int writelfield(Aflinst *t)
234 {
235 	if (fwrite(&(t->lfield), 1, sizeof t->lfield, t->f.lfield) <
236 	    sizeof t->lfield)
237 		return aferr(AFEDBIO);
238 
239 	return 0;
240 }
241 
linfield(Aflinst * t)242 static int linfield(Aflinst *t)
243 {
244 	/* do we need to look for duplicates?? */
245 	t->lfieldpsave = t->lfieldn + 1;
246 	t->fieldc = 0;
247 	t->ufieldp = t->upost.fields;
248 	while (t->ufieldp) {
249 		t->fieldc++;
250 		if (readufield(t) < 0)
251 			return -1;
252 		memcpy(t->lfield.fields, t->ufield.fields,
253 		       ETYMON_MAX_FIELD_NEST * 2);
254 		if (writelfield(t) < 0)
255 			return -1;
256 		(t->lfieldn)++;
257 		t->ufieldp = t->ufield.next;
258 	}
259 
260 	return 0;
261 }
262 
readuword(Aflinst * t)263 static int readuword(Aflinst *t)
264 {
265 	if (fseeko(t->f.uword,
266 		   (off_t) ( ((off_t) (t->uwordp - 1)) *
267 			     ((off_t) (sizeof t->uword)) ),
268 		   SEEK_SET) < 0)
269 		return aferr(AFEDBIO);
270 	if (fread(&(t->uword), 1, sizeof t->uword, t->f.uword) <
271 	    sizeof t->uword)
272 		return aferr(AFEDBIO);
273 
274 	return 0;
275 }
276 
writelword(Aflinst * t)277 static int writelword(Aflinst *t)
278 {
279 	if (fwrite(&(t->lword), 1, sizeof t->lword, t->f.lword) <
280 	    sizeof t->lword)
281 		return aferr(AFEDBIO);
282 
283 	return 0;
284 }
285 
linwn(Aflinst * t)286 static int linwn(Aflinst *t)
287 {
288 	t->lwordpsave = t->lwordn + 1;
289 	t->wordc = 0;
290 	t->uwordp = t->upost.word_numbers;
291 	while (t->uwordp) {
292 		t->wordc++;
293 		if (readuword(t) < 0)
294 			return -1;
295 		t->lword.wn = t->uword.wn;
296 		if (writelword(t) < 0)
297 			return -1;
298 		(t->lwordn)++;
299 		t->uwordp = t->uword.next;
300 	}
301 
302 	return 0;
303 }
304 
writelpost(Aflinst * t)305 static int writelpost(Aflinst *t)
306 {
307 	if (fwrite(&(t->lpost), 1, sizeof t->lpost, t->f.lpost) <
308 	    sizeof t->lpost)
309 		return aferr(AFEDBIO);
310 
311 	return 0;
312 }
313 
updatelpost(Aflinst * t)314 static int updatelpost(Aflinst *t)
315 {
316 	/* compare the doc_id with our cached lpost */
317 	if (t->upost.doc_id == t->lpost.doc_id) {
318 		/* increment the frequency and field count */
319 		t->lpost.freq += t->upost.freq;
320 		t->lpost.fields_n += t->fieldc;
321 		t->lpost.word_numbers_n += t->wordc;
322 	} else {
323 		/* flush lpost */
324 		/* only flush if lpost contains something */
325 		if (t->lpost.doc_id) {
326 			if (writelpost(t) < 0)
327 				return -1;
328 			(t->lpostn)++;
329 			(t->pagel.post_n[t->postx])++;
330 		}
331 		/* replace lpost with upost */
332 		t->lpost.doc_id = t->upost.doc_id;
333 		t->lpost.freq = t->upost.freq;
334 		t->lpost.fields_n = t->fieldc;
335 		t->lpost.word_numbers_n = t->wordc;
336 		/* set field pointer */
337 		t->lpost.fields = t->lfieldpsave;
338 		t->lpost.word_numbers = t->lwordpsave;
339 	}
340 
341 	return 0;
342 }
343 
writepagel(Aflinst * t)344 static int writepagel(Aflinst *t)
345 {
346 	if (fseeko(t->f.udict, (off_t) (t->udictp + 1), SEEK_SET) < 0)
347 		return aferr(AFEDBIO);
348 	if (fwrite(&(t->pagel), 1, sizeof t->pagel, t->f.udict) <
349 	    sizeof t->pagel)
350 		return aferr(AFEDBIO);
351 
352 	return 0;
353 }
354 
debugpostings(Aflinst * t)355 static void debugpostings(Aflinst *t)
356 {
357 	if (t->rq->verbose >= 6) {
358 		int x;
359 		afprintvp(t->rq->verbose, 6);
360 		printf("Read postings (key=\"");
361 		for (x = t->pagel.offset[t->postx];
362 		     x < t->pagel.offset[t->postx + 1];
363 		     x++)
364 			printf("%c", (char) t->pagel.keys[x]);
365 		printf("\")\n");
366 	}
367 }
368 
linpost(Aflinst * t)369 static int linpost(Aflinst *t)
370 {
371 	for (t->postx = 0; t->postx < t->pagel.n; (t->postx)++) {
372 		debugpostings(t);
373 		t->pagel.post_n[t->postx] = 0;
374 		t->lpostpsave = t->lpostn + 1;
375 		t->upostp = t->pagel.post[t->postx];
376 		t->lpost.doc_id = 0;
377 		while (t->upostp) {
378 			afprintv(t->rq->verbose, 6, "Read posting");
379 			if (readupost(t) < 0)
380 				return -1;
381 			afprintv(t->rq->verbose, 6, "Linearize fields");
382 			if (linfield(t) < 0)
383 				return -1;
384 			afprintv(t->rq->verbose, 6, "Linearize word numbers");
385 			if (linwn(t) < 0)
386 				return -1;
387 			if (updatelpost(t) < 0)
388 				return -1;
389 			t->upostp = t->upost.next;
390 		}
391 		/* flush lpost if it contains something */
392 		if (t->lpost.doc_id) {
393 			if (writelpost(t) < 0)
394 				return -1;
395 			(t->lpostn)++;
396 			(t->pagel.post_n[t->postx])++;
397 		}
398 		t->pagel.post[t->postx] = t->lpostpsave;
399 	}
400 	if (writepagel(t) < 0)
401 		return -1;
402 	t->udictp = t->pagel.next;
403 
404 	return 0;
405 }
406 
debugpagel(Aflinst * t)407 static void debugpagel(Aflinst *t)
408 {
409 	if (t->rq->verbose >= 6) {
410 		afprintvp(t->rq->verbose, 6);
411 		printf("Leaf node: keys=\"%s\"\n",
412 		       (char *) t->pagel.keys);
413 	}
414 }
415 
416 /* this was written before the advent of ETYMON_INDEX_PAGE_L.post_n[],
417    ETYMON_INDEX_UPOST.fields_n, and ETYMON_INDEX_UPOST.word_numbers_n
418    in the first unlinearized pass; so it explicitly counts these
419    values while building the linear structures */
linearize(Aflinst * t)420 static int linearize(Aflinst *t)
421 {
422 	afprintv(t->rq->verbose, 5, "Seek to leftmost leaf node");
423 	if (seekleftleaf(t) < 0)
424 		return -1;
425 	do {
426 		afprintv(t->rq->verbose, 5, "Read leaf node");
427 		if (readpagel(t) < 0)
428 			return -1;
429 		debugpagel(t);
430 		afprintv(t->rq->verbose, 5, "Linearize postings");
431 		if (linpost(t) < 0)
432 			return -1;
433 	} while (t->udictp);
434 
435 	return 0;
436 }
437 
linopen(Aflinst * t)438 static int linopen(Aflinst *t)
439 {
440 	afprintv(t->rq->verbose, 4, "Locking database");
441 	if (getlock(t->rq->db) < 0)
442 		return -1;
443 
444 	afprintv(t->rq->verbose, 4, "Opening database files");
445 	if (openfiles(t->rq->db, &(t->f)) < 0)
446 		return -1;
447 
448 	afprintv(t->rq->verbose, 4, "Reading database information");
449 	if (afreadinfo(t->f.info, &(t->info)) < 0)
450 		return -1;
451 
452 	afprintv(t->rq->verbose, 4, "Checking file sizes");
453 	if (getfsizes(t) < 0)
454 		return -1;
455 
456 	if (!t->rq->nobuffer) {
457 		if (aflinbuf(t->f.upost, t->rq->memory) < 0)
458 			return -1;
459 	}
460 
461 	return 0;
462 }
463 
linclose(Aflinst * t)464 static int linclose(Aflinst *t)
465 {
466 	t->info.optimized = 1;
467 
468 	afprintv(t->rq->verbose, 4, "Writing database information");
469 	if (afwriteinfo(t->f.info, &(t->info)) < 0)
470 		return -1;
471 
472 	afprintv(t->rq->verbose, 4, "Closing database files");
473 	if (closefiles(&(t->f)) < 0)
474 		return -1;
475 
476 	afprintv(t->rq->verbose, 4, "Truncating unlinearized files");
477 	if (truncufiles(t->rq->db) < 0)
478 		return -1;
479 
480 	afprintv(t->rq->verbose, 4, "Unlocking database");
481 	if (freelock(t->rq->db) < 0)
482 		return -1;
483 
484 	return 0;
485 }
486 
_aflinear(const Aflinear * rq)487 int _aflinear(const Aflinear *rq)
488 {
489 	Aflinst t;
490 
491 	t.rq = rq;
492 	afprintv(rq->verbose, 2, "Linearizing");
493 
494 	afprintv(rq->verbose, 3, "Opening database");
495 	if (linopen(&t) < 0)
496 		return -1;
497 	afprintv(rq->verbose, 4, "Checking if database is linearized");
498 	/* exit if db is already linearized */
499 	if (t.info.optimized) {
500 		if (afclosedbf(&t.f) < 0)
501 			return -1;
502 		if (freelock(t.rq->db) < 0)
503 			return -1;
504 		return aferr(AFELINEAR);
505 	}
506 
507 	afprintv(rq->verbose, 3, "Performing linearize process");
508 	if (linearize(&t) < 0)
509 		return -1;
510 
511 	afprintv(rq->verbose, 3, "Closing database");
512 	if (linclose(&t) < 0)
513 		return -1;
514 
515 	return 0;
516 }
517