1 /*
2 * Copyright (C) 1999-2004 Etymon Systems, Inc.
3 *
4 * Authors: Nassib Nassar
5 */
6
7 #include <stdio.h>
8 #include <string.h>
9 #include "linear.h"
10 #include "lock.h"
11 #include "util.h"
12 #include "info.h"
13 #include "linbuf.h"
14
15 typedef struct {
16 const Aflinear *rq;
17 int postx;
18 Affile f;
19 Dbinfo info;
20 off_t udictp;
21 off_t udictnb;
22 Uint4 upostp;
23 off_t upostn;
24 Uint4 ufieldp;
25 off_t ufieldn;
26 Uint4 uwordp;
27 off_t uwordn;
28 off_t lpostn;
29 off_t lfieldn;
30 off_t lwordn;
31 Uint4 lpostpsave;
32 Uint4 lfieldpsave;
33 Uint4 fieldc;
34 Uint4 lwordpsave;
35 Uint4 wordc;
36 ETYMON_INDEX_PAGE_L pagel;
37 ETYMON_INDEX_UPOST upost;
38 ETYMON_INDEX_LPOST lpost;
39 ETYMON_INDEX_UFIELD ufield;
40 ETYMON_INDEX_LFIELD lfield;
41 ETYMON_INDEX_UWORD uword;
42 ETYMON_INDEX_LWORD lword;
43 } Aflinst;
44
getlock(const char * db)45 static int getlock(const char *db)
46 {
47 if (!etymon_db_ready(db))
48 return aferr(AFEDBLOCK);
49 if (etymon_db_lock(db, NULL) < 0)
50 return -1;
51
52 return 0;
53 }
54
freelock(const char * db)55 static int freelock(const char *db)
56 {
57 etymon_db_unlock(db);
58
59 return 0;
60 }
61
openfiles(const char * db,Affile * f)62 static int openfiles(const char *db, Affile *f)
63 {
64 memset(f, 0, sizeof f);
65 if (!(f->info = afopendbf(db, AFFTINFO, "r+b")))
66 return -1;
67 if (!(f->udict = afopendbf(db, AFFTUDICT, "r+b")))
68 return -1;
69 if (!(f->upost = afopendbf(db, AFFTUPOST, "rb")))
70 return -1;
71 if (!(f->ufield = afopendbf(db, AFFTUFIELD, "rb")))
72 return -1;
73 if (!(f->uword = afopendbf(db, AFFTUWORD, "rb")))
74 return -1;
75 if (!(f->lpost = afopendbf(db, AFFTLPOST, "ab")))
76 return -1;
77 if (!(f->lfield = afopendbf(db, AFFTLFIELD, "ab")))
78 return -1;
79 if (!(f->lword = afopendbf(db, AFFTLWORD, "ab")))
80 return -1;
81 return 0;
82 }
83
closefile(FILE * f)84 static int closefile(FILE *f)
85 {
86 if (f) {
87 if (fclose(f) != 0)
88 return aferr(AFEDBIO);
89 }
90
91 return 0;
92 }
93
truncfile(const char * db,int type)94 static int truncfile(const char *db, int type)
95 {
96 FILE *f;
97
98 if (!(f = afopendbf(db, type, "wb")))
99 return aferr(AFEDBIO);
100 if (fclose(f) != 0)
101 return aferr(AFEDBIO);
102
103 return 0;
104 }
105
truncufiles(const char * db)106 static int truncufiles(const char *db)
107 {
108 if (truncfile(db, AFFTUPOST) < 0)
109 return -1;
110 if (truncfile(db, AFFTUFIELD) < 0)
111 return -1;
112 if (truncfile(db, AFFTUFIELD) < 0)
113 return -1;
114
115 return 0;
116 }
117
closefiles(Affile * f)118 static int closefiles(Affile *f)
119 {
120 if (closefile(f->info) < 0)
121 return -1;
122 if (closefile(f->udict) < 0)
123 return -1;
124 if (closefile(f->upost) < 0)
125 return -1;
126 if (closefile(f->ufield) < 0)
127 return -1;
128 if (closefile(f->uword) < 0)
129 return -1;
130 if (closefile(f->lpost) < 0)
131 return -1;
132 if (closefile(f->lfield) < 0)
133 return -1;
134 if (closefile(f->lword) < 0)
135 return -1;
136
137 return 0;
138 }
139
getfsizes(Aflinst * t)140 static int getfsizes(Aflinst *t)
141 {
142 off_t nb;
143
144 if (afgetfsize(t->f.udict, &t->udictnb) < 0)
145 return -1;
146
147 if (afgetfsize(t->f.upost, &nb) < 0)
148 return -1;
149 t->upostn = nb / sizeof (ETYMON_INDEX_UPOST);
150 if (afgetfsize(t->f.ufield, &nb) < 0)
151 return -1;
152 t->ufieldn = nb / sizeof (ETYMON_INDEX_UFIELD);
153 if (afgetfsize(t->f.uword, &nb) < 0)
154 return -1;
155 t->uwordn = nb / sizeof (ETYMON_INDEX_UWORD);
156 if (afgetfsize(t->f.lpost, &nb) < 0)
157 return -1;
158 t->lpostn = nb / sizeof (ETYMON_INDEX_LPOST);
159 if (afgetfsize(t->f.lfield, &nb) < 0)
160 return -1;
161 t->lfieldn = nb / sizeof (ETYMON_INDEX_LFIELD);
162 if (afgetfsize(t->f.lword, &nb) < 0)
163 return -1;
164 t->lwordn = nb / sizeof (ETYMON_INDEX_LWORD);
165
166 return 0;
167 }
168
seekleftleaf(Aflinst * t)169 static int seekleftleaf(Aflinst *t)
170 {
171 Uint1 leaf;
172 ETYMON_INDEX_PAGE_NL pagenl;
173
174 t->udictp = t->info.udict_root;
175 while (1) {
176 if (fseeko(t->f.udict, (off_t) t->udictp, SEEK_SET) < 0)
177 return aferr(AFEDBIO);
178 if (fread(&leaf, 1, 1, t->f.udict) < 0)
179 return aferr(AFEDBIO);
180 if (leaf)
181 return 0;
182 if (fread(&pagenl, 1, sizeof pagenl, t->f.udict) < sizeof pagenl)
183 return aferr(AFEDBIO);
184 t->udictp = pagenl.p[0];
185 }
186 }
187
readpagel(Aflinst * t)188 static int readpagel(Aflinst *t)
189 {
190 if (fseeko(t->f.udict, (off_t) (t->udictp + 1), SEEK_SET) < 0)
191 return aferr(AFEDBIO);
192 if (fread(&(t->pagel), 1, sizeof t->pagel, t->f.udict) < sizeof t->pagel)
193 return aferr(AFEDBIO);
194
195 return 0;
196 }
197
readupost(Aflinst * t)198 static int readupost(Aflinst *t)
199 {
200 if (t->rq->nobuffer) {
201 if (fseeko(t->f.upost,
202 (off_t) ( ((off_t) (t->upostp - 1)) *
203 ((off_t) (sizeof t->upost)) ),
204 SEEK_SET) < 0)
205 return aferr(AFEDBIO);
206 if (fread(&(t->upost), 1, sizeof t->upost, t->f.upost) < sizeof t->upost)
207 return aferr(AFEDBIO);
208 } else {
209 if (aflinread(&(t->upost),
210 (off_t) ( ((off_t) (t->upostp - 1)) *
211 ((off_t) (sizeof t->upost)) ),
212 sizeof t->upost) < 0)
213 return -1;
214 }
215
216 return 0;
217 }
218
readufield(Aflinst * t)219 static int readufield(Aflinst *t)
220 {
221 if (fseeko(t->f.ufield,
222 (off_t) ( ((off_t) (t->ufieldp - 1)) *
223 ((off_t) (sizeof t->ufield)) ),
224 SEEK_SET) < 0)
225 return aferr(AFEDBIO);
226 if (fread(&(t->ufield), 1, sizeof t->ufield, t->f.ufield) <
227 sizeof t->ufield)
228 return aferr(AFEDBIO);
229
230 return 0;
231 }
232
writelfield(Aflinst * t)233 static int writelfield(Aflinst *t)
234 {
235 if (fwrite(&(t->lfield), 1, sizeof t->lfield, t->f.lfield) <
236 sizeof t->lfield)
237 return aferr(AFEDBIO);
238
239 return 0;
240 }
241
linfield(Aflinst * t)242 static int linfield(Aflinst *t)
243 {
244 /* do we need to look for duplicates?? */
245 t->lfieldpsave = t->lfieldn + 1;
246 t->fieldc = 0;
247 t->ufieldp = t->upost.fields;
248 while (t->ufieldp) {
249 t->fieldc++;
250 if (readufield(t) < 0)
251 return -1;
252 memcpy(t->lfield.fields, t->ufield.fields,
253 ETYMON_MAX_FIELD_NEST * 2);
254 if (writelfield(t) < 0)
255 return -1;
256 (t->lfieldn)++;
257 t->ufieldp = t->ufield.next;
258 }
259
260 return 0;
261 }
262
readuword(Aflinst * t)263 static int readuword(Aflinst *t)
264 {
265 if (fseeko(t->f.uword,
266 (off_t) ( ((off_t) (t->uwordp - 1)) *
267 ((off_t) (sizeof t->uword)) ),
268 SEEK_SET) < 0)
269 return aferr(AFEDBIO);
270 if (fread(&(t->uword), 1, sizeof t->uword, t->f.uword) <
271 sizeof t->uword)
272 return aferr(AFEDBIO);
273
274 return 0;
275 }
276
writelword(Aflinst * t)277 static int writelword(Aflinst *t)
278 {
279 if (fwrite(&(t->lword), 1, sizeof t->lword, t->f.lword) <
280 sizeof t->lword)
281 return aferr(AFEDBIO);
282
283 return 0;
284 }
285
linwn(Aflinst * t)286 static int linwn(Aflinst *t)
287 {
288 t->lwordpsave = t->lwordn + 1;
289 t->wordc = 0;
290 t->uwordp = t->upost.word_numbers;
291 while (t->uwordp) {
292 t->wordc++;
293 if (readuword(t) < 0)
294 return -1;
295 t->lword.wn = t->uword.wn;
296 if (writelword(t) < 0)
297 return -1;
298 (t->lwordn)++;
299 t->uwordp = t->uword.next;
300 }
301
302 return 0;
303 }
304
writelpost(Aflinst * t)305 static int writelpost(Aflinst *t)
306 {
307 if (fwrite(&(t->lpost), 1, sizeof t->lpost, t->f.lpost) <
308 sizeof t->lpost)
309 return aferr(AFEDBIO);
310
311 return 0;
312 }
313
updatelpost(Aflinst * t)314 static int updatelpost(Aflinst *t)
315 {
316 /* compare the doc_id with our cached lpost */
317 if (t->upost.doc_id == t->lpost.doc_id) {
318 /* increment the frequency and field count */
319 t->lpost.freq += t->upost.freq;
320 t->lpost.fields_n += t->fieldc;
321 t->lpost.word_numbers_n += t->wordc;
322 } else {
323 /* flush lpost */
324 /* only flush if lpost contains something */
325 if (t->lpost.doc_id) {
326 if (writelpost(t) < 0)
327 return -1;
328 (t->lpostn)++;
329 (t->pagel.post_n[t->postx])++;
330 }
331 /* replace lpost with upost */
332 t->lpost.doc_id = t->upost.doc_id;
333 t->lpost.freq = t->upost.freq;
334 t->lpost.fields_n = t->fieldc;
335 t->lpost.word_numbers_n = t->wordc;
336 /* set field pointer */
337 t->lpost.fields = t->lfieldpsave;
338 t->lpost.word_numbers = t->lwordpsave;
339 }
340
341 return 0;
342 }
343
writepagel(Aflinst * t)344 static int writepagel(Aflinst *t)
345 {
346 if (fseeko(t->f.udict, (off_t) (t->udictp + 1), SEEK_SET) < 0)
347 return aferr(AFEDBIO);
348 if (fwrite(&(t->pagel), 1, sizeof t->pagel, t->f.udict) <
349 sizeof t->pagel)
350 return aferr(AFEDBIO);
351
352 return 0;
353 }
354
debugpostings(Aflinst * t)355 static void debugpostings(Aflinst *t)
356 {
357 if (t->rq->verbose >= 6) {
358 int x;
359 afprintvp(t->rq->verbose, 6);
360 printf("Read postings (key=\"");
361 for (x = t->pagel.offset[t->postx];
362 x < t->pagel.offset[t->postx + 1];
363 x++)
364 printf("%c", (char) t->pagel.keys[x]);
365 printf("\")\n");
366 }
367 }
368
linpost(Aflinst * t)369 static int linpost(Aflinst *t)
370 {
371 for (t->postx = 0; t->postx < t->pagel.n; (t->postx)++) {
372 debugpostings(t);
373 t->pagel.post_n[t->postx] = 0;
374 t->lpostpsave = t->lpostn + 1;
375 t->upostp = t->pagel.post[t->postx];
376 t->lpost.doc_id = 0;
377 while (t->upostp) {
378 afprintv(t->rq->verbose, 6, "Read posting");
379 if (readupost(t) < 0)
380 return -1;
381 afprintv(t->rq->verbose, 6, "Linearize fields");
382 if (linfield(t) < 0)
383 return -1;
384 afprintv(t->rq->verbose, 6, "Linearize word numbers");
385 if (linwn(t) < 0)
386 return -1;
387 if (updatelpost(t) < 0)
388 return -1;
389 t->upostp = t->upost.next;
390 }
391 /* flush lpost if it contains something */
392 if (t->lpost.doc_id) {
393 if (writelpost(t) < 0)
394 return -1;
395 (t->lpostn)++;
396 (t->pagel.post_n[t->postx])++;
397 }
398 t->pagel.post[t->postx] = t->lpostpsave;
399 }
400 if (writepagel(t) < 0)
401 return -1;
402 t->udictp = t->pagel.next;
403
404 return 0;
405 }
406
debugpagel(Aflinst * t)407 static void debugpagel(Aflinst *t)
408 {
409 if (t->rq->verbose >= 6) {
410 afprintvp(t->rq->verbose, 6);
411 printf("Leaf node: keys=\"%s\"\n",
412 (char *) t->pagel.keys);
413 }
414 }
415
416 /* this was written before the advent of ETYMON_INDEX_PAGE_L.post_n[],
417 ETYMON_INDEX_UPOST.fields_n, and ETYMON_INDEX_UPOST.word_numbers_n
418 in the first unlinearized pass; so it explicitly counts these
419 values while building the linear structures */
linearize(Aflinst * t)420 static int linearize(Aflinst *t)
421 {
422 afprintv(t->rq->verbose, 5, "Seek to leftmost leaf node");
423 if (seekleftleaf(t) < 0)
424 return -1;
425 do {
426 afprintv(t->rq->verbose, 5, "Read leaf node");
427 if (readpagel(t) < 0)
428 return -1;
429 debugpagel(t);
430 afprintv(t->rq->verbose, 5, "Linearize postings");
431 if (linpost(t) < 0)
432 return -1;
433 } while (t->udictp);
434
435 return 0;
436 }
437
linopen(Aflinst * t)438 static int linopen(Aflinst *t)
439 {
440 afprintv(t->rq->verbose, 4, "Locking database");
441 if (getlock(t->rq->db) < 0)
442 return -1;
443
444 afprintv(t->rq->verbose, 4, "Opening database files");
445 if (openfiles(t->rq->db, &(t->f)) < 0)
446 return -1;
447
448 afprintv(t->rq->verbose, 4, "Reading database information");
449 if (afreadinfo(t->f.info, &(t->info)) < 0)
450 return -1;
451
452 afprintv(t->rq->verbose, 4, "Checking file sizes");
453 if (getfsizes(t) < 0)
454 return -1;
455
456 if (!t->rq->nobuffer) {
457 if (aflinbuf(t->f.upost, t->rq->memory) < 0)
458 return -1;
459 }
460
461 return 0;
462 }
463
linclose(Aflinst * t)464 static int linclose(Aflinst *t)
465 {
466 t->info.optimized = 1;
467
468 afprintv(t->rq->verbose, 4, "Writing database information");
469 if (afwriteinfo(t->f.info, &(t->info)) < 0)
470 return -1;
471
472 afprintv(t->rq->verbose, 4, "Closing database files");
473 if (closefiles(&(t->f)) < 0)
474 return -1;
475
476 afprintv(t->rq->verbose, 4, "Truncating unlinearized files");
477 if (truncufiles(t->rq->db) < 0)
478 return -1;
479
480 afprintv(t->rq->verbose, 4, "Unlocking database");
481 if (freelock(t->rq->db) < 0)
482 return -1;
483
484 return 0;
485 }
486
_aflinear(const Aflinear * rq)487 int _aflinear(const Aflinear *rq)
488 {
489 Aflinst t;
490
491 t.rq = rq;
492 afprintv(rq->verbose, 2, "Linearizing");
493
494 afprintv(rq->verbose, 3, "Opening database");
495 if (linopen(&t) < 0)
496 return -1;
497 afprintv(rq->verbose, 4, "Checking if database is linearized");
498 /* exit if db is already linearized */
499 if (t.info.optimized) {
500 if (afclosedbf(&t.f) < 0)
501 return -1;
502 if (freelock(t.rq->db) < 0)
503 return -1;
504 return aferr(AFELINEAR);
505 }
506
507 afprintv(rq->verbose, 3, "Performing linearize process");
508 if (linearize(&t) < 0)
509 return -1;
510
511 afprintv(rq->verbose, 3, "Closing database");
512 if (linclose(&t) < 0)
513 return -1;
514
515 return 0;
516 }
517