1 /* This file is part of GNU Dico.
2    Copyright (C) 2012-2020 Sergey Poznyakoff
3 
4    GNU Dico is free software; you can redistribute it and/or modify
5    it under the terms of the GNU General Public License as published by
6    the Free Software Foundation; either version 3, or (at your option)
7    any later version.
8 
9    GNU Dico is distributed in the hope that it will be useful,
10    but WITHOUT ANY WARRANTY; without even the implied warranty of
11    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12    GNU General Public License for more details.
13 
14    You should have received a copy of the GNU General Public License
15    along with GNU Dico.  If not, see <http://www.gnu.org/licenses/>. */
16 
17 #include <config.h>
18 #include <stdlib.h>
19 #include <unistd.h>
20 #include <fcntl.h>
21 #include <sys/stat.h>
22 #include <sys/wait.h>
23 #include <dico.h>
24 #include "gcide.h"
25 #include <errno.h>
26 #include <appi18n.h>
27 
28 struct gcide_idx_cache {
29     size_t pageno;
30     unsigned refcount;
31     struct gcide_idx_page *page;
32 };
33 
34 struct gcide_idx_file {
35     char *name;
36     int fd;
37     struct gcide_idx_header header;
38     size_t cache_size;
39     size_t cache_used;
40     struct gcide_idx_cache **cache;
41     size_t compare_count;
42 };
43 
44 #define REF_NOT_FOUND ((size_t)-1)
45 
46 static void
_free_index(struct gcide_idx_file * file)47 _free_index(struct gcide_idx_file *file)
48 {
49     size_t i;
50     free(file->name);
51     for (i = 0; i < file->cache_used; i++) {
52 	free(file->cache[i]->page);
53 	free(file->cache[i]);
54     }
55     free(file->cache);
56     free(file);
57 }
58 
59 static int
_idx_full_read(struct gcide_idx_file * file,void * buf,size_t size)60 _idx_full_read(struct gcide_idx_file *file, void *buf, size_t size)
61 {
62     char *p = buf;
63     while (size) {
64 	ssize_t rc = read(file->fd, p, size);
65 	if (rc == -1) {
66 	    if (errno == EAGAIN)
67 		continue;
68 	    dico_log(L_ERR, errno, _("error reading from `%s'"), file->name);
69 	    return -1;
70 	} else if (rc == 0) {
71 	    dico_log(L_ERR, errno, _("short read while reading from `%s'"),
72 		     file->name);
73 	    return -1;
74 	}
75 	p += rc;
76 	size -= rc;
77     }
78     return 0;
79 }
80 
81 static int
_open_index(struct gcide_idx_file * file)82 _open_index(struct gcide_idx_file *file)
83 {
84     off_t total;
85     struct stat st;
86 
87     if (_idx_full_read(file, &file->header, sizeof(file->header)))
88 	return 1;
89     if (memcmp(file->header.ihdr_magic, GCIDE_IDX_MAGIC,
90 	       GCIDE_IDX_MAGIC_LEN)) {
91 	dico_log(L_ERR, 0, _("file `%s' is not a valid gcide index file"),
92 		 file->name);
93 	return 1;
94     }
95 
96     if (fstat(file->fd, &st)) {
97 	dico_log(L_ERR, errno, "fstat `%s'", file->name);
98 	return 1;
99     }
100 
101     total = (file->header.ihdr_num_pages + 1) * file->header.ihdr_pagesize;
102     if (total != st.st_size) {
103 	dico_log(L_ERR, 0, _("index file `%s' is corrupted"), file->name);
104 	return 1;
105     }
106     return 0;
107 }
108 
109 struct gcide_idx_file *
gcide_idx_file_open(const char * name,size_t cachesize)110 gcide_idx_file_open(const char *name, size_t cachesize)
111 {
112     int fd;
113     struct gcide_idx_file *file;
114 
115     file = calloc(1, sizeof(*file));
116     if (!file) {
117         DICO_LOG_ERRNO();
118 	return NULL;
119     }
120     file->name = strdup(name);
121     if (!file->name) {
122         DICO_LOG_ERRNO();
123 	free(file);
124 	return NULL;
125     }
126     fd = open(name, O_RDONLY);
127     if (fd == -1) {
128 	dico_log(L_ERR, errno, _("cannot open index file `%s'"), name);
129 	free(file);
130     }
131     file->fd = fd;
132     if (_open_index(file)) {
133 	_free_index(file);
134 	return NULL;
135     }
136     file->cache_size = cachesize;
137     return file;
138 }
139 
140 void
gcide_idx_file_close(struct gcide_idx_file * file)141 gcide_idx_file_close(struct gcide_idx_file *file)
142 {
143     if (file) {
144 	close(file->fd);
145 	_free_index(file);
146     }
147 }
148 
149 size_t
gcide_idx_headwords(struct gcide_idx_file * file)150 gcide_idx_headwords(struct gcide_idx_file *file)
151 {
152     return file->header.ihdr_num_headwords;
153 }
154 
155 size_t
gcide_idx_defs(struct gcide_idx_file * file)156 gcide_idx_defs(struct gcide_idx_file *file)
157 {
158     return file->header.ihdr_num_defs;
159 }
160 
161 static struct gcide_idx_cache *
_cache_alloc(struct gcide_idx_file * file)162 _cache_alloc(struct gcide_idx_file *file)
163 {
164     struct gcide_idx_cache *cp;
165 
166     if (!file->cache) {
167 	file->cache = calloc(file->cache_size, sizeof(file->cache[0]));
168 	if (!file->cache) {
169             DICO_LOG_ERRNO();
170 	    return NULL;
171 	}
172     }
173     if (file->cache_used < file->cache_size) {
174 	if (file->cache_used &&
175 	    file->cache[file->cache_used - 1]->refcount == 0)
176 	    return file->cache[file->cache_used - 1];
177 	cp = calloc(1, sizeof(*cp));
178 	if (!cp) {
179             DICO_LOG_ERRNO();
180 	    return NULL;
181 	}
182 	cp->page = malloc(file->header.ihdr_pagesize);
183 	if (!cp->page) {
184             DICO_LOG_ERRNO();
185 	    free(cp);
186 	    return NULL;
187 	}
188 	file->cache[file->cache_used++] = cp;
189     } else
190 	cp = file->cache[file->cache_used - 1];
191     cp->pageno = 0;
192     cp->refcount = 0;
193     return cp;
194 }
195 
196 static void
_cache_promote(struct gcide_idx_file * file,int n)197 _cache_promote(struct gcide_idx_file *file, int n)
198 {
199     int i;
200     unsigned refcount = ++file->cache[n]->refcount;
201 
202     if (n == 0)
203 	return;
204     for (i = n - 1; i >= 0; i--)
205 	if (file->cache[i]->refcount >= refcount)
206 	    break;
207     i++;
208     if (i != n) {
209 	struct gcide_idx_cache *tmp = file->cache[n];
210 	file->cache[n] = file->cache[i];
211 	file->cache[i] = tmp;
212     }
213 }
214 
215 static struct gcide_idx_cache *
_cache_find_page(struct gcide_idx_file * file,size_t n)216 _cache_find_page(struct gcide_idx_file *file, size_t n)
217 {
218     size_t i;
219 
220     for (i = 0; i < file->cache_used; i++) {
221 	if (file->cache[i]->pageno == n) {
222 	    struct gcide_idx_cache *cp = file->cache[i];
223 	    _cache_promote(file, n);
224 	    return cp;
225 	}
226     }
227 
228     return NULL;
229 }
230 
231 struct gcide_idx_page *
_idx_get_page(struct gcide_idx_file * file,size_t n)232 _idx_get_page(struct gcide_idx_file *file, size_t n)
233 {
234     off_t off;
235     struct gcide_idx_cache *cp;
236     struct gcide_idx_page *page;
237     size_t i;
238 
239     cp = _cache_find_page(file, n);
240     if (cp)
241 	return cp->page;
242 
243     off = (n + 1) * file->header.ihdr_pagesize;
244     if (lseek(file->fd, off, SEEK_SET) != off) {
245 	dico_log(L_ERR, errno,
246 		 _("seek error on `%s' while positioning to %lu"),
247 		 file->name, (unsigned long) off);
248 	return NULL;
249     }
250     cp = _cache_alloc(file);
251     if (!cp)
252 	return NULL;
253     if (_idx_full_read(file, cp->page, file->header.ihdr_pagesize))
254 	return NULL;
255     cp->refcount++;
256     page = cp->page;
257     for (i = 0; i < page->ipg_header.hdr.phdr_numentries; i++)
258 	page->ipg_ref[i].ref_headword =
259 	    (char*)page + page->ipg_ref[i].ref_hwoff;
260     return page;
261 }
262 
263 
264 int
gcide_idx_enumerate(struct gcide_idx_file * file,int (* fun)(struct gcide_ref *,void *),void * data)265 gcide_idx_enumerate(struct gcide_idx_file *file,
266 		    int (*fun)(struct gcide_ref *, void *),
267 		    void *data)
268 {
269     size_t i;
270 
271     for (i = 0; i < file->header.ihdr_num_pages; i++) {
272 	int j;
273 
274 	struct gcide_idx_page *page = _idx_get_page(file, i);
275 	if (!page)
276 	    return -1;
277 	for (j = 0; j < page->ipg_header.hdr.phdr_numentries; j++)
278 	    if (fun(&page->ipg_ref[j], data))
279 		break;
280     }
281     return 0;
282 }
283 
284 static int
_compare(struct gcide_idx_file * file,char * hw,struct gcide_ref * ref,size_t hwlen)285 _compare(struct gcide_idx_file *file, char *hw, struct gcide_ref *ref,
286 	 size_t hwlen)
287 {
288     file->compare_count++;
289     if (hwlen) {
290 	if (hwlen > ref->ref_hwlen)
291 	    hwlen = ref->ref_hwlen;
292 	return utf8_strncasecmp(hw, ref->ref_headword, hwlen);
293     }
294     return utf8_strcasecmp(hw, ref->ref_headword);
295 }
296 
297 static size_t
_idx_ref_locate(struct gcide_idx_file * file,struct gcide_idx_page * page,char * headword,size_t hwlen)298 _idx_ref_locate(struct gcide_idx_file *file,
299 		struct gcide_idx_page *page, char *headword, size_t hwlen)
300 {
301     size_t l, u, idx;
302     int res;
303 
304     l = 0;
305     u = page->ipg_header.hdr.phdr_numentries;
306     while (l < u) {
307 	idx = (l + u) / 2;
308 	res = _compare(file, headword, &page->ipg_ref[idx], hwlen);
309 	if (res < 0)
310 	    u = idx;
311 	else if (res > 0)
312 	    l = idx + 1;
313 	else
314 	    return idx;
315     }
316     return REF_NOT_FOUND;
317 }
318 
319 static size_t
_idx_page_locate(struct gcide_idx_file * file,char * headword,size_t hwlen)320 _idx_page_locate(struct gcide_idx_file *file, char *headword, size_t hwlen)
321 {
322     size_t l, u, idx;
323     int res;
324 
325     l = 0;
326     u = file->header.ihdr_num_pages;
327     while (l < u) {
328 	struct gcide_idx_page *page;
329 
330 	idx = (l + u) / 2;
331 	page = _idx_get_page(file, idx);
332 	if (!page)
333 	    return REF_NOT_FOUND;
334 
335 	res = _compare(file, headword, &page->ipg_ref[0], hwlen);
336 	if (res < 0)
337 	    u = idx;
338 	else if (res == 0)
339 	    return idx;
340 	else {
341 	    res = _compare(file, headword,
342 			   &page->ipg_ref[page->ipg_header.hdr.phdr_numentries-1],
343 			   hwlen);
344 	    if (res > 0)
345 		l = idx + 1;
346 	    else
347 		return idx;
348 	}
349     }
350     return REF_NOT_FOUND;
351 }
352 
353 struct gcide_iterator {
354     struct gcide_idx_file *file; /* Index file */
355     char *headword;              /* Headword this iterator tracks */
356     size_t hwlen;                /* Length of the headword */
357     size_t start_pageno;         /* Number of start page */
358     size_t start_refno;          /* Number of start reference in page */
359     size_t cur_pageno;           /* Number of current page */
360     size_t cur_refno;            /* Number of current reference in page */
361     size_t page_numrefs;         /* Total number of references in cur. page */
362     size_t compare_count;        /* Number of comparisons */
363     /* */
364     size_t numrefs;              /* Number of references to headword. Zero
365 				    if not yet determined. */
366     size_t curref;               /* Position in iterator */
367 
368     char *prevbuf;               /* Previous match */
369     size_t prevsize;
370 
371     int flags;                   /* User-defined flags. */
372 };
373 
374 gcide_iterator_t
gcide_idx_locate(struct gcide_idx_file * file,char * headword,size_t hwlen)375 gcide_idx_locate(struct gcide_idx_file *file, char *headword, size_t hwlen)
376 {
377     size_t pageno, refno;
378     struct gcide_idx_page *page;
379     struct gcide_iterator *itr;
380 
381     file->compare_count = 0;
382     pageno = _idx_page_locate(file, headword, hwlen);
383     if (pageno == REF_NOT_FOUND)
384 	return NULL;
385     page = _idx_get_page(file, pageno);
386     if (!page)
387 	return NULL;
388     refno = _idx_ref_locate(file, page, headword, hwlen);
389     if (refno == REF_NOT_FOUND)
390 	return NULL;
391 
392     for (;;) {
393 	if (refno > 0) {
394 	    if (_compare(file, headword, &page->ipg_ref[refno-1], hwlen) > 0)
395 		break;
396 	    --refno;
397 	}
398 	if (refno == 0) {
399 	    if (pageno == 0)
400 		break;
401 	    page = _idx_get_page(file, --pageno);
402 	    if (!page)
403 		return NULL;
404 	    refno = page->ipg_header.hdr.phdr_numentries;
405 	}
406     }
407     if (refno == page->ipg_header.hdr.phdr_numentries) {
408 	pageno++;
409 	refno = 0;
410     }
411 
412     itr = malloc(sizeof(*itr));
413     if (!itr) {
414         DICO_LOG_ERRNO();
415 	return NULL;
416     }
417     if (hwlen) {
418 	itr->headword = malloc(hwlen);
419 	if (itr->headword)
420 	    memcpy(itr->headword, headword, hwlen);
421     } else
422 	itr->headword = strdup(headword);
423 
424     if (!itr->headword) {
425         DICO_LOG_ERRNO();
426 	free(itr);
427 	return NULL;
428     }
429 
430     itr->hwlen = hwlen;
431 
432     itr->file = file;
433     itr->start_pageno = itr->cur_pageno = pageno;
434     itr->start_refno = itr->cur_refno = refno;
435     itr->page_numrefs = page->ipg_header.hdr.phdr_numentries;
436     itr->curref = itr->numrefs = 0;
437     itr->compare_count = file->compare_count;
438 
439     return itr;
440 }
441 
442 void
gcide_iterator_free(gcide_iterator_t itr)443 gcide_iterator_free(gcide_iterator_t itr)
444 {
445     if (itr) {
446 	free(itr->headword);
447 	free(itr);
448     }
449 }
450 
451 int
gcide_iterator_next(gcide_iterator_t itr)452 gcide_iterator_next(gcide_iterator_t itr)
453 {
454     struct gcide_idx_page *page;
455     size_t pageno, refno;
456 
457     if (!itr)
458 	return -1;
459 
460     if (itr->numrefs && itr->curref == itr->numrefs-1)
461 	return -1;
462 
463     if (itr->cur_refno < itr->page_numrefs-1) {
464 	pageno = itr->cur_pageno;
465 	refno = itr->cur_refno + 1;
466     } else if (itr->cur_pageno == itr->file->header.ihdr_num_pages) {
467 	if (!itr->numrefs)
468 	    itr->numrefs = itr->curref + 1;
469 	return -1;
470     } else {
471 	pageno = itr->cur_pageno + 1;
472 	refno = 0;
473     }
474 
475     page = _idx_get_page(itr->file, pageno);
476     if (!page)
477 	return -1;
478     if (!itr->numrefs &&
479 	_compare(itr->file, itr->headword, &page->ipg_ref[refno],
480 		 itr->hwlen)) {
481 	if (!itr->numrefs)
482 	    itr->numrefs = itr->curref + 1;
483 	return -1;
484     }
485     itr->page_numrefs = page->ipg_header.hdr.phdr_numentries;
486     itr->cur_pageno = pageno;
487     itr->cur_refno = refno;
488     itr->curref++;
489     return 0;
490 }
491 
492 int
gcide_iterator_rewind(gcide_iterator_t itr)493 gcide_iterator_rewind(gcide_iterator_t itr)
494 {
495     struct gcide_idx_page *page;
496 
497     if (!itr)
498 	return -1;
499     itr->cur_pageno = itr->start_pageno;
500     itr->cur_refno = itr->start_refno;
501     itr->curref = 0;
502     page = _idx_get_page(itr->file, itr->cur_pageno);
503     if (!page)
504 	return -1;
505     itr->page_numrefs = page->ipg_header.hdr.phdr_numentries;
506     return 0;
507 }
508 
509 struct gcide_ref *
gcide_iterator_ref(gcide_iterator_t itr)510 gcide_iterator_ref(gcide_iterator_t itr)
511 {
512     struct gcide_idx_page *page;
513 
514     if (!itr)
515 	return NULL;
516     page = _idx_get_page(itr->file, itr->cur_pageno);
517     if (!page)
518 	return NULL;
519     return &page->ipg_ref[itr->cur_refno];
520 }
521 
522 size_t
gcide_iterator_count(gcide_iterator_t itr)523 gcide_iterator_count(gcide_iterator_t itr)
524 {
525     if (!itr)
526 	return 0;
527     if (!itr->numrefs) {
528 	while (gcide_iterator_next(itr) == 0)
529 	    ;
530 	gcide_iterator_rewind(itr);
531     }
532     return itr->numrefs;
533 }
534 
535 size_t
gcide_iterator_compare_count(gcide_iterator_t itr)536 gcide_iterator_compare_count(gcide_iterator_t itr)
537 {
538     return itr->compare_count;
539 }
540 
541 void
gcide_iterator_store_flags(gcide_iterator_t itr,int flags)542 gcide_iterator_store_flags(gcide_iterator_t itr, int flags)
543 {
544     itr->flags = flags;
545 }
546 
547 int
gcide_iterator_flags(gcide_iterator_t itr)548 gcide_iterator_flags(gcide_iterator_t itr)
549 {
550     if (!itr)
551 	return 0;
552     return itr->flags;
553 }
554