1 /* This file is part of GNU Dico.
2 Copyright (C) 2012-2020 Sergey Poznyakoff
3
4 GNU Dico is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 GNU Dico is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with GNU Dico. If not, see <http://www.gnu.org/licenses/>. */
16
17 #include <config.h>
18 #include <stdlib.h>
19 #include <unistd.h>
20 #include <fcntl.h>
21 #include <sys/stat.h>
22 #include <sys/wait.h>
23 #include <dico.h>
24 #include "gcide.h"
25 #include <errno.h>
26 #include <appi18n.h>
27
28 struct gcide_idx_cache {
29 size_t pageno;
30 unsigned refcount;
31 struct gcide_idx_page *page;
32 };
33
34 struct gcide_idx_file {
35 char *name;
36 int fd;
37 struct gcide_idx_header header;
38 size_t cache_size;
39 size_t cache_used;
40 struct gcide_idx_cache **cache;
41 size_t compare_count;
42 };
43
44 #define REF_NOT_FOUND ((size_t)-1)
45
46 static void
_free_index(struct gcide_idx_file * file)47 _free_index(struct gcide_idx_file *file)
48 {
49 size_t i;
50 free(file->name);
51 for (i = 0; i < file->cache_used; i++) {
52 free(file->cache[i]->page);
53 free(file->cache[i]);
54 }
55 free(file->cache);
56 free(file);
57 }
58
59 static int
_idx_full_read(struct gcide_idx_file * file,void * buf,size_t size)60 _idx_full_read(struct gcide_idx_file *file, void *buf, size_t size)
61 {
62 char *p = buf;
63 while (size) {
64 ssize_t rc = read(file->fd, p, size);
65 if (rc == -1) {
66 if (errno == EAGAIN)
67 continue;
68 dico_log(L_ERR, errno, _("error reading from `%s'"), file->name);
69 return -1;
70 } else if (rc == 0) {
71 dico_log(L_ERR, errno, _("short read while reading from `%s'"),
72 file->name);
73 return -1;
74 }
75 p += rc;
76 size -= rc;
77 }
78 return 0;
79 }
80
81 static int
_open_index(struct gcide_idx_file * file)82 _open_index(struct gcide_idx_file *file)
83 {
84 off_t total;
85 struct stat st;
86
87 if (_idx_full_read(file, &file->header, sizeof(file->header)))
88 return 1;
89 if (memcmp(file->header.ihdr_magic, GCIDE_IDX_MAGIC,
90 GCIDE_IDX_MAGIC_LEN)) {
91 dico_log(L_ERR, 0, _("file `%s' is not a valid gcide index file"),
92 file->name);
93 return 1;
94 }
95
96 if (fstat(file->fd, &st)) {
97 dico_log(L_ERR, errno, "fstat `%s'", file->name);
98 return 1;
99 }
100
101 total = (file->header.ihdr_num_pages + 1) * file->header.ihdr_pagesize;
102 if (total != st.st_size) {
103 dico_log(L_ERR, 0, _("index file `%s' is corrupted"), file->name);
104 return 1;
105 }
106 return 0;
107 }
108
109 struct gcide_idx_file *
gcide_idx_file_open(const char * name,size_t cachesize)110 gcide_idx_file_open(const char *name, size_t cachesize)
111 {
112 int fd;
113 struct gcide_idx_file *file;
114
115 file = calloc(1, sizeof(*file));
116 if (!file) {
117 DICO_LOG_ERRNO();
118 return NULL;
119 }
120 file->name = strdup(name);
121 if (!file->name) {
122 DICO_LOG_ERRNO();
123 free(file);
124 return NULL;
125 }
126 fd = open(name, O_RDONLY);
127 if (fd == -1) {
128 dico_log(L_ERR, errno, _("cannot open index file `%s'"), name);
129 free(file);
130 }
131 file->fd = fd;
132 if (_open_index(file)) {
133 _free_index(file);
134 return NULL;
135 }
136 file->cache_size = cachesize;
137 return file;
138 }
139
140 void
gcide_idx_file_close(struct gcide_idx_file * file)141 gcide_idx_file_close(struct gcide_idx_file *file)
142 {
143 if (file) {
144 close(file->fd);
145 _free_index(file);
146 }
147 }
148
149 size_t
gcide_idx_headwords(struct gcide_idx_file * file)150 gcide_idx_headwords(struct gcide_idx_file *file)
151 {
152 return file->header.ihdr_num_headwords;
153 }
154
155 size_t
gcide_idx_defs(struct gcide_idx_file * file)156 gcide_idx_defs(struct gcide_idx_file *file)
157 {
158 return file->header.ihdr_num_defs;
159 }
160
161 static struct gcide_idx_cache *
_cache_alloc(struct gcide_idx_file * file)162 _cache_alloc(struct gcide_idx_file *file)
163 {
164 struct gcide_idx_cache *cp;
165
166 if (!file->cache) {
167 file->cache = calloc(file->cache_size, sizeof(file->cache[0]));
168 if (!file->cache) {
169 DICO_LOG_ERRNO();
170 return NULL;
171 }
172 }
173 if (file->cache_used < file->cache_size) {
174 if (file->cache_used &&
175 file->cache[file->cache_used - 1]->refcount == 0)
176 return file->cache[file->cache_used - 1];
177 cp = calloc(1, sizeof(*cp));
178 if (!cp) {
179 DICO_LOG_ERRNO();
180 return NULL;
181 }
182 cp->page = malloc(file->header.ihdr_pagesize);
183 if (!cp->page) {
184 DICO_LOG_ERRNO();
185 free(cp);
186 return NULL;
187 }
188 file->cache[file->cache_used++] = cp;
189 } else
190 cp = file->cache[file->cache_used - 1];
191 cp->pageno = 0;
192 cp->refcount = 0;
193 return cp;
194 }
195
196 static void
_cache_promote(struct gcide_idx_file * file,int n)197 _cache_promote(struct gcide_idx_file *file, int n)
198 {
199 int i;
200 unsigned refcount = ++file->cache[n]->refcount;
201
202 if (n == 0)
203 return;
204 for (i = n - 1; i >= 0; i--)
205 if (file->cache[i]->refcount >= refcount)
206 break;
207 i++;
208 if (i != n) {
209 struct gcide_idx_cache *tmp = file->cache[n];
210 file->cache[n] = file->cache[i];
211 file->cache[i] = tmp;
212 }
213 }
214
215 static struct gcide_idx_cache *
_cache_find_page(struct gcide_idx_file * file,size_t n)216 _cache_find_page(struct gcide_idx_file *file, size_t n)
217 {
218 size_t i;
219
220 for (i = 0; i < file->cache_used; i++) {
221 if (file->cache[i]->pageno == n) {
222 struct gcide_idx_cache *cp = file->cache[i];
223 _cache_promote(file, n);
224 return cp;
225 }
226 }
227
228 return NULL;
229 }
230
231 struct gcide_idx_page *
_idx_get_page(struct gcide_idx_file * file,size_t n)232 _idx_get_page(struct gcide_idx_file *file, size_t n)
233 {
234 off_t off;
235 struct gcide_idx_cache *cp;
236 struct gcide_idx_page *page;
237 size_t i;
238
239 cp = _cache_find_page(file, n);
240 if (cp)
241 return cp->page;
242
243 off = (n + 1) * file->header.ihdr_pagesize;
244 if (lseek(file->fd, off, SEEK_SET) != off) {
245 dico_log(L_ERR, errno,
246 _("seek error on `%s' while positioning to %lu"),
247 file->name, (unsigned long) off);
248 return NULL;
249 }
250 cp = _cache_alloc(file);
251 if (!cp)
252 return NULL;
253 if (_idx_full_read(file, cp->page, file->header.ihdr_pagesize))
254 return NULL;
255 cp->refcount++;
256 page = cp->page;
257 for (i = 0; i < page->ipg_header.hdr.phdr_numentries; i++)
258 page->ipg_ref[i].ref_headword =
259 (char*)page + page->ipg_ref[i].ref_hwoff;
260 return page;
261 }
262
263
264 int
gcide_idx_enumerate(struct gcide_idx_file * file,int (* fun)(struct gcide_ref *,void *),void * data)265 gcide_idx_enumerate(struct gcide_idx_file *file,
266 int (*fun)(struct gcide_ref *, void *),
267 void *data)
268 {
269 size_t i;
270
271 for (i = 0; i < file->header.ihdr_num_pages; i++) {
272 int j;
273
274 struct gcide_idx_page *page = _idx_get_page(file, i);
275 if (!page)
276 return -1;
277 for (j = 0; j < page->ipg_header.hdr.phdr_numentries; j++)
278 if (fun(&page->ipg_ref[j], data))
279 break;
280 }
281 return 0;
282 }
283
284 static int
_compare(struct gcide_idx_file * file,char * hw,struct gcide_ref * ref,size_t hwlen)285 _compare(struct gcide_idx_file *file, char *hw, struct gcide_ref *ref,
286 size_t hwlen)
287 {
288 file->compare_count++;
289 if (hwlen) {
290 if (hwlen > ref->ref_hwlen)
291 hwlen = ref->ref_hwlen;
292 return utf8_strncasecmp(hw, ref->ref_headword, hwlen);
293 }
294 return utf8_strcasecmp(hw, ref->ref_headword);
295 }
296
297 static size_t
_idx_ref_locate(struct gcide_idx_file * file,struct gcide_idx_page * page,char * headword,size_t hwlen)298 _idx_ref_locate(struct gcide_idx_file *file,
299 struct gcide_idx_page *page, char *headword, size_t hwlen)
300 {
301 size_t l, u, idx;
302 int res;
303
304 l = 0;
305 u = page->ipg_header.hdr.phdr_numentries;
306 while (l < u) {
307 idx = (l + u) / 2;
308 res = _compare(file, headword, &page->ipg_ref[idx], hwlen);
309 if (res < 0)
310 u = idx;
311 else if (res > 0)
312 l = idx + 1;
313 else
314 return idx;
315 }
316 return REF_NOT_FOUND;
317 }
318
319 static size_t
_idx_page_locate(struct gcide_idx_file * file,char * headword,size_t hwlen)320 _idx_page_locate(struct gcide_idx_file *file, char *headword, size_t hwlen)
321 {
322 size_t l, u, idx;
323 int res;
324
325 l = 0;
326 u = file->header.ihdr_num_pages;
327 while (l < u) {
328 struct gcide_idx_page *page;
329
330 idx = (l + u) / 2;
331 page = _idx_get_page(file, idx);
332 if (!page)
333 return REF_NOT_FOUND;
334
335 res = _compare(file, headword, &page->ipg_ref[0], hwlen);
336 if (res < 0)
337 u = idx;
338 else if (res == 0)
339 return idx;
340 else {
341 res = _compare(file, headword,
342 &page->ipg_ref[page->ipg_header.hdr.phdr_numentries-1],
343 hwlen);
344 if (res > 0)
345 l = idx + 1;
346 else
347 return idx;
348 }
349 }
350 return REF_NOT_FOUND;
351 }
352
353 struct gcide_iterator {
354 struct gcide_idx_file *file; /* Index file */
355 char *headword; /* Headword this iterator tracks */
356 size_t hwlen; /* Length of the headword */
357 size_t start_pageno; /* Number of start page */
358 size_t start_refno; /* Number of start reference in page */
359 size_t cur_pageno; /* Number of current page */
360 size_t cur_refno; /* Number of current reference in page */
361 size_t page_numrefs; /* Total number of references in cur. page */
362 size_t compare_count; /* Number of comparisons */
363 /* */
364 size_t numrefs; /* Number of references to headword. Zero
365 if not yet determined. */
366 size_t curref; /* Position in iterator */
367
368 char *prevbuf; /* Previous match */
369 size_t prevsize;
370
371 int flags; /* User-defined flags. */
372 };
373
374 gcide_iterator_t
gcide_idx_locate(struct gcide_idx_file * file,char * headword,size_t hwlen)375 gcide_idx_locate(struct gcide_idx_file *file, char *headword, size_t hwlen)
376 {
377 size_t pageno, refno;
378 struct gcide_idx_page *page;
379 struct gcide_iterator *itr;
380
381 file->compare_count = 0;
382 pageno = _idx_page_locate(file, headword, hwlen);
383 if (pageno == REF_NOT_FOUND)
384 return NULL;
385 page = _idx_get_page(file, pageno);
386 if (!page)
387 return NULL;
388 refno = _idx_ref_locate(file, page, headword, hwlen);
389 if (refno == REF_NOT_FOUND)
390 return NULL;
391
392 for (;;) {
393 if (refno > 0) {
394 if (_compare(file, headword, &page->ipg_ref[refno-1], hwlen) > 0)
395 break;
396 --refno;
397 }
398 if (refno == 0) {
399 if (pageno == 0)
400 break;
401 page = _idx_get_page(file, --pageno);
402 if (!page)
403 return NULL;
404 refno = page->ipg_header.hdr.phdr_numentries;
405 }
406 }
407 if (refno == page->ipg_header.hdr.phdr_numentries) {
408 pageno++;
409 refno = 0;
410 }
411
412 itr = malloc(sizeof(*itr));
413 if (!itr) {
414 DICO_LOG_ERRNO();
415 return NULL;
416 }
417 if (hwlen) {
418 itr->headword = malloc(hwlen);
419 if (itr->headword)
420 memcpy(itr->headword, headword, hwlen);
421 } else
422 itr->headword = strdup(headword);
423
424 if (!itr->headword) {
425 DICO_LOG_ERRNO();
426 free(itr);
427 return NULL;
428 }
429
430 itr->hwlen = hwlen;
431
432 itr->file = file;
433 itr->start_pageno = itr->cur_pageno = pageno;
434 itr->start_refno = itr->cur_refno = refno;
435 itr->page_numrefs = page->ipg_header.hdr.phdr_numentries;
436 itr->curref = itr->numrefs = 0;
437 itr->compare_count = file->compare_count;
438
439 return itr;
440 }
441
442 void
gcide_iterator_free(gcide_iterator_t itr)443 gcide_iterator_free(gcide_iterator_t itr)
444 {
445 if (itr) {
446 free(itr->headword);
447 free(itr);
448 }
449 }
450
451 int
gcide_iterator_next(gcide_iterator_t itr)452 gcide_iterator_next(gcide_iterator_t itr)
453 {
454 struct gcide_idx_page *page;
455 size_t pageno, refno;
456
457 if (!itr)
458 return -1;
459
460 if (itr->numrefs && itr->curref == itr->numrefs-1)
461 return -1;
462
463 if (itr->cur_refno < itr->page_numrefs-1) {
464 pageno = itr->cur_pageno;
465 refno = itr->cur_refno + 1;
466 } else if (itr->cur_pageno == itr->file->header.ihdr_num_pages) {
467 if (!itr->numrefs)
468 itr->numrefs = itr->curref + 1;
469 return -1;
470 } else {
471 pageno = itr->cur_pageno + 1;
472 refno = 0;
473 }
474
475 page = _idx_get_page(itr->file, pageno);
476 if (!page)
477 return -1;
478 if (!itr->numrefs &&
479 _compare(itr->file, itr->headword, &page->ipg_ref[refno],
480 itr->hwlen)) {
481 if (!itr->numrefs)
482 itr->numrefs = itr->curref + 1;
483 return -1;
484 }
485 itr->page_numrefs = page->ipg_header.hdr.phdr_numentries;
486 itr->cur_pageno = pageno;
487 itr->cur_refno = refno;
488 itr->curref++;
489 return 0;
490 }
491
492 int
gcide_iterator_rewind(gcide_iterator_t itr)493 gcide_iterator_rewind(gcide_iterator_t itr)
494 {
495 struct gcide_idx_page *page;
496
497 if (!itr)
498 return -1;
499 itr->cur_pageno = itr->start_pageno;
500 itr->cur_refno = itr->start_refno;
501 itr->curref = 0;
502 page = _idx_get_page(itr->file, itr->cur_pageno);
503 if (!page)
504 return -1;
505 itr->page_numrefs = page->ipg_header.hdr.phdr_numentries;
506 return 0;
507 }
508
509 struct gcide_ref *
gcide_iterator_ref(gcide_iterator_t itr)510 gcide_iterator_ref(gcide_iterator_t itr)
511 {
512 struct gcide_idx_page *page;
513
514 if (!itr)
515 return NULL;
516 page = _idx_get_page(itr->file, itr->cur_pageno);
517 if (!page)
518 return NULL;
519 return &page->ipg_ref[itr->cur_refno];
520 }
521
522 size_t
gcide_iterator_count(gcide_iterator_t itr)523 gcide_iterator_count(gcide_iterator_t itr)
524 {
525 if (!itr)
526 return 0;
527 if (!itr->numrefs) {
528 while (gcide_iterator_next(itr) == 0)
529 ;
530 gcide_iterator_rewind(itr);
531 }
532 return itr->numrefs;
533 }
534
535 size_t
gcide_iterator_compare_count(gcide_iterator_t itr)536 gcide_iterator_compare_count(gcide_iterator_t itr)
537 {
538 return itr->compare_count;
539 }
540
541 void
gcide_iterator_store_flags(gcide_iterator_t itr,int flags)542 gcide_iterator_store_flags(gcide_iterator_t itr, int flags)
543 {
544 itr->flags = flags;
545 }
546
547 int
gcide_iterator_flags(gcide_iterator_t itr)548 gcide_iterator_flags(gcide_iterator_t itr)
549 {
550 if (!itr)
551 return 0;
552 return itr->flags;
553 }
554