1 /* $Id: tdx-group.c 9859 2015-05-14 13:25:42Z iulius $
2 **
3 ** Group index handling for the tradindexed overview method.
4 **
5 ** Implements the handling of the group.index file for the tradindexed
6 ** overview method. This file contains an entry for every group and stores
7 ** the high and low article marks and the base article numbers for each
8 ** individual group index file.
9 **
10 ** Externally visible functions have a tdx_ prefix; internal functions do
11 ** not. (Externally visible unfortunately means everything that needs to be
12 ** visible outside of this object file, not just interfaces exported to
13 ** consumers of the overview API.)
14 **
15 ** This code has to support readers and writers sharing the same files, and
16 ** we want to avoid locking where possible since locking may be very slow
17 ** (such as over NFS). Each group has two data files (and one has to get the
18 ** right index file for a given data file or get mangled results) and one
19 ** piece of data in the main index file required to interpret the individual
20 ** index file, namely the article base of that index.
21 **
22 ** We can make the following assumptions:
23 **
24 ** - The high water mark for a group is monotonically increasing; in other
25 ** words, the highest numbered article in a group won't ever decrease.
26 **
27 ** - While the article base may either increase or decrease, it will never
28 ** change unless the inode of the index file on disk also changes, since
29 ** changing the base requires rewriting the index file.
30 **
31 ** - No two files will have the same inode (this requirement should be safe
32 ** even in strange Unix file formats, since the files are all in the same
33 ** directory).
34 **
35 ** We therefore use the following procedure to update the data: The high
36 ** water mark may be changed at any time but surrounded in a write lock. The
37 ** base may only be changed as part of an index rebuild. To do an index
38 ** rebuild, we follow the following procedure:
39 **
40 ** 1) Obtain a write lock on the group entry in the main index.
41 ** 2) Write out new index and data files to new temporary file names.
42 ** 3) Store the new index inode into the main index.
43 ** 4) Update the high, low, and base article numbers in the main index.
44 ** 5) Rename the data file to its correct name.
45 ** 6) Rename the index file to its correct name.
46 ** 7) Release the write lock.
47 **
48 ** We use the following procedure to read the data:
49 **
50 ** 1) Open the group data files (both index and data).
51 ** 2) Store copies of the current high water mark and base in variables.
52 ** 3) Check to be sure the index inode matches the master index file.
53 **
54 ** If it does match, then we have a consistent set of data, since the high
55 ** water mark and base values have to match the index we have (the inode
56 ** value is updated first). It may not be the most current set of data, but
57 ** since we have those index and data files open, even if they're later
58 ** rebuilt we'll continue looking at the same files. They may have further
59 ** data appended to them, but that's safe.
60 **
61 ** If the index inode doesn't match, someone's rebuilt the file while we were
62 ** trying to open it. Continue with the following procedure:
63 **
64 ** 4) Close the data files that we opened.
65 ** 5) Obtain a read lock on the group entry in the main index.
66 ** 6) Reopen the data files.
67 ** 7) Grab the current high water mark and base.
68 ** 8) Release the read lock.
69 **
70 ** In other words, if there appears to be contention, we fall back to using
71 ** locking so that we don't try to loop (which also avoids an infinite loop
72 ** in the event of corruption of the main index).
73 **
74 ** Note that once we have a consistent set of data files open, we don't need
75 ** to aggressively check for new data files until someone asks for an article
76 ** outside the range of articles that we know about. We may be working from
77 ** outdated data files, but the most we'll miss is a cancel or an expiration
78 ** run. Overview data doesn't change; new data is appended and old data is
79 ** expired. We can afford to check only every once in a while, just to be
80 ** sure that we're not going to hand out overview data for a bunch of expired
81 ** articles.
82 */
83
84 #include "config.h"
85 #include "clibrary.h"
86 #include "portable/mmap.h"
87 #include <errno.h>
88 #include <fcntl.h>
89 #include <limits.h>
90 #include <sys/stat.h>
91 #include <time.h>
92
93 #include "inn/fdflag.h"
94 #include "inn/hashtab.h"
95 #include "inn/innconf.h"
96 #include "inn/messages.h"
97 #include "inn/mmap.h"
98 #include "inn/qio.h"
99 #include "inn/vector.h"
100 #include "inn/libinn.h"
101 #include "inn/paths.h"
102 #include "tdx-private.h"
103 #include "tdx-structure.h"
104
/* Returned to callers as an opaque data type, this stashes all of the
   information about an open group.index file. */
struct group_index {
    char *path;                  /* Path to the group.index file. */
    int fd;                      /* Open file descriptor, or -1 when closed. */
    bool writable;               /* Whether the index was opened for write. */
    struct group_header *header; /* Mapped (or read-in) file header. */
    struct group_entry *entries; /* Entry array following the header. */
    int count;                   /* Number of entries the mapping covers. */
};
115
116 /* Forward declaration. */
117 struct hashmap;
118
119 /* Internal prototypes. */
120 static int index_entry_count(size_t size);
121 static size_t index_file_size(int count);
122 static bool index_lock(int fd, enum inn_locktype type);
123 static bool index_lock_group(int fd, ptrdiff_t offset, enum inn_locktype);
124 static bool index_map(struct group_index *);
125 static bool index_maybe_remap(struct group_index *, long loc);
126 static void index_unmap(struct group_index *);
127 static bool index_expand(struct group_index *);
128 static long index_find(struct group_index *, const char *group);
129
130
131 /*
132 ** Given a file size, return the number of group entries that it contains.
133 */
134 static int
index_entry_count(size_t size)135 index_entry_count(size_t size)
136 {
137 return (size - sizeof(struct group_header)) / sizeof(struct group_entry);
138 }
139
140
141 /*
142 ** Given a number of group entries, return the required file size.
143 */
144 static size_t
index_file_size(int count)145 index_file_size(int count)
146 {
147 return sizeof(struct group_header) + count * sizeof(struct group_entry);
148 }
149
150
151 /*
152 ** Lock the hash table for the group index, used to acquire global locks on
153 ** the group index when updating it.
154 */
155 static bool
index_lock(int fd,enum inn_locktype type)156 index_lock(int fd, enum inn_locktype type)
157 {
158 bool status;
159
160 status = inn_lock_range(fd, type, true, 0, sizeof(struct group_header));
161 if (!status)
162 syswarn("tradindexed: cannot %s index hash table",
163 (type == INN_LOCK_UNLOCK) ? "unlock" : "lock");
164 return status;
165 }
166
167
168 /*
169 ** Lock the group entry for a particular group. Takes the offset of that
170 ** group entry from the start of the group entries (not the start of the
171 ** file; we have to add the size of the group header). Used for coordinating
172 ** updates of the data for a group.
173 */
174 static bool
index_lock_group(int fd,ptrdiff_t offset,enum inn_locktype type)175 index_lock_group(int fd, ptrdiff_t offset, enum inn_locktype type)
176 {
177 bool status;
178 size_t size;
179
180 size = sizeof(struct group_entry);
181 offset = offset * size + sizeof(struct group_header);
182 status = inn_lock_range(fd, type, true, offset, size);
183 if (!status)
184 syswarn("tradindexed: cannot %s group entry at %lu",
185 (type == INN_LOCK_UNLOCK) ? "unlock" : "lock",
186 (unsigned long) offset);
187 return status;
188 }
189
190
/*
** Memory map (or read into memory) the key portions of the group.index
** file.  Takes a struct group_index to fill in and returns true on success
** and false on failure.  index->count must already reflect the size of the
** file on disk.
*/
static bool
index_map(struct group_index *index)
{
    /* Writing requires mmap so that updates are shared with other
       processes; refuse to proceed otherwise. */
    if (!innconf->tradindexedmmap && index->writable) {
        warn("tradindexed: cannot open for writing without mmap");
        return false;
    }

    if (!innconf->tradindexedmmap) {
        /* Read-only, non-mmap path: slurp the header and the entry array
           into freshly allocated private buffers. */
        ssize_t header_size;
        ssize_t entry_size;

        header_size = sizeof(struct group_header);
        entry_size = index->count * sizeof(struct group_entry);
        index->header = xmalloc(header_size);
        index->entries = xmalloc(entry_size);
        if (read(index->fd, index->header, header_size) != header_size) {
            syswarn("tradindexed: cannot read header from %s", index->path);
            goto fail;
        }
        if (read(index->fd, index->entries, entry_size) != entry_size) {
            syswarn("tradindexed: cannot read entries from %s", index->path);
            goto fail;
        }
        return true;

    fail:
        /* Release both buffers and leave the struct in the unmapped state
           so that index_unmap and callers see a consistent view. */
        free(index->header);
        free(index->entries);
        index->header = NULL;
        index->entries = NULL;
        return false;

    } else {
        /* mmap path: map the whole file shared and point header and entries
           into the mapping.  The casts through void * sidestep alignment
           warnings from the char * arithmetic. */
        char *data;
        size_t size;
        int flag = PROT_READ;

        if (index->writable)
            flag = PROT_READ | PROT_WRITE;
        size = index_file_size(index->count);
        data = mmap(NULL, size, flag, MAP_SHARED, index->fd, 0);
        if (data == MAP_FAILED) {
            syswarn("tradindexed: cannot mmap %s", index->path);
            return false;
        }
        index->header = (struct group_header *)(void *) data;
        index->entries = (struct group_entry *)
            (void *)(data + sizeof(struct group_header));
        return true;
    }
}
248
249
250 static bool
file_open_group_index(struct group_index * index,struct stat * st)251 file_open_group_index(struct group_index *index, struct stat *st)
252 {
253 int open_mode;
254
255 index->header = NULL;
256 open_mode = index->writable ? O_RDWR | O_CREAT : O_RDONLY;
257 index->fd = open(index->path, open_mode, ARTFILE_MODE);
258 if (index->fd < 0) {
259 syswarn("tradindexed: cannot open %s", index->path);
260 goto fail;
261 }
262
263 if (fstat(index->fd, st) < 0) {
264 syswarn("tradindexed: cannot fstat %s", index->path);
265 goto fail;
266 }
267 fdflag_close_exec(index->fd, true);
268 return true;
269
270 fail:
271 if (index->fd >= 0) {
272 close(index->fd);
273 index->fd = -1;
274 }
275 return false;
276 }
277
278
/*
** Given a group location, remap the index file if our existing mapping isn't
** large enough to include that group.  (This can be the case when another
** writer is appending entries to the group index.)  Returns true on success
** (which includes "did not need to remap") and false on failure.
*/
static bool
index_maybe_remap(struct group_index *index, long loc)
{
    struct stat st;
    int count;
    int r;

    /* Fast path: the current mapping already covers the requested entry. */
    if (loc < index->count)
        return true;

    /* Don't remap if remapping wouldn't actually help. */
    r = fstat(index->fd, &st);
    if (r == -1) {
        if (errno == ESTALE) {
            /* Over NFS the file may have been replaced underneath us;
               reopen to pick up the new file (the reopen fills in st). */
            index_unmap(index);
            if (!file_open_group_index(index, &st))
                return false;
        } else {
            syswarn("tradindexed: cannot stat %s", index->path);
            return false;
        }
    }
    count = index_entry_count(st.st_size);
    /* If the on-disk file still doesn't reach loc, keep the existing
       mapping; callers detect out-of-range locations themselves. */
    if (count < loc && index->header != NULL)
        return true;

    /* Okay, remapping will actually help. */
    index_unmap(index);
    index->count = count;
    return index_map(index);
}
316
317
318 /*
319 ** Unmap the index file, either in preparation for closing the overview
320 ** method or to get ready to remap it. We warn about failures to munmap but
321 ** don't do anything about them; there isn't much that we can do.
322 */
323 static void
index_unmap(struct group_index * index)324 index_unmap(struct group_index *index)
325 {
326 if (index->header == NULL)
327 return;
328 if (!innconf->tradindexedmmap) {
329 free(index->header);
330 free(index->entries);
331 } else {
332 if (munmap(index->header, index_file_size(index->count)) < 0)
333 syswarn("tradindexed: cannot munmap %s", index->path);
334 }
335 index->header = NULL;
336 index->entries = NULL;
337 }
338
339
/*
** Expand the group.index file to hold more entries; also used to build the
** initial file.  The caller is expected to lock the group index.  Grows the
** file by 1024 entries at a time and threads the new entries onto the free
** list.
*/
static bool
index_expand(struct group_index *index)
{
    int i;

    /* The mapping has to be redone at the new size, so drop it first. */
    index_unmap(index);
    index->count += 1024;
    if (ftruncate(index->fd, index_file_size(index->count)) < 0) {
        syswarn("tradindexed: cannot expand %s", index->path);
        return false;
    }

    /* If mapping the index fails, we've already extended it but we haven't
       done anything with the new portion of the file.  That means that it's
       all zeroes, which means that it contains index entries who all think
       their next entry is entry 0.  We don't want to leave things in this
       state (particularly if this was the first expansion of the index file,
       in which case entry 0 points to entry 0 and our walking functions may
       go into infinite loops).  Undo the file expansion. */
    if (!index_map(index)) {
        index->count -= 1024;
        if (ftruncate(index->fd, index_file_size(index->count)) < 0) {
            syswarn("tradindexed: cannot shrink %s", index->path);
        }
        return false;
    }

    /* If the magic isn't right, assume this is a new index file. */
    if (index->header->magic != TDX_MAGIC) {
        index->header->magic = TDX_MAGIC;
        index->header->freelist.recno = -1;
        for (i = 0; i < TDX_HASH_SIZE; i++)
            index->header->hash[i].recno = -1;
    }

    /* Walk the new entries back to front, adding them to the free list. */
    for (i = index->count - 1; i >= index->count - 1024; i--) {
        index->entries[i].next = index->header->freelist;
        index->header->freelist.recno = i;
    }

    /* Schedule the updated header (magic, hash table, free list) for
       write-out. */
    inn_msync_page(index->header, index_file_size(index->count), MS_ASYNC);
    return true;
}
388
389
/*
** Open the group.index file and allocate a new struct for it, returning a
** pointer to that struct.  Takes a bool saying whether or not the overview
** should be opened for write.  Returns NULL on failure.  The caller owns
** the returned struct and frees it with tdx_index_close.
*/
struct group_index *
tdx_index_open(bool writable)
{
    struct group_index *index;
    struct stat st;

    index = xmalloc(sizeof(struct group_index));
    index->path = concatpath(innconf->pathoverview, "group.index");
    index->writable = writable;
    if (!file_open_group_index(index, &st)) {
        goto fail;
    }
    if ((size_t) st.st_size > sizeof(struct group_header)) {
        /* Existing index with entry data: size the mapping from the file. */
        index->count = index_entry_count(st.st_size);
        if (!index_map(index))
            goto fail;
    } else {
        /* Empty or truncated index.  Writers rebuild it via index_expand;
           readers just get an empty (unmapped) view. */
        index->count = 0;
        if (index->writable) {
            if (st.st_size > 0)
                warn("tradindexed: recreating truncated %s", index->path);
            if (!index_expand(index))
                goto fail;
        } else {
            index->header = NULL;
            index->entries = NULL;
        }
    }
    return index;

 fail:
    tdx_index_close(index);
    return NULL;
}
429
430
431 /*
432 ** Given a group name hash, return an index into the hash table in the
433 ** group.index header.
434 */
435 static long
index_bucket(HASH hash)436 index_bucket(HASH hash)
437 {
438 unsigned int bucket;
439
440 memcpy(&bucket, &hash, sizeof(bucket));
441 return bucket % TDX_HASH_SIZE;
442 }
443
444
445 /*
446 ** Given a pointer to a group entry, return its location number.
447 */
448 static long
entry_loc(const struct group_index * index,const struct group_entry * entry)449 entry_loc(const struct group_index *index, const struct group_entry *entry)
450 {
451 return entry - index->entries;
452 }
453
454
/*
** Splice out a particular group entry.  Takes the entry and a pointer to the
** location where a pointer to it is stored (the bucket head or the previous
** entry's next field).
*/
static void
entry_splice(struct group_entry *entry, int *parent)
{
    /* Point the referencing link at our successor, then detach ourselves
       from the chain. */
    *parent = entry->next.recno;
    entry->next.recno = -1;
    /* Schedule the changed link for write-out so readers see a consistent
       chain. */
    inn_msync_page(parent, sizeof(*parent), MS_ASYNC);
}
466
467
468 /*
469 ** Add a new entry to the appropriate hash chain.
470 */
471 static void
index_add(struct group_index * index,struct group_entry * entry)472 index_add(struct group_index *index, struct group_entry *entry)
473 {
474 long bucket, loc;
475
476 bucket = index_bucket(entry->hash);
477 loc = entry_loc(index, entry);
478 if (loc == index->header->hash[bucket].recno) {
479 warn("tradindexed: refusing to add a loop for %ld in bucket %ld",
480 loc, bucket);
481 return;
482 }
483 entry->next.recno = index->header->hash[bucket].recno;
484 index->header->hash[bucket].recno = entry_loc(index, entry);
485 inn_msync_page(&index->header->hash[bucket], sizeof(struct loc), MS_ASYNC);
486 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
487 }
488
489
/*
** Find a group in the index file, returning the group number for that group
** or -1 if the group can't be found.
*/
static long
index_find(struct group_index *index, const char *group)
{
    HASH hash;
    long loc;

    /* No mapping means an empty or unreadable index. */
    if (index->header == NULL || index->entries == NULL)
        return -1;
    hash = Hash(group, strlen(group));
    /* NFS readers remap up front so the hash chains are current; LONG_MAX
       forces the remap to the file's current size. */
    if (innconf->nfsreader && !index_maybe_remap(index, LONG_MAX))
        return -1;
    loc = index->header->hash[index_bucket(hash)].recno;

    /* Walk the hash chain for this bucket until we find a live entry with
       a matching hash. */
    while (loc >= 0) {
        struct group_entry *entry;

        /* The chain may reach past our mapping if another process expanded
           the index; remap and recheck before dereferencing. */
        if (loc >= index->count) {
            if (!index_maybe_remap(index, loc)) {
                return -1;
            }
            if (loc >= index->count) {
                syswarn("tradindexed: entry %ld out of range", loc);
                return -1;
            }
        }
        entry = index->entries + loc;
        if (entry->deleted == 0)
            if (memcmp(&hash, &entry->hash, sizeof(hash)) == 0)
                return loc;
        /* Guard against a self-referencing entry, which on a corrupt index
           would otherwise loop forever. */
        if (loc == entry->next.recno) {
            syswarn("tradindexed: index loop for entry %ld", loc);
            return -1;
        }
        loc = entry->next.recno;
    }
    return -1;
}
531
532
533 /*
534 ** Add a given entry to the free list.
535 */
536 static void
freelist_add(struct group_index * index,struct group_entry * entry)537 freelist_add(struct group_index *index, struct group_entry *entry)
538 {
539 entry->next.recno = index->header->freelist.recno;
540 index->header->freelist.recno = entry_loc(index, entry);
541 inn_msync_page(&index->header->freelist, sizeof(struct loc), MS_ASYNC);
542 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
543 }
544
545
/*
** Find an entry by hash value (rather than group name) and splice it out of
** whatever chain it might belong to.  This function is called by both
** index_unlink and index_audit_group.  Locking must be done by the caller.
** Returns the group location of the spliced group.
*/
static long
index_unlink_hash(struct group_index *index, HASH hash)
{
    int *parent;
    long current;

    /* parent always points at the link (the bucket head or the previous
       entry's next field) that references the entry under examination. */
    parent = &index->header->hash[index_bucket(hash)].recno;
    current = *parent;

    while (current >= 0) {
        struct group_entry *entry;

        /* The chain may reach past our mapping if another process expanded
           the index.  Remapping may move the entry array, so restart from
           the bucket head after a remap. */
        if (current >= index->count) {
            if (!index_maybe_remap(index, current)) {
                return -1;
            }
            parent = &index->header->hash[index_bucket(hash)].recno;
            current = *parent;
            if (current < 0 || current >= index->count) {
                syswarn("tradindexed: entry %ld out of range", current);
                return -1;
            }
        }
        entry = &index->entries[current];
        if (entry->deleted == 0)
            if (memcmp(&hash, &entry->hash, sizeof(hash)) == 0) {
                entry_splice(entry, parent);
                return current;
            }
        /* Guard against a self-referencing entry on a corrupt index. */
        if (current == entry->next.recno) {
            syswarn("tradindexed: index loop for entry %ld", current);
            return -1;
        }
        parent = &entry->next.recno;
        current = *parent;
    }
    return -1;
}
590
591
/*
** Like index_find, but also removes that entry out of whatever chain it
** might belong to.  This function is called by tdx_index_delete.  Locking
** must be done by the caller.
*/
static long
index_unlink(struct group_index *index, const char *group)
{
    return index_unlink_hash(index, Hash(group, strlen(group)));
}
605
606
607 /*
608 ** Return the information stored about a given group in the group index.
609 */
610 struct group_entry *
tdx_index_entry(struct group_index * index,const char * group)611 tdx_index_entry(struct group_index *index, const char *group)
612 {
613 long loc;
614 struct group_entry *entry;
615
616 loc = index_find(index, group);
617 if (loc == -1)
618 return NULL;
619 entry = index->entries + loc;
620 if (innconf->tradindexedmmap && innconf->nfsreader)
621 inn_msync_page(entry, sizeof *entry, MS_INVALIDATE);
622 return entry;
623 }
624
625
/*
** Add a new newsgroup to the group.index file.  Takes the newsgroup name,
** its high and low water marks, and the newsgroup flag.  Note that aliased
** newsgroups are not currently handled.  If the group already exists, just
** update the flag (not the high and low water marks).
*/
bool
tdx_index_add(struct group_index *index, const char *group, ARTNUM low,
              ARTNUM high, const char *flag)
{
    HASH hash;
    long loc;
    struct group_entry *entry;
    struct group_data *data;

    if (!index->writable)
        return false;

    /* If the group already exists, update the flag as necessary and then
       we're all done. */
    loc = index_find(index, group);
    if (loc != -1) {
        entry = &index->entries[loc];
        if (entry->flag != *flag) {
            entry->flag = *flag;
            inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
        }
        return true;
    }

    /* Hold the global index lock while allocating and initializing the new
       entry so that we don't race other writers over the free list. */
    index_lock(index->fd, INN_LOCK_WRITE);

    /* Find a free entry.  If we don't have any free space, make some. */
    if (index->header->freelist.recno == -1)
        if (!index_expand(index)) {
            index_lock(index->fd, INN_LOCK_UNLOCK);
            return false;
        }
    loc = index->header->freelist.recno;
    index->header->freelist.recno = index->entries[loc].next.recno;
    inn_msync_page(&index->header->freelist, sizeof(struct loc), MS_ASYNC);

    /* Initialize the entry. */
    entry = &index->entries[loc];
    hash = Hash(group, strlen(group));
    entry->hash = hash;
    /* A group with no articles (low of 0 but a non-zero high mark) gets a
       low mark one past the high mark. */
    entry->low = (low == 0 && high != 0) ? high + 1 : low;
    entry->high = high;
    entry->deleted = 0;
    entry->base = 0;
    entry->count = 0;
    entry->flag = *flag;
    /* Create the per-group data files now and record the index file's inode
       so that readers can later detect rebuilds. */
    data = tdx_data_new(group, index->writable);
    if (!tdx_data_open_files(data))
        warn("tradindexed: unable to create data files for %s", group);
    entry->indexinode = data->indexinode;
    tdx_data_close(data);
    index_add(index, entry);

    index_lock(index->fd, INN_LOCK_UNLOCK);
    return true;
}
688
689
/*
** Delete a group index entry.  Returns false if the index isn't writable or
** the group can't be found.
*/
bool
tdx_index_delete(struct group_index *index, const char *group)
{
    long loc;
    struct group_entry *entry;

    if (!index->writable)
        return false;

    /* Lock the header for the entire operation, mostly as prevention against
       interfering with ongoing audits (which lock while they're running). */
    index_lock(index->fd, INN_LOCK_WRITE);

    /* Splice out the entry and mark it as deleted. */
    loc = index_unlink(index, group);
    if (loc == -1) {
        index_lock(index->fd, INN_LOCK_UNLOCK);
        return false;
    }
    entry = &index->entries[loc];
    /* Record the deletion time and clear the hash so that hash-based
       lookups skip this entry. */
    entry->deleted = time(NULL);
    HashClear(&entry->hash);

    /* Add the entry to the free list. */
    freelist_add(index, entry);
    index_lock(index->fd, INN_LOCK_UNLOCK);

    /* Delete the group data files for this group. */
    tdx_data_delete(group, NULL);

    return true;
}
725
726
727 /*
728 ** Close an open handle to the group index file, freeing the group_index
729 ** structure at the same time. The argument to this function becomes invalid
730 ** after this call.
731 */
732 void
tdx_index_close(struct group_index * index)733 tdx_index_close(struct group_index *index)
734 {
735 index_unmap(index);
736 if (index->fd >= 0) {
737 close(index->fd);
738 index->fd = -1;
739 }
740 free(index->path);
741 free(index);
742 }
743
744
/*
** Open the data files for a particular group.  The interface to this has to
** be in this file because we have to lock the group and retry if the inode
** of the opened index file doesn't match the one recorded in the group index
** file.  Optionally take a pointer to the group index entry if the caller
** has already gone to the work of finding it.
*/
struct group_data *
tdx_data_open(struct group_index *index, const char *group,
              struct group_entry *entry)
{
    struct group_data *data;
    ARTNUM high, base;
    ptrdiff_t offset;

    if (entry == NULL) {
        entry = tdx_index_entry(index, group);
        if (entry == NULL)
            return NULL;
    }
    offset = entry - index->entries;
    data = tdx_data_new(group, index->writable);

    /* Check to see if the inode of the index file matches.  If it doesn't,
       this probably means that as we were opening the index file, someone
       else rewrote it (either expire or repack).  Obtain a lock and try
       again.  If there's still a mismatch, go with what we get; there's some
       sort of corruption.

       This code is very sensitive to order and parallelism.  See the comment
       at the beginning of this file for methodology. */
    if (!tdx_data_open_files(data))
        goto fail;
    /* Snapshot the water mark and base that correspond to the files we just
       opened. */
    high = entry->high;
    base = entry->base;
    if (entry->indexinode != data->indexinode) {
        /* A rebuild raced with us; retry under a read lock and take a fresh
           snapshot of the marks. */
        index_lock_group(index->fd, offset, INN_LOCK_READ);
        if (!tdx_data_open_files(data)) {
            index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
            goto fail;
        }
        if (entry->indexinode != data->indexinode)
            warn("tradindexed: index inode mismatch for %s", group);
        high = entry->high;
        base = entry->base;
        index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
    }
    data->high = high;
    data->base = base;
    return data;

 fail:
    tdx_data_close(data);
    return NULL;
}
800
801
/*
** Add an overview record for a particular article.  Takes the group entry,
** the open overview data structure, and the information about the article
** and returns true on success, false on failure.  This function calls
** tdx_data_store to do most of the real work and then updates the index
** information.
*/
bool
tdx_data_add(struct group_index *index, struct group_entry *entry,
             struct group_data *data, const struct article *article)
{
    ARTNUM old_base;
    ino_t old_inode;
    ptrdiff_t offset = entry - index->entries;

    if (!index->writable)
        return false;
    /* Serialize all updates to this group's entry and data files. */
    index_lock_group(index->fd, offset, INN_LOCK_WRITE);

    /* Make sure we have the most current data files and that we have the
       right base article number. */
    if (entry->indexinode != data->indexinode) {
        if (!tdx_data_open_files(data))
            goto fail;
        if (entry->indexinode != data->indexinode)
            warn("tradindexed: index inode mismatch for %s",
                 HashToText(entry->hash));
        data->base = entry->base;
    }

    /* If the article number is too low to store in the group index, repack
       the group with a lower base index. */
    if (entry->base > article->number) {
        if (!tdx_data_pack_start(data, article->number))
            goto fail;
        /* Save the old inode and base so the entry can be rolled back if
           finishing the repack fails. */
        old_inode = entry->indexinode;
        old_base = entry->base;
        entry->indexinode = data->indexinode;
        entry->base = data->base;
        inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
        if (!tdx_data_pack_finish(data)) {
            entry->base = old_base;
            entry->indexinode = old_inode;
            inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
            goto fail;
        }
    }

    /* Store the data, then bring the entry's base, water marks, and article
       count up to date. */
    if (!tdx_data_store(data, article))
        goto fail;
    if (entry->base == 0)
        entry->base = data->base;
    if (entry->low == 0 || entry->low > article->number)
        entry->low = article->number;
    if (entry->high < article->number)
        entry->high = article->number;
    entry->count++;

    /* Used to know that we have to remap the data file owing to our
       OVSTATICSEARCH (an article whose number is lower than the highest has
       been added at the end of the file). */
    if (data->high > article->number)
        data->remapoutoforder = true;

    inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
    index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
    return true;

 fail:
    index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
    return false;
}
875
876
877 /*
878 ** Start a rebuild of the group data for a newsgroup. Right now, all this
879 ** does is lock the group index entry.
880 */
881 bool
tdx_index_rebuild_start(struct group_index * index,struct group_entry * entry)882 tdx_index_rebuild_start(struct group_index *index, struct group_entry *entry)
883 {
884 ptrdiff_t offset;
885
886 offset = entry - index->entries;
887 return index_lock_group(index->fd, offset, INN_LOCK_WRITE);
888 }
889
890
/*
** Finish a rebuild of the group data for a newsgroup.  Takes the old and new
** entry and writes the data from the new entry into the group index, and
** then unlocks it.
*/
bool
tdx_index_rebuild_finish(struct group_index *index, struct group_entry *entry,
                         struct group_entry *new)
{
    ptrdiff_t offset;
    ino_t new_inode;

    /* Copy the new entry over the live one, but handle the indexinode field
       specially: stash the new inode, let the structure assignment carry
       the old inode, then store the new inode explicitly into both copies.
       NOTE(review): this appears intended to control the order in which the
       inode becomes visible relative to the other fields, per the rebuild
       procedure described at the top of this file — confirm before
       reordering anything here. */
    new_inode = new->indexinode;
    new->indexinode = entry->indexinode;
    *entry = *new;
    entry->indexinode = new_inode;
    new->indexinode = new_inode;
    inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
    offset = entry - index->entries;
    index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
    return true;
}
913
914
/*
** Expire a single newsgroup.  Most of the work is done by tdx_data_expire*,
** but this routine has the responsibility to do locking (the same as would
** be done for repacking, since the group base may change) and updating the
** group entry.
*/
bool
tdx_expire(const char *group, ARTNUM *low, struct history *history)
{
    struct group_index *index;
    struct group_entry *entry;
    struct group_entry new_entry;
    struct group_data *data = NULL;
    ptrdiff_t offset;
    ARTNUM old_base;
    ino_t old_inode;

    index = tdx_index_open(true);
    if (index == NULL)
        return false;
    entry = tdx_index_entry(index, group);
    if (entry == NULL) {
        tdx_index_close(index);
        return false;
    }
    /* Write-lock the group entry for the duration of the expire. */
    tdx_index_rebuild_start(index, entry);

    /* tdx_data_expire_start builds the new IDX and DAT files and fills in the
       struct group_entry that was passed to it.  tdx_data_rebuild_finish does
       the renaming of the new files to the final file names. */
    new_entry = *entry;
    new_entry.low = 0;
    new_entry.count = 0;
    new_entry.base = 0;
    data = tdx_data_open(index, group, entry);
    if (data == NULL)
        goto fail;
    if (!tdx_data_expire_start(group, data, &new_entry, history))
        goto fail;
    /* Save the old inode and base so the entry can be rolled back if the
       rename of the new files fails. */
    old_inode = entry->indexinode;
    old_base = entry->base;
    entry->indexinode = new_entry.indexinode;
    entry->base = new_entry.base;
    inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
    tdx_data_close(data);
    if (!tdx_data_rebuild_finish(group)) {
        entry->base = old_base;
        entry->indexinode = old_inode;
        inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
        goto fail;
    }

    /* Almost done.  Update the group index.  If there are no articles in the
       group, the low water mark should be one more than the high water
       mark. */
    if (new_entry.low == 0)
        new_entry.low = new_entry.high + 1;
    tdx_index_rebuild_finish(index, entry, &new_entry);
    if (low != NULL)
        *low = entry->low;
    tdx_index_close(index);
    return true;

 fail:
    /* Drop the group lock taken by tdx_index_rebuild_start before
       cleaning up. */
    offset = entry - index->entries;
    index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
    if (data != NULL)
        tdx_data_close(data);
    tdx_index_close(index);
    return false;
}
986
987
988 /*
989 ** RECOVERY AND AUDITING
990 **
991 ** All code below this point is not used in the normal operations of the
992 ** overview method. Instead, it's code to dump various data structures or
993 ** audit them for consistency, used by recovery tools and inspection tools.
994 */
995
/* Holds a newsgroup name and its hash, used to form a hash table mapping
   newsgroup hash values to the actual names. */
struct hashmap {
    HASH hash;   /* Hash of the group name (the hash-table key). */
    char *name;  /* Group name, xstrdup'd and owned by this entry. */
    char flag;   /* Group status flag (active file's fourth field). */
};
1003
/* Holds information needed by hash traversal functions.  Right now, this is
   just the pointer to the group index and a flag saying whether to fix
   problems or not. */
struct audit_data {
    struct group_index *index;  /* Group index being audited. */
    bool fix;                   /* Whether to repair problems found. */
};
1011
1012
1013 /*
1014 ** Hash table functions for the mapping from group hashes to names.
1015 */
1016 static unsigned long
hashmap_hash(const void * entry)1017 hashmap_hash(const void *entry)
1018 {
1019 unsigned long hash;
1020 const struct hashmap *group = entry;
1021
1022 memcpy(&hash, &group->hash, sizeof(hash));
1023 return hash;
1024 }
1025
1026
1027 static const void *
hashmap_key(const void * entry)1028 hashmap_key(const void *entry)
1029 {
1030 return &((const struct hashmap *) entry)->hash;
1031 }
1032
1033
1034 static bool
hashmap_equal(const void * key,const void * entry)1035 hashmap_equal(const void *key, const void *entry)
1036 {
1037 const HASH *first = key;
1038 const HASH *second;
1039
1040 second = &((const struct hashmap *) entry)->hash;
1041 return memcmp(first, second, sizeof(HASH)) == 0;
1042 }
1043
1044
static void
hashmap_delete(void *entry)
{
    struct hashmap *group = entry;

    /* The entry owns its name string; free it along with the entry. */
    free(group->name);
    free(group);
}
1053
1054
1055 /*
1056 ** Construct a hash table of group hashes to group names by scanning the
1057 ** active file. Returns the constructed hash table.
1058 */
1059 static struct hash *
hashmap_load(void)1060 hashmap_load(void)
1061 {
1062 struct hash *hash;
1063 QIOSTATE *active;
1064 char *activepath, *line;
1065 struct cvector *data = NULL;
1066 struct stat st;
1067 size_t hash_size;
1068 struct hashmap *group;
1069 HASH grouphash;
1070
1071 activepath = concatpath(innconf->pathdb, INN_PATH_ACTIVE);
1072 active = QIOopen(activepath);
1073 free(activepath);
1074 if (active == NULL)
1075 return NULL;
1076 if (fstat(QIOfileno(active), &st) < 0)
1077 hash_size = 32 * 1024;
1078 else
1079 hash_size = st.st_size / 30;
1080 hash = hash_create(hash_size, hashmap_hash, hashmap_key, hashmap_equal,
1081 hashmap_delete);
1082
1083 line = QIOread(active);
1084 while (line != NULL) {
1085 data = cvector_split_space(line, data);
1086 if (data->count != 4) {
1087 warn("tradindexed: malformed active file line %s", line);
1088 continue;
1089 }
1090 group = xmalloc(sizeof(struct hashmap));
1091 group->name = xstrdup(data->strings[0]);
1092 group->flag = data->strings[3][0];
1093 grouphash = Hash(group->name, strlen(group->name));
1094 memcpy(&group->hash, &grouphash, sizeof(HASH));
1095 hash_insert(hash, &group->hash, group);
1096 line = QIOread(active);
1097 }
1098 if (data != NULL)
1099 cvector_free(data);
1100 QIOclose(active);
1101 return hash;
1102 }
1103
1104
1105 /*
1106 ** Print the stored information about a single group in human-readable form
1107 ** to stdout. The format is:
1108 **
1109 ** name high low base count flag deleted inode
1110 **
1111 ** all on one line. Name is passed into this function.
1112 */
1113 void
tdx_index_print(const char * name,const struct group_entry * entry,FILE * output)1114 tdx_index_print(const char *name, const struct group_entry *entry,
1115 FILE *output)
1116 {
1117 fprintf(output, "%s %lu %lu %lu %lu %c %lu %lu\n", name, entry->high,
1118 entry->low, entry->base, (unsigned long) entry->count,
1119 entry->flag, (unsigned long) entry->deleted,
1120 (unsigned long) entry->indexinode);
1121 }
1122
1123
1124 /*
1125 ** Dump the complete contents of the group.index file in human-readable form
1126 ** to the specified file, one line per group.
1127 */
1128 void
tdx_index_dump(struct group_index * index,FILE * output)1129 tdx_index_dump(struct group_index *index, FILE *output)
1130 {
1131 int bucket;
1132 long current;
1133 struct group_entry *entry;
1134 struct hash *hashmap;
1135 struct hashmap *group;
1136 char *name;
1137
1138 if (index->header == NULL || index->entries == NULL)
1139 return;
1140 hashmap = hashmap_load();
1141 for (bucket = 0; bucket < TDX_HASH_SIZE; bucket++) {
1142 current = index->header->hash[bucket].recno;
1143 while (current != -1) {
1144 if (!index_maybe_remap(index, current))
1145 return;
1146 entry = index->entries + current;
1147 name = NULL;
1148 if (hashmap != NULL) {
1149 group = hash_lookup(hashmap, &entry->hash);
1150 if (group != NULL)
1151 name = group->name;
1152 }
1153 if (name == NULL)
1154 name = HashToText(entry->hash);
1155 tdx_index_print(name, entry, output);
1156 if (current == entry->next.recno) {
1157 warn("tradindexed: index loop for entry %ld", current);
1158 return;
1159 }
1160 current = entry->next.recno;
1161 }
1162 }
1163 if (hashmap != NULL)
1164 hash_free(hashmap);
1165 }
1166
1167
1168 /*
1169 ** Audit a particular group entry location to ensure that it points to a
1170 ** valid entry within the group index file. Takes a pointer to the location,
1171 ** the number of the location, a pointer to the group entry if any (if not,
1172 ** the location is assumed to be part of the header hash table), and a flag
1173 ** saying whether to fix problems that are found.
1174 */
1175 static void
index_audit_loc(struct group_index * index,int * loc,long number,struct group_entry * entry,bool fix)1176 index_audit_loc(struct group_index *index, int *loc, long number,
1177 struct group_entry *entry, bool fix)
1178 {
1179 bool error = false;
1180
1181 if (*loc >= index->count) {
1182 warn("tradindexed: out of range index %d in %s %ld",
1183 *loc, (entry == NULL ? "bucket" : "entry"), number);
1184 error = true;
1185 }
1186 if (*loc < 0 && *loc != -1) {
1187 warn("tradindexed: invalid negative index %d in %s %ld",
1188 *loc, (entry == NULL ? "bucket" : "entry"), number);
1189 error = true;
1190 }
1191 if (entry != NULL && *loc == number) {
1192 warn("tradindexed: index loop for entry %ld", number);
1193 error = true;
1194 }
1195
1196 if (fix && error) {
1197 *loc = -1;
1198 inn_msync_page(loc, sizeof(*loc), MS_ASYNC);
1199 }
1200 }
1201
1202
1203 /*
1204 ** Check an entry to see if it was actually deleted. Make sure that all the
1205 ** information is consistent with a deleted group if it's not and the fix
1206 ** flag is set.
1207 */
1208 static void
index_audit_deleted(struct group_entry * entry,long number,bool fix)1209 index_audit_deleted(struct group_entry *entry, long number, bool fix)
1210 {
1211 if (entry->deleted != 0 && !HashEmpty(entry->hash)) {
1212 warn("tradindexed: entry %ld has a delete time but a non-zero hash",
1213 number);
1214 if (fix) {
1215 HashClear(&entry->hash);
1216 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
1217 }
1218 }
1219 }
1220
1221
1222 /*
1223 ** Audit the group header for any inconsistencies. This checks the
1224 ** reachability of all of the group entries, makes sure that deleted entries
1225 ** are on the free list, and otherwise checks the linked structure of the
1226 ** whole file. The data in individual entries is not examined. If the
1227 ** second argument is true, also attempt to fix inconsistencies.
1228 */
static void
index_audit_header(struct group_index *index, bool fix)
{
    long bucket, current;
    struct group_entry *entry;
    int *parent, *next;
    bool *reachable;

    /* First, walk all of the regular hash buckets, making sure that all of
       the group location pointers are valid and sane, that all groups that
       have been deleted are correctly marked as such, and that all groups are
       in their correct hash chain.  Build reachability information as we go,
       used later to ensure that all group entries are reachable. */
    reachable = xcalloc(index->count, sizeof(bool));
    for (bucket = 0; bucket < TDX_HASH_SIZE; bucket++) {
        parent = &index->header->hash[bucket].recno;
        index_audit_loc(index, parent, bucket, NULL, fix);
        current = *parent;
        while (current >= 0 && current < index->count) {
            entry = &index->entries[current];
            next = &entry->next.recno;
            if (entry->deleted == 0 && bucket != index_bucket(entry->hash)) {
                warn("tradindexed: entry %ld is in bucket %ld instead of its"
                     " correct bucket %ld", current, bucket,
                     index_bucket(entry->hash));
                if (fix) {
                    /* Unlink the entry from this chain; the parent pointer
                       now holds the following record, so resume the walk
                       from the same parent. */
                    entry_splice(entry, parent);
                    next = parent;
                }
            } else {
                /* The entry stays in this chain; record that we reached it
                   and complain if it was already reached another way. */
                if (reachable[current])
                    warn("tradindexed: entry %ld is reachable from multiple"
                         " paths", current);
                reachable[current] = true;
            }
            index_audit_deleted(entry, current, fix);
            index_audit_loc(index, &entry->next.recno, current, entry, fix);
            if (entry->deleted != 0) {
                warn("tradindexed: entry %ld is deleted but not in the free"
                     " list", current);
                if (fix) {
                    /* Deleted entries belong on the free list; splice this
                       one out and mark it unreachable so that the final
                       pass below reattaches it to the free list. */
                    entry_splice(entry, parent);
                    next = parent;
                    reachable[current] = false;
                }
            }
            /* Stop rather than loop if the next pointer is self-referencing
               (index_audit_loc only repairs this when fix is set). */
            if (*next == current)
                break;
            parent = next;
            current = *parent;
        }
    }

    /* Now, walk the free list.  Make sure that each group in the free list is
       actually deleted, and update the reachability information. */
    index_audit_loc(index, &index->header->freelist.recno, 0, NULL, fix);
    parent = &index->header->freelist.recno;
    current = *parent;
    while (current >= 0 && current < index->count) {
        entry = &index->entries[current];
        index_audit_deleted(entry, current, fix);
        reachable[current] = true;
        if (!HashEmpty(entry->hash) && entry->deleted == 0) {
            warn("tradindexed: undeleted entry %ld in free list", current);
            if (fix) {
                /* Splice the live entry out of the free list and mark it
                   unreachable so the final pass re-adds it to its correct
                   hash chain. */
                entry_splice(entry, parent);
                reachable[current] = false;
            }
        }
        index_audit_loc(index, &entry->next.recno, current, entry, fix);
        if (entry->next.recno == current)
            break;
        parent = &entry->next.recno;
        current = *parent;
    }

    /* Finally, check all of the unreachable entries and if fix is true, try
       to reattach them in the appropriate location. */
    for (current = 0; current < index->count; current++)
        if (!reachable[current]) {
            warn("tradindexed: unreachable entry %ld", current);
            if (fix) {
                entry = &index->entries[current];
                /* Live entries go back into their hash chain; anything else
                   is cleared and put on the free list. */
                if (!HashEmpty(entry->hash) && entry->deleted == 0)
                    index_add(index, entry);
                else {
                    HashClear(&entry->hash);
                    entry->deleted = 0;
                    freelist_add(index, entry);
                }
            }
        }

    /* All done. */
    free(reachable);
}
1325
1326
1327 /*
1328 ** Audit a particular group entry for any inconsistencies. This doesn't
1329 ** check any of the structure, or whether the group is deleted, just the data
1330 ** as stored in the group data files (mostly by calling tdx_data_audit to do
1331 ** the real work). Note that while the low water mark may be updated, the
1332 ** high water mark is left unchanged.
1333 */
1334 static void
index_audit_group(struct group_index * index,struct group_entry * entry,struct hash * hashmap,bool fix)1335 index_audit_group(struct group_index *index, struct group_entry *entry,
1336 struct hash *hashmap, bool fix)
1337 {
1338 struct hashmap *group;
1339 ptrdiff_t offset;
1340
1341 offset = entry - index->entries;
1342 index_lock_group(index->fd, offset, INN_LOCK_WRITE);
1343 group = hash_lookup(hashmap, &entry->hash);
1344 if (group == NULL) {
1345 warn("tradindexed: group %ld not found in active file",
1346 entry_loc(index, entry));
1347 if (fix) {
1348 index_unlink_hash(index, entry->hash);
1349 HashClear(&entry->hash);
1350 entry->deleted = time(NULL);
1351 freelist_add(index, entry);
1352 }
1353 } else {
1354 if (entry->flag != group->flag) {
1355 entry->flag = group->flag;
1356 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
1357 }
1358 tdx_data_audit(group->name, entry, fix);
1359 }
1360 index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
1361 }
1362
1363
1364 /*
1365 ** Check to be sure that a given group exists in the overview index, and if
1366 ** missing, adds it. Assumes that the index isn't locked, since it calls the
1367 ** normal functions for adding new groups (this should only be called after
1368 ** the index has already been repaired, for the same reason). Called as a
1369 ** hash traversal function, walking the hash table of groups from the active
1370 ** file.
1371 */
1372 static void
index_audit_active(void * value,void * cookie)1373 index_audit_active(void *value, void *cookie)
1374 {
1375 struct hashmap *group = value;
1376 struct audit_data *data = cookie;
1377 struct group_entry *entry;
1378
1379 entry = tdx_index_entry(data->index, group->name);
1380 if (entry == NULL) {
1381 warn("tradindexed: group %s missing from overview", group->name);
1382 if (data->fix)
1383 tdx_index_add(data->index, group->name, 0, 0, &group->flag);
1384 }
1385 }
1386
1387
1388 /*
1389 ** Audit the group index for any inconsistencies. If the argument is true,
1390 ** also attempt to fix those inconsistencies.
1391 */
1392 void
tdx_index_audit(bool fix)1393 tdx_index_audit(bool fix)
1394 {
1395 struct group_index *index;
1396 struct stat st;
1397 off_t expected;
1398 int count;
1399 struct hash *hashmap;
1400 long bucket;
1401 struct group_entry *entry;
1402 struct audit_data data;
1403
1404 index = tdx_index_open(true);
1405 if (index == NULL)
1406 return;
1407
1408 /* Keep a lock on the header through the whole audit process. This will
1409 stall any newgroups or rmgroups, but not normal article reception. We
1410 don't want the structure of the group entries changing out from under
1411 us, although we don't mind if the data does until we're validating that
1412 particular group. */
1413 index_lock(index->fd, INN_LOCK_WRITE);
1414
1415 /* Make sure the size looks sensible. */
1416 if (fstat(index->fd, &st) < 0) {
1417 syswarn("tradindexed: cannot fstat %s", index->path);
1418 return;
1419 }
1420 count = index_entry_count(st.st_size);
1421 expected = index_file_size(count);
1422 if (expected != st.st_size) {
1423 syswarn("tradindexed: %ld bytes of trailing trash in %s",
1424 (unsigned long) (st.st_size - expected), index->path);
1425 if (fix)
1426 if (ftruncate(index->fd, expected) < 0)
1427 syswarn("tradindexed: cannot truncate %s", index->path);
1428 }
1429 index_maybe_remap(index, count);
1430
1431 /* Okay everything is now mapped and happy. Validate the header. */
1432 index_audit_header(index, fix);
1433 index_lock(index->fd, INN_LOCK_UNLOCK);
1434
1435 /* Walk all the group entries and check them individually. To do this, we
1436 need to map hashes to group names, so load a hash of the active file to
1437 do that resolution. */
1438 hashmap = hashmap_load();
1439 if (hashmap == NULL) {
1440 warn("tradindexed: cannot hash active file");
1441 return;
1442 }
1443 data.index = index;
1444 data.fix = fix;
1445 hash_traverse(hashmap, index_audit_active, &data);
1446 for (bucket = 0; bucket < index->count; bucket++) {
1447 entry = &index->entries[bucket];
1448 if (HashEmpty(entry->hash) || entry->deleted != 0)
1449 continue;
1450 index_audit_group(index, entry, hashmap, fix);
1451 }
1452 hash_free(hashmap);
1453 }
1454