1 /*
2 ** Group index handling for the tradindexed overview method.
3 **
4 ** Implements the handling of the group.index file for the tradindexed
5 ** overview method. This file contains an entry for every group and stores
6 ** the high and low article marks and the base article numbers for each
7 ** individual group index file.
8 **
9 ** Externally visible functions have a tdx_ prefix; internal functions do
10 ** not. (Externally visible unfortunately means everything that needs to be
11 ** visible outside of this object file, not just interfaces exported to
12 ** consumers of the overview API.)
13 **
14 ** This code has to support readers and writers sharing the same files, and
15 ** we want to avoid locking where possible since locking may be very slow
16 ** (such as over NFS). Each group has two data files (and one has to get the
17 ** right index file for a given data file or get mangled results) and one
18 ** piece of data in the main index file required to interpret the individual
19 ** index file, namely the article base of that index.
20 **
21 ** We can make the following assumptions:
22 **
23 ** - The high water mark for a group is monotonically increasing; in other
24 ** words, the highest numbered article in a group won't ever decrease.
25 **
26 ** - While the article base may either increase or decrease, it will never
27 ** change unless the inode of the index file on disk also changes, since
28 ** changing the base requires rewriting the index file.
29 **
**    - No two files will have the same inode (this requirement should be safe
**      even on unusual Unix file systems, since the files are all in the same
**      directory).
33 **
**  We therefore use the following procedure to update the data:  The high
**  water mark may be changed at any time, provided the change is made while
**  holding a write lock.  The base may only be changed as part of an index
**  rebuild.  An index rebuild proceeds as follows:
38 **
39 ** 1) Obtain a write lock on the group entry in the main index.
40 ** 2) Write out new index and data files to new temporary file names.
41 ** 3) Store the new index inode into the main index.
42 ** 4) Update the high, low, and base article numbers in the main index.
43 ** 5) Rename the data file to its correct name.
44 ** 6) Rename the index file to its correct name.
45 ** 7) Release the write lock.
46 **
47 ** We use the following procedure to read the data:
48 **
49 ** 1) Open the group data files (both index and data).
50 ** 2) Store copies of the current high water mark and base in variables.
51 ** 3) Check to be sure the index inode matches the master index file.
52 **
53 ** If it does match, then we have a consistent set of data, since the high
54 ** water mark and base values have to match the index we have (the inode
55 ** value is updated first). It may not be the most current set of data, but
56 ** since we have those index and data files open, even if they're later
57 ** rebuilt we'll continue looking at the same files. They may have further
58 ** data appended to them, but that's safe.
59 **
60 ** If the index inode doesn't match, someone's rebuilt the file while we were
61 ** trying to open it. Continue with the following procedure:
62 **
63 ** 4) Close the data files that we opened.
64 ** 5) Obtain a read lock on the group entry in the main index.
65 ** 6) Reopen the data files.
66 ** 7) Grab the current high water mark and base.
67 ** 8) Release the read lock.
68 **
69 ** In other words, if there appears to be contention, we fall back to using
70 ** locking so that we don't try to loop (which also avoids an infinite loop
71 ** in the event of corruption of the main index).
72 **
73 ** Note that once we have a consistent set of data files open, we don't need
74 ** to aggressively check for new data files until someone asks for an article
75 ** outside the range of articles that we know about. We may be working from
76 ** outdated data files, but the most we'll miss is a cancel or an expiration
77 ** run. Overview data doesn't change; new data is appended and old data is
78 ** expired. We can afford to check only every once in a while, just to be
79 ** sure that we're not going to hand out overview data for a bunch of expired
80 ** articles.
81 */
82
83 #include "portable/system.h"
84
85 #include "portable/mmap.h"
86 #include <errno.h>
87 #include <fcntl.h>
88 #include <limits.h>
89 #include <sys/stat.h>
90 #include <time.h>
91
92 #include "inn/fdflag.h"
93 #include "inn/hashtab.h"
94 #include "inn/innconf.h"
95 #include "inn/libinn.h"
96 #include "inn/messages.h"
97 #include "inn/mmap.h"
98 #include "inn/paths.h"
99 #include "inn/qio.h"
100 #include "inn/vector.h"
101 #include "tdx-private.h"
102 #include "tdx-structure.h"
103
/* Returned to callers as an opaque data type, this stashes all of the
   information about an open group.index file. */
struct group_index {
    /* Path of the group.index file; allocated in tdx_index_open and freed
       in tdx_index_close. */
    char *path;
    /* Descriptor of the open file; negative when the file is not open. */
    int fd;
    /* Whether the index was opened for writing. */
    bool writable;
    /* Header of the index (mmap'd or read into memory); NULL while the
       index is not mapped. */
    struct group_header *header;
    /* Array of group entries that follows the header; NULL while the index
       is not mapped. */
    struct group_entry *entries;
    /* Number of group entries covered by the current mapping. */
    int count;
};
114
/* Forward declaration.  The structure itself is defined with the recovery
   and auditing code further down in this file. */
struct hashmap;

/* Internal prototypes.  These helpers are called before their definitions
   appear in the file, so they are declared up front. */

/* Conversions between the size of group.index and its entry count. */
static int index_entry_count(size_t size);
static size_t index_file_size(int count);

/* Locking of the index header and of individual group entries. */
static bool index_lock(int fd, enum inn_locktype type);
static bool index_lock_group(int fd, ptrdiff_t offset, enum inn_locktype);

/* Mapping, remapping, unmapping, and growing the index. */
static bool index_map(struct group_index *);
static bool index_maybe_remap(struct group_index *, long loc);
static void index_unmap(struct group_index *);
static bool index_expand(struct group_index *);

/* Lookup of a group by name; returns its entry number or -1. */
static long index_find(struct group_index *, const char *group);
128
129
130 /*
131 ** Given a file size, return the number of group entries that it contains.
132 */
133 static int
index_entry_count(size_t size)134 index_entry_count(size_t size)
135 {
136 return (size - sizeof(struct group_header)) / sizeof(struct group_entry);
137 }
138
139
140 /*
141 ** Given a number of group entries, return the required file size.
142 */
143 static size_t
index_file_size(int count)144 index_file_size(int count)
145 {
146 return sizeof(struct group_header) + count * sizeof(struct group_entry);
147 }
148
149
150 /*
151 ** Lock the hash table for the group index, used to acquire global locks on
152 ** the group index when updating it.
153 */
154 static bool
index_lock(int fd,enum inn_locktype type)155 index_lock(int fd, enum inn_locktype type)
156 {
157 bool status;
158
159 status = inn_lock_range(fd, type, true, 0, sizeof(struct group_header));
160 if (!status)
161 syswarn("tradindexed: cannot %s index hash table",
162 (type == INN_LOCK_UNLOCK) ? "unlock" : "lock");
163 return status;
164 }
165
166
167 /*
168 ** Lock the group entry for a particular group. Takes the offset of that
169 ** group entry from the start of the group entries (not the start of the
170 ** file; we have to add the size of the group header). Used for coordinating
171 ** updates of the data for a group.
172 */
173 static bool
index_lock_group(int fd,ptrdiff_t offset,enum inn_locktype type)174 index_lock_group(int fd, ptrdiff_t offset, enum inn_locktype type)
175 {
176 bool status;
177 size_t size;
178
179 size = sizeof(struct group_entry);
180 offset = offset * size + sizeof(struct group_header);
181 status = inn_lock_range(fd, type, true, offset, size);
182 if (!status)
183 syswarn("tradindexed: cannot %s group entry at %lu",
184 (type == INN_LOCK_UNLOCK) ? "unlock" : "lock",
185 (unsigned long) offset);
186 return status;
187 }
188
189
190 /*
191 ** Memory map (or read into memory) the key portions of the group.index
192 ** file. Takes a struct group_index to fill in and returns true on success
193 ** and false on failure.
194 */
195 static bool
index_map(struct group_index * index)196 index_map(struct group_index *index)
197 {
198 if (!innconf->tradindexedmmap && index->writable) {
199 warn("tradindexed: cannot open for writing without mmap");
200 return false;
201 }
202
203 if (!innconf->tradindexedmmap) {
204 ssize_t header_size;
205 ssize_t entry_size;
206
207 header_size = sizeof(struct group_header);
208 entry_size = index->count * sizeof(struct group_entry);
209 index->header = xmalloc(header_size);
210 index->entries = xmalloc(entry_size);
211 if (read(index->fd, index->header, header_size) != header_size) {
212 syswarn("tradindexed: cannot read header from %s", index->path);
213 goto fail;
214 }
215 if (read(index->fd, index->entries, entry_size) != entry_size) {
216 syswarn("tradindexed: cannot read entries from %s", index->path);
217 goto fail;
218 }
219 return true;
220
221 fail:
222 free(index->header);
223 free(index->entries);
224 index->header = NULL;
225 index->entries = NULL;
226 return false;
227
228 } else {
229 char *data;
230 size_t size;
231 int flag = PROT_READ;
232
233 if (index->writable)
234 flag = PROT_READ | PROT_WRITE;
235 size = index_file_size(index->count);
236 data = mmap(NULL, size, flag, MAP_SHARED, index->fd, 0);
237 if (data == MAP_FAILED) {
238 syswarn("tradindexed: cannot mmap %s", index->path);
239 return false;
240 }
241 index->header = (struct group_header *) (void *) data;
242 index->entries =
243 (struct group_entry *) (void *) (data
244 + sizeof(struct group_header));
245 return true;
246 }
247 }
248
249
250 static bool
file_open_group_index(struct group_index * index,struct stat * st)251 file_open_group_index(struct group_index *index, struct stat *st)
252 {
253 int open_mode;
254
255 index->header = NULL;
256 open_mode = index->writable ? O_RDWR | O_CREAT : O_RDONLY;
257 index->fd = open(index->path, open_mode, ARTFILE_MODE);
258 if (index->fd < 0) {
259 syswarn("tradindexed: cannot open %s", index->path);
260 goto fail;
261 }
262
263 if (fstat(index->fd, st) < 0) {
264 syswarn("tradindexed: cannot fstat %s", index->path);
265 goto fail;
266 }
267 fdflag_close_exec(index->fd, true);
268 return true;
269
270 fail:
271 if (index->fd >= 0) {
272 close(index->fd);
273 index->fd = -1;
274 }
275 return false;
276 }
277
278
279 /*
280 ** Given a group location, remap the index file if our existing mapping isn't
281 ** large enough to include that group. (This can be the case when another
282 ** writer is appending entries to the group index.) Returns true on success
283 ** (which includes "did not need to remap") and false on failure.
284 */
285 static bool
index_maybe_remap(struct group_index * index,long loc)286 index_maybe_remap(struct group_index *index, long loc)
287 {
288 struct stat st;
289 int count;
290 int r;
291
292 if (loc < index->count)
293 return true;
294
295 /* Don't remap if remapping wouldn't actually help. */
296 r = fstat(index->fd, &st);
297 if (r == -1) {
298 if (errno == ESTALE) {
299 index_unmap(index);
300 if (!file_open_group_index(index, &st))
301 return false;
302 } else {
303 syswarn("tradindexed: cannot stat %s", index->path);
304 return false;
305 }
306 }
307 count = index_entry_count(st.st_size);
308 if (count < loc && index->header != NULL)
309 return true;
310
311 /* Okay, remapping will actually help. */
312 index_unmap(index);
313 index->count = count;
314 return index_map(index);
315 }
316
317
318 /*
319 ** Unmap the index file, either in preparation for closing the overview
320 ** method or to get ready to remap it. We warn about failures to munmap but
321 ** don't do anything about them; there isn't much that we can do.
322 */
323 static void
index_unmap(struct group_index * index)324 index_unmap(struct group_index *index)
325 {
326 if (index->header == NULL)
327 return;
328 if (!innconf->tradindexedmmap) {
329 free(index->header);
330 free(index->entries);
331 } else {
332 if (munmap(index->header, index_file_size(index->count)) < 0)
333 syswarn("tradindexed: cannot munmap %s", index->path);
334 }
335 index->header = NULL;
336 index->entries = NULL;
337 }
338
339
340 /*
341 ** Expand the group.index file to hold more entries; also used to build the
342 ** initial file. The caller is expected to lock the group index.
343 */
344 static bool
index_expand(struct group_index * index)345 index_expand(struct group_index *index)
346 {
347 int i;
348
349 index_unmap(index);
350 index->count += 1024;
351 if (ftruncate(index->fd, index_file_size(index->count)) < 0) {
352 syswarn("tradindexed: cannot expand %s", index->path);
353 return false;
354 }
355
356 /* If mapping the index fails, we've already extended it but we haven't
357 done anything with the new portion of the file. That means that it's
358 all zeroes, which means that it contains index entries who all think
359 their next entry is entry 0. We don't want to leave things in this
360 state (particularly if this was the first expansion of the index file,
361 in which case entry 0 points to entry 0 and our walking functions may
362 go into infinite loops). Undo the file expansion. */
363 if (!index_map(index)) {
364 index->count -= 1024;
365 if (ftruncate(index->fd, index_file_size(index->count)) < 0) {
366 syswarn("tradindexed: cannot shrink %s", index->path);
367 }
368 return false;
369 }
370
371 /* If the magic isn't right, assume this is a new index file. */
372 if (index->header->magic != TDX_MAGIC) {
373 index->header->magic = TDX_MAGIC;
374 index->header->freelist.recno = -1;
375 for (i = 0; i < TDX_HASH_SIZE; i++)
376 index->header->hash[i].recno = -1;
377 }
378
379 /* Walk the new entries back to front, adding them to the free list. */
380 for (i = index->count - 1; i >= index->count - 1024; i--) {
381 index->entries[i].next = index->header->freelist;
382 index->header->freelist.recno = i;
383 }
384
385 inn_msync_page(index->header, index_file_size(index->count), MS_ASYNC);
386 return true;
387 }
388
389
390 /*
391 ** Open the group.index file and allocate a new struct for it, returning a
392 ** pointer to that struct. Takes a bool saying whether or not the overview
393 ** should be opened for write.
394 */
395 struct group_index *
tdx_index_open(bool writable)396 tdx_index_open(bool writable)
397 {
398 struct group_index *index;
399 struct stat st;
400
401 index = xmalloc(sizeof(struct group_index));
402 index->path = concatpath(innconf->pathoverview, "group.index");
403 index->writable = writable;
404 if (!file_open_group_index(index, &st)) {
405 goto fail;
406 }
407 if ((size_t) st.st_size > sizeof(struct group_header)) {
408 index->count = index_entry_count(st.st_size);
409 if (!index_map(index))
410 goto fail;
411 } else {
412 index->count = 0;
413 if (index->writable) {
414 if (st.st_size > 0)
415 warn("tradindexed: recreating truncated %s", index->path);
416 if (!index_expand(index))
417 goto fail;
418 } else {
419 index->header = NULL;
420 index->entries = NULL;
421 }
422 }
423 return index;
424
425 fail:
426 tdx_index_close(index);
427 return NULL;
428 }
429
430
431 /*
432 ** Given a group name hash, return an index into the hash table in the
433 ** group.index header.
434 */
435 static long
index_bucket(HASH hash)436 index_bucket(HASH hash)
437 {
438 unsigned int bucket;
439
440 memcpy(&bucket, &hash, sizeof(bucket));
441 return bucket % TDX_HASH_SIZE;
442 }
443
444
445 /*
446 ** Given a pointer to a group entry, return its location number.
447 */
448 static long
entry_loc(const struct group_index * index,const struct group_entry * entry)449 entry_loc(const struct group_index *index, const struct group_entry *entry)
450 {
451 return entry - index->entries;
452 }
453
454
455 /*
456 ** Splice out a particular group entry. Takes the entry and a pointer to the
457 ** location where a pointer to it is stored.
458 */
459 static void
entry_splice(struct group_entry * entry,int * parent)460 entry_splice(struct group_entry *entry, int *parent)
461 {
462 *parent = entry->next.recno;
463 entry->next.recno = -1;
464 inn_msync_page(parent, sizeof(*parent), MS_ASYNC);
465 }
466
467
468 /*
469 ** Add a new entry to the appropriate hash chain.
470 */
471 static void
index_add(struct group_index * index,struct group_entry * entry)472 index_add(struct group_index *index, struct group_entry *entry)
473 {
474 long bucket, loc;
475
476 bucket = index_bucket(entry->hash);
477 loc = entry_loc(index, entry);
478 if (loc == index->header->hash[bucket].recno) {
479 warn("tradindexed: refusing to add a loop for %ld in bucket %ld", loc,
480 bucket);
481 return;
482 }
483 entry->next.recno = index->header->hash[bucket].recno;
484 index->header->hash[bucket].recno = entry_loc(index, entry);
485 inn_msync_page(&index->header->hash[bucket], sizeof(struct loc), MS_ASYNC);
486 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
487 }
488
489
490 /*
491 ** Find a group in the index file, returning the group number for that group
492 ** or -1 if the group can't be found.
493 */
494 static long
index_find(struct group_index * index,const char * group)495 index_find(struct group_index *index, const char *group)
496 {
497 HASH hash;
498 long loc;
499
500 if (index->header == NULL || index->entries == NULL)
501 return -1;
502 hash = Hash(group, strlen(group));
503 if (innconf->nfsreader && !index_maybe_remap(index, LONG_MAX))
504 return -1;
505 loc = index->header->hash[index_bucket(hash)].recno;
506
507 while (loc >= 0) {
508 struct group_entry *entry;
509
510 if (loc >= index->count) {
511 if (!index_maybe_remap(index, loc)) {
512 return -1;
513 }
514 if (loc >= index->count) {
515 syswarn("tradindexed: entry %ld out of range", loc);
516 return -1;
517 }
518 }
519 entry = index->entries + loc;
520 if (entry->deleted == 0)
521 if (memcmp(&hash, &entry->hash, sizeof(hash)) == 0)
522 return loc;
523 if (loc == entry->next.recno) {
524 syswarn("tradindexed: index loop for entry %ld", loc);
525 return -1;
526 }
527 loc = entry->next.recno;
528 }
529 return -1;
530 }
531
532
533 /*
534 ** Add a given entry to the free list.
535 */
536 static void
freelist_add(struct group_index * index,struct group_entry * entry)537 freelist_add(struct group_index *index, struct group_entry *entry)
538 {
539 entry->next.recno = index->header->freelist.recno;
540 index->header->freelist.recno = entry_loc(index, entry);
541 inn_msync_page(&index->header->freelist, sizeof(struct loc), MS_ASYNC);
542 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
543 }
544
545
546 /*
547 ** Find an entry by hash value (rather than group name) and splice it out of
548 ** whatever chain it might belong to. This function is called by both
549 ** index_unlink and index_audit_group. Locking must be done by the caller.
550 ** Returns the group location of the spliced group.
551 */
552 static long
index_unlink_hash(struct group_index * index,HASH hash)553 index_unlink_hash(struct group_index *index, HASH hash)
554 {
555 int *parent;
556 long current;
557
558 parent = &index->header->hash[index_bucket(hash)].recno;
559 current = *parent;
560
561 while (current >= 0) {
562 struct group_entry *entry;
563
564 if (current >= index->count) {
565 if (!index_maybe_remap(index, current)) {
566 return -1;
567 }
568 parent = &index->header->hash[index_bucket(hash)].recno;
569 current = *parent;
570 if (current < 0 || current >= index->count) {
571 syswarn("tradindexed: entry %ld out of range", current);
572 return -1;
573 }
574 }
575 entry = &index->entries[current];
576 if (entry->deleted == 0)
577 if (memcmp(&hash, &entry->hash, sizeof(hash)) == 0) {
578 entry_splice(entry, parent);
579 return current;
580 }
581 if (current == entry->next.recno) {
582 syswarn("tradindexed: index loop for entry %ld", current);
583 return -1;
584 }
585 parent = &entry->next.recno;
586 current = *parent;
587 }
588 return -1;
589 }
590
591
/*
**  Like index_find, but the entry is also taken off whatever chain it is
**  on.  The group name is hashed and the work is handed to
**  index_unlink_hash.  This function is called by tdx_index_delete.
**  Locking must be done by the caller.
*/
static long
index_unlink(struct group_index *index, const char *group)
{
    return index_unlink_hash(index, Hash(group, strlen(group)));
}
605
606
607 /*
608 ** Return the information stored about a given group in the group index.
609 */
610 struct group_entry *
tdx_index_entry(struct group_index * index,const char * group)611 tdx_index_entry(struct group_index *index, const char *group)
612 {
613 long loc;
614 struct group_entry *entry;
615
616 loc = index_find(index, group);
617 if (loc == -1)
618 return NULL;
619 entry = index->entries + loc;
620 if (innconf->tradindexedmmap && innconf->nfsreader)
621 inn_msync_page(entry, sizeof *entry, MS_INVALIDATE);
622 return entry;
623 }
624
625
626 /*
627 ** Add a new newsgroup to the group.index file. Takes the newsgroup name,
628 ** its high and low water marks, and the newsgroup flag. Note that aliased
629 ** newsgroups are not currently handled. If the group already exists, just
630 ** update the flag (not the high and low water marks).
631 */
632 bool
tdx_index_add(struct group_index * index,const char * group,ARTNUM low,ARTNUM high,const char * flag)633 tdx_index_add(struct group_index *index, const char *group, ARTNUM low,
634 ARTNUM high, const char *flag)
635 {
636 HASH hash;
637 long loc;
638 struct group_entry *entry;
639 struct group_data *data;
640
641 if (!index->writable)
642 return false;
643
644 /* If the group already exists, update the flag as necessary and then
645 we're all done. */
646 loc = index_find(index, group);
647 if (loc != -1) {
648 entry = &index->entries[loc];
649 if (entry->flag != *flag) {
650 entry->flag = *flag;
651 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
652 }
653 return true;
654 }
655
656 index_lock(index->fd, INN_LOCK_WRITE);
657
658 /* Find a free entry. If we don't have any free space, make some. */
659 if (index->header->freelist.recno == -1)
660 if (!index_expand(index)) {
661 index_lock(index->fd, INN_LOCK_UNLOCK);
662 return false;
663 }
664 loc = index->header->freelist.recno;
665 index->header->freelist.recno = index->entries[loc].next.recno;
666 inn_msync_page(&index->header->freelist, sizeof(struct loc), MS_ASYNC);
667
668 /* Initialize the entry. */
669 entry = &index->entries[loc];
670 hash = Hash(group, strlen(group));
671 entry->hash = hash;
672 entry->low = (low == 0 && high != 0) ? high + 1 : low;
673 entry->high = high;
674 entry->deleted = 0;
675 entry->base = 0;
676 entry->count = 0;
677 entry->flag = *flag;
678 data = tdx_data_new(group, index->writable);
679 if (!tdx_data_open_files(data))
680 warn("tradindexed: unable to create data files for %s", group);
681 entry->indexinode = data->indexinode;
682 tdx_data_close(data);
683 index_add(index, entry);
684
685 index_lock(index->fd, INN_LOCK_UNLOCK);
686 return true;
687 }
688
689
690 /*
691 ** Delete a group index entry.
692 */
693 bool
tdx_index_delete(struct group_index * index,const char * group)694 tdx_index_delete(struct group_index *index, const char *group)
695 {
696 long loc;
697 struct group_entry *entry;
698
699 if (!index->writable)
700 return false;
701
702 /* Lock the header for the entire operation, mostly as prevention against
703 interfering with ongoing audits (which lock while they're running). */
704 index_lock(index->fd, INN_LOCK_WRITE);
705
706 /* Splice out the entry and mark it as deleted. */
707 loc = index_unlink(index, group);
708 if (loc == -1) {
709 index_lock(index->fd, INN_LOCK_UNLOCK);
710 return false;
711 }
712 entry = &index->entries[loc];
713 entry->deleted = time(NULL);
714 HashClear(&entry->hash);
715
716 /* Add the entry to the free list. */
717 freelist_add(index, entry);
718 index_lock(index->fd, INN_LOCK_UNLOCK);
719
720 /* Delete the group data files for this group. */
721 tdx_data_delete(group, NULL);
722
723 return true;
724 }
725
726
727 /*
728 ** Close an open handle to the group index file, freeing the group_index
729 ** structure at the same time. The argument to this function becomes invalid
730 ** after this call.
731 */
732 void
tdx_index_close(struct group_index * index)733 tdx_index_close(struct group_index *index)
734 {
735 index_unmap(index);
736 if (index->fd >= 0) {
737 close(index->fd);
738 index->fd = -1;
739 }
740 free(index->path);
741 free(index);
742 }
743
744
/*
**  Open the data files for a particular group.  The interface to this has to
**  be in this file because we have to lock the group and retry if the inode
**  of the opened index file doesn't match the one recorded in the group index
**  file.  Optionally take a pointer to the group index entry if the caller
**  has already gone to the work of finding it; if entry is NULL, it is looked
**  up by group name.
**
**  Returns the opened group data with its high and base fields set from the
**  group index entry, or NULL if the group is not in the index or its data
**  files cannot be opened.
*/
struct group_data *
tdx_data_open(struct group_index *index, const char *group,
              struct group_entry *entry)
{
    struct group_data *data;
    ARTNUM high, base;
    ptrdiff_t offset;

    if (entry == NULL) {
        entry = tdx_index_entry(index, group);
        if (entry == NULL)
            return NULL;
    }

    /* The entry number, which is what index_lock_group expects. */
    offset = entry - index->entries;
    data = tdx_data_new(group, index->writable);

    /* Check to see if the inode of the index file matches.  If it doesn't,
       this probably means that as we were opening the index file, someone
       else rewrote it (either expire or repack).  Obtain a lock and try
       again.  If there's still a mismatch, go with what we get; there's some
       sort of corruption.

       This code is very sensitive to order and parallelism.  See the comment
       at the beginning of this file for methodology. */
    if (!tdx_data_open_files(data))
        goto fail;

    /* Copy high and base before comparing inodes, so that a matching inode
       vouches for the values we copied. */
    high = entry->high;
    base = entry->base;
    if (entry->indexinode != data->indexinode) {
        /* Retry under a read lock on the group entry.
           NOTE(review): the files are reopened through tdx_data_open_files
           without an explicit close here; presumably it releases what it
           opened before -- confirm against its definition. */
        index_lock_group(index->fd, offset, INN_LOCK_READ);
        if (!tdx_data_open_files(data)) {
            index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
            goto fail;
        }
        if (entry->indexinode != data->indexinode)
            warn("tradindexed: index inode mismatch for %s", group);
        high = entry->high;
        base = entry->base;
        index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
    }
    data->high = high;
    data->base = base;
    return data;

fail:
    tdx_data_close(data);
    return NULL;
}
800
801
/*
**  Add an overview record for a particular article.  Takes the group index,
**  the group entry, the open overview data structure, and the information
**  about the article and returns true on success, false on failure
**  (including when the index is not writable).  This function calls
**  tdx_data_store to do most of the real work and then updates the index
**  information.  The group entry is write-locked for the duration of the
**  update and unlocked again on every return path after the lock is taken.
*/
bool
tdx_data_add(struct group_index *index, struct group_entry *entry,
             struct group_data *data, const struct article *article)
{
    ARTNUM old_base;
    ino_t old_inode;
    ptrdiff_t offset = entry - index->entries;

    if (!index->writable)
        return false;
    index_lock_group(index->fd, offset, INN_LOCK_WRITE);

    /* Make sure we have the most current data files and that we have the
       right base article number. */
    if (entry->indexinode != data->indexinode) {
        if (!tdx_data_open_files(data))
            goto fail;
        if (entry->indexinode != data->indexinode)
            warn("tradindexed: index inode mismatch for %s",
                 HashToText(entry->hash));
        data->base = entry->base;
    }

    /* If the article number is too low to store in the group index, repack
       the group with a lower base index.  The new inode and base are
       recorded in the entry before the pack is finished, and the old values
       are put back if finishing the pack fails. */
    if (entry->base > article->number) {
        if (!tdx_data_pack_start(data, article->number))
            goto fail;
        old_inode = entry->indexinode;
        old_base = entry->base;
        entry->indexinode = data->indexinode;
        entry->base = data->base;
        inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
        if (!tdx_data_pack_finish(data)) {
            entry->base = old_base;
            entry->indexinode = old_inode;
            inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
            goto fail;
        }
    }

    /* Store the data, then bring the entry's base, low and high water
       marks, and article count up to date. */
    if (!tdx_data_store(data, article))
        goto fail;
    if (entry->base == 0)
        entry->base = data->base;
    if (entry->low == 0 || entry->low > article->number)
        entry->low = article->number;
    if (entry->high < article->number)
        entry->high = article->number;
    entry->count++;

    /* Used to know that we have to remap the data file owing to our
       OVSTATICSEARCH (an article whose number is lower than the highest has
       been added at the end of the file). */
    if (data->high > article->number)
        data->remapoutoforder = true;

    inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
    index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
    return true;

fail:
    index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
    return false;
}
875
876
877 /*
878 ** Start a rebuild of the group data for a newsgroup. Right now, all this
879 ** does is lock the group index entry.
880 */
881 bool
tdx_index_rebuild_start(struct group_index * index,struct group_entry * entry)882 tdx_index_rebuild_start(struct group_index *index, struct group_entry *entry)
883 {
884 ptrdiff_t offset;
885
886 offset = entry - index->entries;
887 return index_lock_group(index->fd, offset, INN_LOCK_WRITE);
888 }
889
890
891 /*
892 ** Finish a rebuild of the group data for a newsgroup. Takes the old and new
893 ** entry and writes the data from the new entry into the group index, and
894 ** then unlocks it.
895 */
896 bool
tdx_index_rebuild_finish(struct group_index * index,struct group_entry * entry,struct group_entry * new)897 tdx_index_rebuild_finish(struct group_index *index, struct group_entry *entry,
898 struct group_entry *new)
899 {
900 ptrdiff_t offset;
901 ino_t new_inode;
902
903 new_inode = new->indexinode;
904 new->indexinode = entry->indexinode;
905 *entry = *new;
906 entry->indexinode = new_inode;
907 new->indexinode = new_inode;
908 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
909 offset = entry - index->entries;
910 index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
911 return true;
912 }
913
914
915 /*
916 ** Expire a single newsgroup. Most of the work is done by tdx_data_expire*,
917 ** but this routine has the responsibility to do locking (the same as would
918 ** be done for repacking, since the group base may change) and updating the
919 ** group entry.
920 */
921 bool
tdx_expire(const char * group,ARTNUM * low,struct history * history)922 tdx_expire(const char *group, ARTNUM *low, struct history *history)
923 {
924 struct group_index *index;
925 struct group_entry *entry;
926 struct group_entry new_entry;
927 struct group_data *data = NULL;
928 ptrdiff_t offset;
929 ARTNUM old_base;
930 ino_t old_inode;
931
932 index = tdx_index_open(true);
933 if (index == NULL)
934 return false;
935 entry = tdx_index_entry(index, group);
936 if (entry == NULL) {
937 tdx_index_close(index);
938 return false;
939 }
940 tdx_index_rebuild_start(index, entry);
941
942 /* tdx_data_expire_start builds the new IDX and DAT files and fills in the
943 struct group_entry that was passed to it. tdx_data_rebuild_finish does
944 the renaming of the new files to the final file names. */
945 new_entry = *entry;
946 new_entry.low = 0;
947 new_entry.count = 0;
948 new_entry.base = 0;
949 data = tdx_data_open(index, group, entry);
950 if (data == NULL)
951 goto fail;
952 if (!tdx_data_expire_start(group, data, &new_entry, history))
953 goto fail;
954 old_inode = entry->indexinode;
955 old_base = entry->base;
956 entry->indexinode = new_entry.indexinode;
957 entry->base = new_entry.base;
958 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
959 tdx_data_close(data);
960 if (!tdx_data_rebuild_finish(group)) {
961 entry->base = old_base;
962 entry->indexinode = old_inode;
963 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
964 goto fail;
965 }
966
967 /* Almost done. Update the group index. If there are no articles in the
968 group, the low water mark should be one more than the high water
969 mark. */
970 if (new_entry.low == 0)
971 new_entry.low = new_entry.high + 1;
972 tdx_index_rebuild_finish(index, entry, &new_entry);
973 if (low != NULL)
974 *low = entry->low;
975 tdx_index_close(index);
976 return true;
977
978 fail:
979 offset = entry - index->entries;
980 index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
981 if (data != NULL)
982 tdx_data_close(data);
983 tdx_index_close(index);
984 return false;
985 }
986
987
988 /*
989 ** RECOVERY AND AUDITING
990 **
991 ** All code below this point is not used in the normal operations of the
992 ** overview method. Instead, it's code to dump various data structures or
993 ** audit them for consistency, used by recovery tools and inspection tools.
994 */
995
/* Holds a newsgroup name and its hash, used to form a hash table mapping
   newsgroup hash values to the actual names.  hashmap_load builds one of
   these per active file line: name is a freshly allocated copy of the first
   field (released by hashmap_delete), hash is the Hash() of that name, and
   flag is the first character of the fourth field. */
struct hashmap {
    HASH hash;
    char *name;
    char flag;
};
1003
/* Holds information needed by hash traversal functions.  Right now, this is
   just the pointer to the group index and a flag saying whether to fix
   problems or not.  tdx_index_audit fills one in and passes it as the cookie
   to index_audit_active. */
struct audit_data {
    struct group_index *index;
    bool fix;
};
1011
1012
1013 /*
1014 ** Hash table functions for the mapping from group hashes to names.
1015 */
1016 static unsigned long
hashmap_hash(const void * entry)1017 hashmap_hash(const void *entry)
1018 {
1019 unsigned long hash;
1020 const struct hashmap *group = entry;
1021
1022 memcpy(&hash, &group->hash, sizeof(hash));
1023 return hash;
1024 }
1025
1026
1027 static const void *
hashmap_key(const void * entry)1028 hashmap_key(const void *entry)
1029 {
1030 return &((const struct hashmap *) entry)->hash;
1031 }
1032
1033
1034 static bool
hashmap_equal(const void * key,const void * entry)1035 hashmap_equal(const void *key, const void *entry)
1036 {
1037 const HASH *first = key;
1038 const HASH *second;
1039
1040 second = &((const struct hashmap *) entry)->hash;
1041 return memcmp(first, second, sizeof(HASH)) == 0;
1042 }
1043
1044
1045 static void
hashmap_delete(void * entry)1046 hashmap_delete(void *entry)
1047 {
1048 struct hashmap *group = entry;
1049
1050 free(group->name);
1051 free(group);
1052 }
1053
1054
1055 /*
1056 ** Construct a hash table of group hashes to group names by scanning the
1057 ** active file. Returns the constructed hash table.
1058 */
1059 static struct hash *
hashmap_load(void)1060 hashmap_load(void)
1061 {
1062 struct hash *hash;
1063 QIOSTATE *active;
1064 char *activepath, *line;
1065 struct cvector *data = NULL;
1066 struct stat st;
1067 size_t hash_size;
1068 struct hashmap *group;
1069 HASH grouphash;
1070
1071 activepath = concatpath(innconf->pathdb, INN_PATH_ACTIVE);
1072 active = QIOopen(activepath);
1073 free(activepath);
1074 if (active == NULL)
1075 return NULL;
1076 if (fstat(QIOfileno(active), &st) < 0)
1077 hash_size = 32 * 1024;
1078 else
1079 hash_size = st.st_size / 30;
1080 hash = hash_create(hash_size, hashmap_hash, hashmap_key, hashmap_equal,
1081 hashmap_delete);
1082
1083 line = QIOread(active);
1084 while (line != NULL) {
1085 data = cvector_split_space(line, data);
1086 if (data->count != 4) {
1087 warn("tradindexed: malformed active file line %s", line);
1088 continue;
1089 }
1090 group = xmalloc(sizeof(struct hashmap));
1091 group->name = xstrdup(data->strings[0]);
1092 group->flag = data->strings[3][0];
1093 grouphash = Hash(group->name, strlen(group->name));
1094 memcpy(&group->hash, &grouphash, sizeof(HASH));
1095 hash_insert(hash, &group->hash, group);
1096 line = QIOread(active);
1097 }
1098 if (data != NULL)
1099 cvector_free(data);
1100 QIOclose(active);
1101 return hash;
1102 }
1103
1104
1105 /*
1106 ** Print the stored information about a single group in human-readable form
1107 ** to stdout. The format is:
1108 **
1109 ** name high low base count flag deleted inode
1110 **
1111 ** all on one line. Name is passed into this function.
1112 */
1113 void
tdx_index_print(const char * name,const struct group_entry * entry,FILE * output)1114 tdx_index_print(const char *name, const struct group_entry *entry,
1115 FILE *output)
1116 {
1117 fprintf(output, "%s %lu %lu %lu %lu %c %lu %lu\n", name, entry->high,
1118 entry->low, entry->base, (unsigned long) entry->count, entry->flag,
1119 (unsigned long) entry->deleted, (unsigned long) entry->indexinode);
1120 }
1121
1122
1123 /*
1124 ** Dump the complete contents of the group.index file in human-readable form
1125 ** to the specified file, one line per group.
1126 */
1127 void
tdx_index_dump(struct group_index * index,FILE * output)1128 tdx_index_dump(struct group_index *index, FILE *output)
1129 {
1130 int bucket;
1131 long current;
1132 struct group_entry *entry;
1133 struct hash *hashmap;
1134 struct hashmap *group;
1135 char *name;
1136
1137 if (index->header == NULL || index->entries == NULL)
1138 return;
1139 hashmap = hashmap_load();
1140 for (bucket = 0; bucket < TDX_HASH_SIZE; bucket++) {
1141 current = index->header->hash[bucket].recno;
1142 while (current != -1) {
1143 if (!index_maybe_remap(index, current))
1144 return;
1145 entry = index->entries + current;
1146 name = NULL;
1147 if (hashmap != NULL) {
1148 group = hash_lookup(hashmap, &entry->hash);
1149 if (group != NULL)
1150 name = group->name;
1151 }
1152 if (name == NULL)
1153 name = HashToText(entry->hash);
1154 tdx_index_print(name, entry, output);
1155 if (current == entry->next.recno) {
1156 warn("tradindexed: index loop for entry %ld", current);
1157 return;
1158 }
1159 current = entry->next.recno;
1160 }
1161 }
1162 if (hashmap != NULL)
1163 hash_free(hashmap);
1164 }
1165
1166
1167 /*
1168 ** Audit a particular group entry location to ensure that it points to a
1169 ** valid entry within the group index file. Takes a pointer to the location,
1170 ** the number of the location, a pointer to the group entry if any (if not,
1171 ** the location is assumed to be part of the header hash table), and a flag
1172 ** saying whether to fix problems that are found.
1173 */
1174 static void
index_audit_loc(struct group_index * index,int * loc,long number,struct group_entry * entry,bool fix)1175 index_audit_loc(struct group_index *index, int *loc, long number,
1176 struct group_entry *entry, bool fix)
1177 {
1178 bool error = false;
1179
1180 if (*loc >= index->count) {
1181 warn("tradindexed: out of range index %d in %s %ld", *loc,
1182 (entry == NULL ? "bucket" : "entry"), number);
1183 error = true;
1184 }
1185 if (*loc < 0 && *loc != -1) {
1186 warn("tradindexed: invalid negative index %d in %s %ld", *loc,
1187 (entry == NULL ? "bucket" : "entry"), number);
1188 error = true;
1189 }
1190 if (entry != NULL && *loc == number) {
1191 warn("tradindexed: index loop for entry %ld", number);
1192 error = true;
1193 }
1194
1195 if (fix && error) {
1196 *loc = -1;
1197 inn_msync_page(loc, sizeof(*loc), MS_ASYNC);
1198 }
1199 }
1200
1201
1202 /*
1203 ** Check an entry to see if it was actually deleted. Make sure that all the
1204 ** information is consistent with a deleted group if it's not and the fix
1205 ** flag is set.
1206 */
1207 static void
index_audit_deleted(struct group_entry * entry,long number,bool fix)1208 index_audit_deleted(struct group_entry *entry, long number, bool fix)
1209 {
1210 if (entry->deleted != 0 && !HashEmpty(entry->hash)) {
1211 warn("tradindexed: entry %ld has a delete time but a non-zero hash",
1212 number);
1213 if (fix) {
1214 HashClear(&entry->hash);
1215 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
1216 }
1217 }
1218 }
1219
1220
1221 /*
1222 ** Audit the group header for any inconsistencies. This checks the
1223 ** reachability of all of the group entries, makes sure that deleted entries
1224 ** are on the free list, and otherwise checks the linked structure of the
1225 ** whole file. The data in individual entries is not examined. If the
1226 ** second argument is true, also attempt to fix inconsistencies.
1227 */
1228 static void
index_audit_header(struct group_index * index,bool fix)1229 index_audit_header(struct group_index *index, bool fix)
1230 {
1231 long bucket, current;
1232 struct group_entry *entry;
1233 int *parent, *next;
1234 bool *reachable;
1235
1236 /* First, walk all of the regular hash buckets, making sure that all of
1237 the group location pointers are valid and sane, that all groups that
1238 have been deleted are correctly marked as such, and that all groups are
1239 in their correct hash chain. Build reachability information as we go,
1240 used later to ensure that all group entries are reachable. */
1241 reachable = xcalloc(index->count, sizeof(bool));
1242 for (bucket = 0; bucket < TDX_HASH_SIZE; bucket++) {
1243 parent = &index->header->hash[bucket].recno;
1244 index_audit_loc(index, parent, bucket, NULL, fix);
1245 current = *parent;
1246 while (current >= 0 && current < index->count) {
1247 entry = &index->entries[current];
1248 next = &entry->next.recno;
1249 if (entry->deleted == 0 && bucket != index_bucket(entry->hash)) {
1250 warn("tradindexed: entry %ld is in bucket %ld instead of its"
1251 " correct bucket %ld",
1252 current, bucket, index_bucket(entry->hash));
1253 if (fix) {
1254 entry_splice(entry, parent);
1255 next = parent;
1256 }
1257 } else {
1258 if (reachable[current])
1259 warn("tradindexed: entry %ld is reachable from multiple"
1260 " paths",
1261 current);
1262 reachable[current] = true;
1263 }
1264 index_audit_deleted(entry, current, fix);
1265 index_audit_loc(index, &entry->next.recno, current, entry, fix);
1266 if (entry->deleted != 0) {
1267 warn("tradindexed: entry %ld is deleted but not in the free"
1268 " list",
1269 current);
1270 if (fix) {
1271 entry_splice(entry, parent);
1272 next = parent;
1273 reachable[current] = false;
1274 }
1275 }
1276 if (*next == current)
1277 break;
1278 parent = next;
1279 current = *parent;
1280 }
1281 }
1282
1283 /* Now, walk the free list. Make sure that each group in the free list is
1284 actually deleted, and update the reachability information. */
1285 index_audit_loc(index, &index->header->freelist.recno, 0, NULL, fix);
1286 parent = &index->header->freelist.recno;
1287 current = *parent;
1288 while (current >= 0 && current < index->count) {
1289 entry = &index->entries[current];
1290 index_audit_deleted(entry, current, fix);
1291 reachable[current] = true;
1292 if (!HashEmpty(entry->hash) && entry->deleted == 0) {
1293 warn("tradindexed: undeleted entry %ld in free list", current);
1294 if (fix) {
1295 entry_splice(entry, parent);
1296 reachable[current] = false;
1297 }
1298 }
1299 index_audit_loc(index, &entry->next.recno, current, entry, fix);
1300 if (entry->next.recno == current)
1301 break;
1302 parent = &entry->next.recno;
1303 current = *parent;
1304 }
1305
1306 /* Finally, check all of the unreachable entries and if fix is true, try
1307 to reattach them in the appropriate location. */
1308 for (current = 0; current < index->count; current++)
1309 if (!reachable[current]) {
1310 warn("tradindexed: unreachable entry %ld", current);
1311 if (fix) {
1312 entry = &index->entries[current];
1313 if (!HashEmpty(entry->hash) && entry->deleted == 0)
1314 index_add(index, entry);
1315 else {
1316 HashClear(&entry->hash);
1317 entry->deleted = 0;
1318 freelist_add(index, entry);
1319 }
1320 }
1321 }
1322
1323 /* All done. */
1324 free(reachable);
1325 }
1326
1327
1328 /*
1329 ** Audit a particular group entry for any inconsistencies. This doesn't
1330 ** check any of the structure, or whether the group is deleted, just the data
1331 ** as stored in the group data files (mostly by calling tdx_data_audit to do
1332 ** the real work). Note that while the low water mark may be updated, the
1333 ** high water mark is left unchanged.
1334 */
1335 static void
index_audit_group(struct group_index * index,struct group_entry * entry,struct hash * hashmap,bool fix)1336 index_audit_group(struct group_index *index, struct group_entry *entry,
1337 struct hash *hashmap, bool fix)
1338 {
1339 struct hashmap *group;
1340 ptrdiff_t offset;
1341
1342 offset = entry - index->entries;
1343 index_lock_group(index->fd, offset, INN_LOCK_WRITE);
1344 group = hash_lookup(hashmap, &entry->hash);
1345 if (group == NULL) {
1346 warn("tradindexed: group %ld not found in active file",
1347 entry_loc(index, entry));
1348 if (fix) {
1349 index_unlink_hash(index, entry->hash);
1350 HashClear(&entry->hash);
1351 entry->deleted = time(NULL);
1352 freelist_add(index, entry);
1353 }
1354 } else {
1355 if (entry->flag != group->flag) {
1356 entry->flag = group->flag;
1357 inn_msync_page(entry, sizeof(*entry), MS_ASYNC);
1358 }
1359 tdx_data_audit(group->name, entry, fix);
1360 }
1361 index_lock_group(index->fd, offset, INN_LOCK_UNLOCK);
1362 }
1363
1364
1365 /*
1366 ** Check to be sure that a given group exists in the overview index, and if
1367 ** missing, adds it. Assumes that the index isn't locked, since it calls the
1368 ** normal functions for adding new groups (this should only be called after
1369 ** the index has already been repaired, for the same reason). Called as a
1370 ** hash traversal function, walking the hash table of groups from the active
1371 ** file.
1372 */
1373 static void
index_audit_active(void * value,void * cookie)1374 index_audit_active(void *value, void *cookie)
1375 {
1376 struct hashmap *group = value;
1377 struct audit_data *data = cookie;
1378 struct group_entry *entry;
1379
1380 entry = tdx_index_entry(data->index, group->name);
1381 if (entry == NULL) {
1382 warn("tradindexed: group %s missing from overview", group->name);
1383 if (data->fix)
1384 tdx_index_add(data->index, group->name, 0, 0, &group->flag);
1385 }
1386 }
1387
1388
1389 /*
1390 ** Audit the group index for any inconsistencies. If the argument is true,
1391 ** also attempt to fix those inconsistencies.
1392 */
1393 void
tdx_index_audit(bool fix)1394 tdx_index_audit(bool fix)
1395 {
1396 struct group_index *index;
1397 struct stat st;
1398 off_t expected;
1399 int count;
1400 struct hash *hashmap;
1401 long bucket;
1402 struct group_entry *entry;
1403 struct audit_data data;
1404
1405 index = tdx_index_open(true);
1406 if (index == NULL)
1407 return;
1408
1409 /* Keep a lock on the header through the whole audit process. This will
1410 stall any newgroups or rmgroups, but not normal article reception. We
1411 don't want the structure of the group entries changing out from under
1412 us, although we don't mind if the data does until we're validating that
1413 particular group. */
1414 index_lock(index->fd, INN_LOCK_WRITE);
1415
1416 /* Make sure the size looks sensible. */
1417 if (fstat(index->fd, &st) < 0) {
1418 syswarn("tradindexed: cannot fstat %s", index->path);
1419 return;
1420 }
1421 count = index_entry_count(st.st_size);
1422 expected = index_file_size(count);
1423 if (expected != st.st_size) {
1424 syswarn("tradindexed: %ld bytes of trailing trash in %s",
1425 (unsigned long) (st.st_size - expected), index->path);
1426 if (fix)
1427 if (ftruncate(index->fd, expected) < 0)
1428 syswarn("tradindexed: cannot truncate %s", index->path);
1429 }
1430 index_maybe_remap(index, count);
1431
1432 /* Okay everything is now mapped and happy. Validate the header. */
1433 index_audit_header(index, fix);
1434 index_lock(index->fd, INN_LOCK_UNLOCK);
1435
1436 /* Walk all the group entries and check them individually. To do this, we
1437 need to map hashes to group names, so load a hash of the active file to
1438 do that resolution. */
1439 hashmap = hashmap_load();
1440 if (hashmap == NULL) {
1441 warn("tradindexed: cannot hash active file");
1442 return;
1443 }
1444 data.index = index;
1445 data.fix = fix;
1446 hash_traverse(hashmap, index_audit_active, &data);
1447 for (bucket = 0; bucket < index->count; bucket++) {
1448 entry = &index->entries[bucket];
1449 if (HashEmpty(entry->hash) || entry->deleted != 0)
1450 continue;
1451 index_audit_group(index, entry, hashmap, fix);
1452 }
1453 hash_free(hashmap);
1454 }
1455