1 /*
2  * Check and fix an arena partition.
3  *
4  * This is a lot grittier than the rest of Venti because
5  * it can't just give up if a byte here or there is wrong.
6  *
7  * The rule here (hopefully followed!) is that block corruption
8  * only ever has a local effect -- there are no blocks that you
9  * can wipe out that will cause large portions of
10  * uncorrupted data blocks to be useless.
11  */
12 
13 #include "stdinc.h"
14 #include "dat.h"
15 #include "fns.h"
16 #include "whack.h"
17 
/*
 * Round x up to a multiple of n.
 * The mask arithmetic only gives the right answer when n is a power of two.
 */
#define ROUNDUP(x,n)		(((x)+(n)-1)&~((n)-1))

/* argument types for the %z and %t print verbs used in this file */
#pragma varargck type "z" uvlong
#pragma varargck type "z" vlong
#pragma varargck type "t" uint

enum
{
	/* byte-count units */
	K = 1024,
	M = 1024*1024,
	G = 1024*1024*1024,

	/* number of bytes paged in when only a header needs to be inspected */
	Block = 4096,
};
32 
int debugsha1;

int verbose;		/* nonzero: print extra detail (e.g. each arena head/tail found) */
Part *part;		/* partition handle handed to readpart and writepart */
char *file;
char *basename;		/* if set, loadarenabasics builds arena names from it */
char *dumpbase;
int fix;		/* zero: pageout never writes anything back to disk */
int badreads;		/* number of sector reads that failed in readdisk */
int unseal;
uchar zero[MaxDiskBlock];	/* used as an all-zero block in comparisons; not written in the code shown here */

Arena lastarena;
ArenaPart ap;		/* arena partition geometry, filled in by guessgeometry */
uvlong arenasize;	/* arena spacing in bytes; 0 makes guessgeometry infer it */
int nbadread;
int nbad;
uvlong partend;		/* offsets at or beyond this are not read; buffers are filled with 0xFB instead */
void checkarena(vlong, int);
52 
/*
 * Print the command synopsis on standard error and terminate
 * every thread of the program.  The exit status is the non-empty
 * string "usage" so that the caller sees a failure; a nil status
 * would be reported as success.
 */
void
usage(void)
{
	fprint(2, "usage: fixarenas [-fv] [-a arenasize] [-b blocksize] file [ranges]\n");
	threadexitsall("usage");
}
59 
60 /*
61  * Format number in simplest way that is okay with unittoull.
62  */
63 static int
zfmt(Fmt * fmt)64 zfmt(Fmt *fmt)
65 {
66 	vlong x;
67 
68 	x = va_arg(fmt->args, vlong);
69 	if(x == 0)
70 		return fmtstrcpy(fmt, "0");
71 	if(x%G == 0)
72 		return fmtprint(fmt, "%lldG", x/G);
73 	if(x%M == 0)
74 		return fmtprint(fmt, "%lldM", x/M);
75 	if(x%K == 0)
76 		return fmtprint(fmt, "%lldK", x/K);
77 	return fmtprint(fmt, "%lld", x);
78 }
79 
80 /*
81  * Format time like ctime without newline.
82  */
83 static int
tfmt(Fmt * fmt)84 tfmt(Fmt *fmt)
85 {
86 	uint t;
87 	char buf[30];
88 
89 	t = va_arg(fmt->args, uint);
90 	strcpy(buf, ctime(t));
91 	buf[28] = 0;
92 	return fmtstrcpy(fmt, buf);
93 }
94 
95 /*
96  * Coalesce messages about unreadable sectors into larger ranges.
97  * bad(0, 0) flushes the buffer.
98  */
99 static void
bad(char * msg,vlong o,int len)100 bad(char *msg, vlong o, int len)
101 {
102 	static vlong lb0, lb1;
103 	static char *lmsg;
104 
105 	if(msg == nil)
106 		msg = lmsg;
107 	if(o == -1){
108 		lmsg = nil;
109 		lb0 = 0;
110 		lb1 = 0;
111 		return;
112 	}
113 	if(lb1 != o || (msg && lmsg && strcmp(msg, lmsg) != 0)){
114 		if(lb0 != lb1)
115 			print("%s %#llux+%#llux (%,lld+%,lld)\n",
116 				lmsg, lb0, lb1-lb0, lb0, lb1-lb0);
117 		lb0 = o;
118 	}
119 	lmsg = msg;
120 	lb1 = o+len;
121 }
122 
123 /*
124  * Read in the len bytes of data at the offset.  If can't for whatever reason,
125  * fill it with garbage but print an error.
126  */
127 static uchar*
readdisk(uchar * buf,vlong offset,int len)128 readdisk(uchar *buf, vlong offset, int len)
129 {
130 	int i, j, k, n;
131 
132 	if(offset >= partend){
133 		memset(buf, 0xFB, len);
134 		return buf;
135 	}
136 
137 	if(offset+len > partend){
138 		memset(buf, 0xFB, len);
139 		len = partend - offset;
140 	}
141 
142 	if(readpart(part, offset, buf, len) >= 0)
143 		return buf;
144 
145 	/*
146 	 * The read failed.  Clear the buffer to nonsense, and
147 	 * then try reading in smaller pieces.  If that fails,
148 	 * read in even smaller pieces.  And so on down to sectors.
149 	 */
150 	memset(buf, 0xFD, len);
151 	for(i=0; i<len; i+=64*K){
152 		n = 64*K;
153 		if(i+n > len)
154 			n = len-i;
155 		if(readpart(part, offset+i, buf+i, n) >= 0)
156 			continue;
157 		for(j=i; j<len && j<i+64*K; j+=4*K){
158 			n = 4*K;
159 			if(j+n > len)
160 				n = len-j;
161 			if(readpart(part, offset+j, buf+j, n) >= 0)
162 				continue;
163 			for(k=j; k<len && k<j+4*K; k+=512){
164 				if(readpart(part, offset+k, buf+k, 512) >= 0)
165 					continue;
166 				bad("disk read failed at", k, 512);
167 				badreads++;
168 			}
169 		}
170 	}
171 	bad(nil, 0, 0);
172 	return buf;
173 }
174 
/*
 * Buffer to support running SHA1 hash of the disk.
 */
typedef struct Shabuf Shabuf;
struct Shabuf
{
	int fd;			/* if > 0, sbupdate also writes the hashed bytes here (opened by sbdebug) */
	vlong offset;		/* disk offset of the next byte to be hashed */
	DigestState state;	/* running SHA1 state */
	int rollback;		/* nonzero: keep snapshots in hist so sbrollback can rewind */
	vlong r0;		/* disk offset at which hashing began; 0 means not yet recorded */
	DigestState *hist;	/* hist[i] is the state after i*4M bytes past r0 */
	int nhist;		/* number of entries allocated in hist */
};
189 
190 void
sbdebug(Shabuf * sb,char * file)191 sbdebug(Shabuf *sb, char *file)
192 {
193 	int fd;
194 
195 	if(sb->fd > 0){
196 		close(sb->fd);
197 		sb->fd = 0;
198 	}
199 	if((fd = create(file, OWRITE, 0666)) < 0)
200 		return;
201 	if(fd == 0){
202 		fd = dup(fd, -1);
203 		close(0);
204 	}
205 	sb->fd = fd;
206 }
207 
/*
 * Feed disk data into the running hash.
 *
 * p holds len bytes of disk contents starting at disk offset `offset'.
 * Only the part at and after sb->offset is hashed; if sb->offset
 * does not fall inside [offset, offset+len) the call does nothing.
 * When sb->rollback is set, a copy of the hash state is saved each
 * time another 4M (measured from sb->r0) has been hashed.
 */
void
sbupdate(Shabuf *sb, uchar *p, vlong offset, int len)
{
	int n, x;
	vlong o;

	/* first use with rollback enabled: start the history with a zeroed entry 0 */
	if(sb->rollback && !sb->hist){
		sb->r0 = offset;
		sb->nhist = 1;
		sb->hist = vtmalloc(sb->nhist*sizeof *sb->hist);
		memset(sb->hist, 0, sizeof sb->hist[0]);
	}
	if(sb->r0 == 0)
		sb->r0 = offset;

	/* data does not contain the next byte we need: ignore it */
	if(sb->offset < offset || sb->offset >= offset+len){
		if(0) print("sbupdate %p %#llux+%d but offset=%#llux\n",
			p, offset, len, sb->offset);
		return;
	}
	/* skip the leading bytes that have already been hashed */
	x = sb->offset - offset;
	if(0) print("sbupdate %p %#llux+%d skip %d\n",
		sb, offset, len, x);
	if(x){
		p += x;
		offset += x;
		len -= x;
	}
	assert(sb->offset == offset);

	/* debugging copy, positioned relative to where hashing began */
	if(sb->fd > 0)
		pwrite(sb->fd, p, len, offset - sb->r0);

	if(!sb->rollback){
		sha1(p, len, nil, &sb->state);
		sb->offset += len;
		return;
	}

	/* save state every 4M so we can roll back quickly */
	o = offset - sb->r0;
	while(len > 0){
		/* hash no further than the next 4M boundary */
		n = 4*M - o%(4*M);
		if(n > len)
			n = len;
		sha1(p, n, nil, &sb->state);
		sb->offset += n;
		o += n;
		p += n;
		len -= n;
		if(o%(4*M) == 0){
			x = o/(4*M);
			if(x >= sb->nhist){
				/* snapshots are expected to be appended one at a time */
				if(x != sb->nhist)
					print("oops! x=%d nhist=%d\n", x, sb->nhist);
				sb->nhist += 32;
				sb->hist = vtrealloc(sb->hist, sb->nhist*sizeof *sb->hist);
			}
			sb->hist[x] = sb->state;
		}
	}
}
270 
271 void
sbdiskhash(Shabuf * sb,vlong eoffset)272 sbdiskhash(Shabuf *sb, vlong eoffset)
273 {
274 	static uchar dbuf[4*M];
275 	int n;
276 
277 	while(sb->offset < eoffset){
278 		n = sizeof dbuf;
279 		if(sb->offset+n > eoffset)
280 			n = eoffset - sb->offset;
281 		readdisk(dbuf, sb->offset, n);
282 		sbupdate(sb, dbuf, sb->offset, n);
283 	}
284 }
285 
286 void
sbrollback(Shabuf * sb,vlong offset)287 sbrollback(Shabuf *sb, vlong offset)
288 {
289 	int x;
290 	vlong o;
291 	Dir d;
292 
293 	if(!sb->rollback || !sb->r0){
294 		print("cannot rollback sha\n");
295 		return;
296 	}
297 	if(offset >= sb->offset)
298 		return;
299 	o = offset - sb->r0;
300 	x = o/(4*M);
301 	if(x >= sb->nhist){
302 		print("cannot rollback sha\n");
303 		return;
304 	}
305 	sb->state = sb->hist[x];
306 	sb->offset = sb->r0 + x*4*M;
307 	assert(sb->offset <= offset);
308 
309 	if(sb->fd > 0){
310 		nulldir(&d);
311 		d.length = sb->offset - sb->r0;
312 		dirfwstat(sb->fd, &d);
313 	}
314 }
315 
316 void
sbscore(Shabuf * sb,uchar * score)317 sbscore(Shabuf *sb, uchar *score)
318 {
319 	if(sb->hist){
320 		free(sb->hist);
321 		sb->hist = nil;
322 	}
323 	sha1(nil, 0, score, &sb->state);
324 }
325 
/*
 * If we're fixing arenas, then editing this memory edits the disk!
 * It will be written back out as new data is paged in.
 */
uchar buf[4*M];		/* the current page; callers edit it in place */
uchar sbuf[4*M];	/* copy of buf as read; pageout compares the two to detect edits */
vlong bufoffset;	/* disk offset of buf[0] */
int buflen;		/* number of valid bytes in buf; 0 means nothing to write back */
334 
335 static void pageout(void);
336 static uchar*
pagein(vlong offset,int len)337 pagein(vlong offset, int len)
338 {
339 	pageout();
340 	if(offset >= partend){
341 		memset(buf, 0xFB, sizeof buf);
342 		return buf;
343 	}
344 
345 	if(offset+len > partend){
346 		memset(buf, 0xFB, sizeof buf);
347 		len = partend - offset;
348 	}
349 	bufoffset = offset;
350 	buflen = len;
351 	readdisk(buf, offset, len);
352 	memmove(sbuf, buf, len);
353 	return buf;
354 }
355 
356 static void
pageout(void)357 pageout(void)
358 {
359 	if(buflen==0 || !fix || memcmp(buf, sbuf, buflen) == 0){
360 		buflen = 0;
361 		return;
362 	}
363 	if(writepart(part, bufoffset, buf, buflen) < 0)
364 		print("disk write failed at %#llux+%#ux (%,lld+%,d)\n",
365 			bufoffset, buflen, bufoffset, buflen);
366 	buflen = 0;
367 }
368 
369 static void
zerorange(vlong offset,int len)370 zerorange(vlong offset, int len)
371 {
372 	int i;
373 	vlong ooff;
374 	int olen;
375 	enum { MinBlock = 4*K, MaxBlock = 8*K };
376 
377 	if(0)
378 	if(bufoffset <= offset && offset+len <= bufoffset+buflen){
379 		memset(buf+(offset-bufoffset), 0, len);
380 		return;
381 	}
382 
383 	ooff = bufoffset;
384 	olen = buflen;
385 
386 	i = offset%MinBlock;
387 	if(i+len < MaxBlock){
388 		pagein(offset-i, (len+MinBlock-1)&~(MinBlock-1));
389 		memset(buf+i, 0, len);
390 	}else{
391 		pagein(offset-i, MaxBlock);
392 		memset(buf+i, 0, MaxBlock-i);
393 		offset += MaxBlock-i;
394 		len -= MaxBlock-i;
395 		while(len >= MaxBlock){
396 			pagein(offset, MaxBlock);
397 			memset(buf, 0, MaxBlock);
398 			offset += MaxBlock;
399 			len -= MaxBlock;
400 		}
401 		pagein(offset, (len+MinBlock-1)&~(MinBlock-1));
402 		memset(buf, 0, len);
403 	}
404 	pagein(ooff, olen);
405 }
406 
407 /*
408  * read/write integers
409  *
410 static void
411 p16(uchar *p, u16int u)
412 {
413 	p[0] = (u>>8) & 0xFF;
414 	p[1] = u & 0xFF;
415 }
416 */
417 
418 static u16int
u16(uchar * p)419 u16(uchar *p)
420 {
421 	return (p[0]<<8)|p[1];
422 }
423 
424 static void
p32(uchar * p,u32int u)425 p32(uchar *p, u32int u)
426 {
427 	p[0] = (u>>24) & 0xFF;
428 	p[1] = (u>>16) & 0xFF;
429 	p[2] = (u>>8) & 0xFF;
430 	p[3] = u & 0xFF;
431 }
432 
433 static u32int
u32(uchar * p)434 u32(uchar *p)
435 {
436 	return (p[0]<<24)|(p[1]<<16)|(p[2]<<8)|p[3];
437 }
438 
439 /*
440 static void
441 p64(uchar *p, u64int u)
442 {
443 	p32(p, u>>32);
444 	p32(p, u);
445 }
446 */
447 
448 static u64int
u64(uchar * p)449 u64(uchar *p)
450 {
451 	return ((u64int)u32(p)<<32) | u32(p+4);
452 }
453 
454 static int
vlongcmp(const void * va,const void * vb)455 vlongcmp(const void *va, const void *vb)
456 {
457 	vlong a, b;
458 
459 	a = *(vlong*)va;
460 	b = *(vlong*)vb;
461 	if(a < b)
462 		return -1;
463 	if(b > a)
464 		return 1;
465 	return 0;
466 }
467 
/* D and S are in draw.h */
#define D VD
#define S VS

/*
 * Display-format flags or'ed into Info.len; showdiffs switches
 * on the combined value to choose how a field is printed.
 */
enum
{
	D = 0x10000,	/* print as a decimal number */
	Z = 0x20000,	/* print with %z */
	S = 0x30000,	/* print as a string */
	T = 0x40000,	/* print with %t */
	N = 0xFFFF	/* mask extracting the field's byte length */
};
typedef struct Info Info;
struct Info
{
	int len;	/* byte length in the low 16 bits plus an optional format flag; 0 ends a table */
	char *name;	/* label printed for the field */
};
486 
487 Info partinfo[] = {
488 	4,	"magic",
489 	D|4,	"version",
490 	Z|4,	"blocksize",
491 	4,	"arenabase",
492 	0
493 };
494 
495 Info headinfo4[] = {
496 	4,	"magic",
497 	D|4,	"version",
498 	S|ANameSize,	"name",
499 	Z|4,	"blocksize",
500 	Z|8,	"size",
501 	0
502 };
503 
504 Info headinfo5[] = {
505 	4,	"magic",
506 	D|4,	"version",
507 	S|ANameSize,	"name",
508 	Z|4,	"blocksize",
509 	Z|8,	"size",
510 	4,	"clumpmagic",
511 	0
512 };
513 
514 Info tailinfo4[] = {
515 	4,	"magic",
516 	D|4,	"version",
517 	S|ANameSize,	"name",
518 	D|4,	"clumps",
519 	D|4,	"cclumps",
520 	T|4,	"ctime",
521 	T|4,	"wtime",
522 	D|8,	"used",
523 	D|8,	"uncsize",
524 	1,	"sealed",
525 	0
526 };
527 
528 Info tailinfo4a[] = {
529 	/* tailinfo 4 */
530 	4,	"magic",
531 	D|4,	"version",
532 	S|ANameSize,	"name",
533 	D|4,	"clumps",
534 	D|4,	"cclumps",
535 	T|4,	"ctime",
536 	T|4,	"wtime",
537 	D|8,	"used",
538 	D|8,	"uncsize",
539 	1,	"sealed",
540 
541 	/* mem stats */
542 	1,	"extension",
543 	D|4,	"mem.clumps",
544 	D|4,	"mem.cclumps",
545 	D|8,	"mem.used",
546 	D|8,	"mem.uncsize",
547 	1,	"mem.sealed",
548 	0
549 };
550 
551 Info tailinfo5[] = {
552 	4,	"magic",
553 	D|4,	"version",
554 	S|ANameSize,	"name",
555 	D|4,	"clumps",
556 	D|4,	"cclumps",
557 	T|4,	"ctime",
558 	T|4,	"wtime",
559 	4,	"clumpmagic",
560 	D|8,	"used",
561 	D|8,	"uncsize",
562 	1,	"sealed",
563 	0
564 };
565 
566 Info tailinfo5a[] = {
567 	/* tailinfo 5 */
568 	4,	"magic",
569 	D|4,	"version",
570 	S|ANameSize,	"name",
571 	D|4,	"clumps",
572 	D|4,	"cclumps",
573 	T|4,	"ctime",
574 	T|4,	"wtime",
575 	4,	"clumpmagic",
576 	D|8,	"used",
577 	D|8,	"uncsize",
578 	1,	"sealed",
579 
580 	/* mem stats */
581 	1,	"extension",
582 	D|4,	"mem.clumps",
583 	D|4,	"mem.cclumps",
584 	D|8,	"mem.used",
585 	D|8,	"mem.uncsize",
586 	1,	"mem.sealed",
587 	0
588 };
589 
/*
 * Print, field by field, where the on-disk bytes `have' differ
 * from the expected bytes `want'.  Both buffers are len bytes long.
 * info lists the fields in order; the list ends at an entry whose
 * length is 0.  Bytes beyond the last listed field are compared
 * as a single unnamed region.
 */
void
showdiffs(uchar *want, uchar *have, int len, Info *info)
{
	int n;

	while(len > 0 && (n=info->len&N) > 0){
		if(memcmp(have, want, n) != 0){
			/* info->len is the byte length plus an optional format flag */
			switch(info->len){
			case 1:
				print("\t%s: correct=%d disk=%d\n",
					info->name, *want, *have);
				break;
			case 4:
				print("\t%s: correct=%#ux disk=%#ux\n",
					info->name, u32(want), u32(have));
				break;
			case D|4:
				print("\t%s: correct=%,ud disk=%,ud\n",
					info->name, u32(want), u32(have));
				break;
			case T|4:
				print("\t%s: correct=%t\n\t\tdisk=%t\n",
					info->name, u32(want), u32(have));
				break;
			case Z|4:
				print("\t%s: correct=%z disk=%z\n",
					info->name, (uvlong)u32(want), (uvlong)u32(have));
				break;
			case D|8:
				print("\t%s: correct=%,lld disk=%,lld\n",
					info->name, u64(want), u64(have));
				break;
			case Z|8:
				print("\t%s: correct=%z disk=%z\n",
					info->name, u64(want), u64(have));
				break;
			case S|ANameSize:
				/* the disk copy may lack a terminator, so bound how much of it is printed */
				print("\t%s: correct=%s disk=%.*s\n",
					info->name, (char*)want,
					utfnlen((char*)have, ANameSize-1),
					(char*)have);
				break;
			default:
				/* any other combination: dump both sides with %H */
				print("\t%s: correct=%.*H disk=%.*H\n",
					info->name, n, want, n, have);
				break;
			}
		}
		have += n;
		want += n;
		len -= n;
		info++;
	}
	/* whatever follows the described fields */
	if(len > 0 && memcmp(have, want, len) != 0){
		/* the expected image should be zero past the structure */
		if(memcmp(want, zero, len) != 0)
			print("!!\textra want data in showdiffs (bug in fixarenas)\n");
		else
			print("\tnon-zero data on disk after structure\n");
		if(verbose > 1){
			print("want: %.*H\n", len, want);
			print("have: %.*H\n", len, have);
		}
	}
}
654 
655 /*
656  * Does part begin with an arena?
657  */
658 int
isonearena(void)659 isonearena(void)
660 {
661 	return u32(pagein(0, Block)) == ArenaHeadMagic;
662 }
663 
664 static int tabsizes[] = { 16*1024, 64*1024, 512*1024, 768*1024, };
665 /*
666  * Poke around on the disk to guess what the ArenaPart numbers are.
667  */
668 void
guessgeometry(void)669 guessgeometry(void)
670 {
671 	int i, j, n, bestn, ndiff, nhead, ntail;
672 	uchar *p, *ep, *sp;
673 	u64int diff[100], head[20], tail[20];
674 	u64int offset, bestdiff;
675 
676 	ap.version = ArenaPartVersion;
677 
678 	if(arenasize == 0 || ap.blocksize == 0){
679 		/*
680 		 * The ArenaPart block at offset PartBlank may be corrupt or just wrong.
681 		 * Instead, look for the individual arena headers and tails, which there
682 		 * are many of, and once we've seen enough, infer the spacing.
683 		 *
684 		 * Of course, nothing in the file format requires that arenas be evenly
685 		 * spaced, but fmtarenas always does that for us.
686 		 */
687 		nhead = 0;
688 		ntail = 0;
689 		for(offset=PartBlank; offset<partend; offset+=4*M){
690 			p = pagein(offset, 4*M);
691 			for(sp=p, ep=p+4*M; p<ep; p+=K){
692 				if(u32(p) == ArenaHeadMagic && nhead < nelem(head)){
693 					if(verbose)
694 						print("arena head at %#llx\n", offset+(p-sp));
695 					head[nhead++] = offset+(p-sp);
696 				}
697 				if(u32(p) == ArenaMagic && ntail < nelem(tail)){
698 					tail[ntail++] = offset+(p-sp);
699 					if(verbose)
700 						print("arena tail at %#llx\n", offset+(p-sp));
701 				}
702 			}
703 			if(nhead == nelem(head) && ntail == nelem(tail))
704 				break;
705 		}
706 		if(nhead < 3 && ntail < 3)
707 			sysfatal("too few intact arenas: %d heads, %d tails", nhead, ntail);
708 
709 		/*
710 		 * Arena size is likely the most common
711 		 * inter-head or inter-tail spacing.
712 		 */
713 		ndiff = 0;
714 		for(i=1; i<nhead; i++)
715 			diff[ndiff++] = head[i] - head[i-1];
716 		for(i=1; i<ntail; i++)
717 			diff[ndiff++] = tail[i] - tail[i-1];
718 		qsort(diff, ndiff, sizeof diff[0], vlongcmp);
719 		bestn = 0;
720 		bestdiff = 0;
721 		for(i=1, n=1; i<=ndiff; i++, n++){
722 			if(i==ndiff || diff[i] != diff[i-1]){
723 				if(n > bestn){
724 					bestn = n;
725 					bestdiff = diff[i-1];
726 				}
727 				n = 0;
728 			}
729 		}
730 		print("arena size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
731 		if(arenasize != 0 && arenasize != bestdiff)
732 			print("using user-specified size %z instead\n", arenasize);
733 		else
734 			arenasize = bestdiff;
735 
736 		/*
737 		 * The arena tail for an arena is arenasize-blocksize from the head.
738 		 */
739 		ndiff = 0;
740 		for(i=j=0; i<nhead && j<ntail; ){
741 			if(tail[j] < head[i]){
742 				j++;
743 				continue;
744 			}
745 			if(tail[j] < head[i]+arenasize){
746 				diff[ndiff++] = head[i]+arenasize - tail[j];
747 				j++;
748 				continue;
749 			}
750 			i++;
751 		}
752 		if(ndiff < 3)
753 			sysfatal("too few intact arenas: %d head, tail pairs", ndiff);
754 		qsort(diff, ndiff, sizeof diff[0], vlongcmp);
755 		bestn = 0;
756 		bestdiff = 0;
757 		for(i=1, n=1; i<=ndiff; i++, n++){
758 			if(i==ndiff || diff[i] != diff[i-1]){
759 				if(n > bestn){
760 					bestn = n;
761 					bestdiff = diff[i-1];
762 				}
763 				n = 0;
764 			}
765 		}
766 		print("block size likely %z (%d of %d)\n", bestdiff, bestn, ndiff);
767 		if(ap.blocksize != 0 && ap.blocksize != bestdiff)
768 			print("using user-specified size %z instead\n", (vlong)ap.blocksize);
769 		else
770 			ap.blocksize = bestdiff;
771 		if(ap.blocksize == 0 || ap.blocksize&(ap.blocksize-1))
772 			sysfatal("block size not a power of two");
773 		if(ap.blocksize > MaxDiskBlock)
774 			sysfatal("block size too big (max=%d)", MaxDiskBlock);
775 
776 		/*
777 		 * Use head/tail information to deduce arena base.
778 		 */
779 		ndiff = 0;
780 		for(i=0; i<nhead; i++)
781 			diff[ndiff++] = head[i]%arenasize;
782 		for(i=0; i<ntail; i++)
783 			diff[ndiff++] = (tail[i]+ap.blocksize)%arenasize;
784 		qsort(diff, ndiff, sizeof diff[0], vlongcmp);
785 		bestn = 0;
786 		bestdiff = 0;
787 		for(i=1, n=1; i<=ndiff; i++, n++){
788 			if(i==ndiff || diff[i] != diff[i-1]){
789 				if(n > bestn){
790 					bestn = n;
791 					bestdiff = diff[i-1];
792 				}
793 				n = 0;
794 			}
795 		}
796 		ap.arenabase = bestdiff;
797 	}
798 
799 	ap.tabbase = ROUNDUP(PartBlank+HeadSize, ap.blocksize);
800 	/*
801 	 * XXX pick up table, check arenabase.
802 	 * XXX pick up table, record base name.
803 	 */
804 
805 	/*
806 	 * Somewhat standard computation.
807 	 * Fmtarenas used to use 64k tab, now uses 512k tab.
808 	 */
809 	if(ap.arenabase == 0){
810 		print("trying standard arena bases...\n");
811 		for(i=0; i<nelem(tabsizes); i++){
812 			ap.arenabase = ROUNDUP(PartBlank+HeadSize+tabsizes[i], ap.blocksize);
813 			p = pagein(ap.arenabase, Block);
814 			if(u32(p) == ArenaHeadMagic)
815 				break;
816 		}
817 	}
818 	p = pagein(ap.arenabase, Block);
819 	print("arena base likely %z%s\n", (vlong)ap.arenabase,
820 		u32(p)!=ArenaHeadMagic ? " (but no arena head there)" : "");
821 
822 	ap.tabsize = ap.arenabase - ap.tabbase;
823 }
824 
825 /*
826  * Check the arena partition blocks and then the arenas listed in range.
827  */
828 void
checkarenas(char * range)829 checkarenas(char *range)
830 {
831 	char *s, *t;
832 	int i, lo, hi, narena;
833 	uchar dbuf[HeadSize];
834 	uchar *p;
835 
836 	guessgeometry();
837 
838 	partend -= partend%ap.blocksize;
839 
840 	memset(dbuf, 0, sizeof dbuf);
841 	packarenapart(&ap, dbuf);
842 	p = pagein(PartBlank, Block);
843 	if(memcmp(p, dbuf, HeadSize) != 0){
844 		print("on-disk arena part superblock incorrect\n");
845 		showdiffs(dbuf, p, HeadSize, partinfo);
846 	}
847 	memmove(p, dbuf, HeadSize);
848 
849 	narena = (partend-ap.arenabase + arenasize-1)/arenasize;
850 	if(range == nil){
851 		for(i=0; i<narena; i++)
852 			checkarena(ap.arenabase+(vlong)i*arenasize, i);
853 	}else if(strcmp(range, "none") == 0){
854 		/* nothing */
855 	}else{
856 		/* parse, e.g., -4,8-9,10- */
857 		for(s=range; *s; s=t){
858 			t = strchr(s, ',');
859 			if(t)
860 				*t++ = 0;
861 			else
862 				t = s+strlen(s);
863 			if(*s == '-')
864 				lo = 0;
865 			else
866 				lo = strtol(s, &s, 0);
867 			hi = lo;
868 			if(*s == '-'){
869 				s++;
870 				if(*s == 0)
871 					hi = narena-1;
872 				else
873 					hi = strtol(s, &s, 0);
874 			}
875 			if(*s != 0){
876 				print("bad arena range: %s\n", s);
877 				continue;
878 			}
879 			for(i=lo; i<=hi; i++)
880 				checkarena(ap.arenabase+(vlong)i*arenasize, i);
881 		}
882 	}
883 }
884 
885 /*
886  * Is there a clump here at p?
887  */
888 static int
isclump(uchar * p,Clump * cl,u32int * pmagic)889 isclump(uchar *p, Clump *cl, u32int *pmagic)
890 {
891 	int n;
892 	u32int magic;
893 	uchar score[VtScoreSize], *bp;
894 	Unwhack uw;
895 	uchar ubuf[70*1024];
896 
897 	bp = p;
898 	magic = u32(p);
899 	if(magic == 0)
900 		return 0;
901 	p += U32Size;
902 
903 	cl->info.type = vtfromdisktype(*p);
904 	if(cl->info.type == 0xFF)
905 		return 0;
906 	p++;
907 	cl->info.size = u16(p);
908 	p += U16Size;
909 	cl->info.uncsize = u16(p);
910 	if(cl->info.size > cl->info.uncsize)
911 		return 0;
912 	p += U16Size;
913 	scorecp(cl->info.score, p);
914 	p += VtScoreSize;
915 	cl->encoding = *p;
916 	p++;
917 	cl->creator = u32(p);
918 	p += U32Size;
919 	cl->time = u32(p);
920 	p += U32Size;
921 
922 	switch(cl->encoding){
923 	case ClumpENone:
924 		if(cl->info.size != cl->info.uncsize)
925 			return 0;
926 		scoremem(score, p, cl->info.size);
927 		if(scorecmp(score, cl->info.score) != 0)
928 			return 0;
929 		break;
930 	case ClumpECompress:
931 		if(cl->info.size >= cl->info.uncsize)
932 			return 0;
933 		unwhackinit(&uw);
934 		n = unwhack(&uw, ubuf, cl->info.uncsize, p, cl->info.size);
935 		if(n != cl->info.uncsize)
936 			return 0;
937 		scoremem(score, ubuf, cl->info.uncsize);
938 		if(scorecmp(score, cl->info.score) != 0)
939 			return 0;
940 		break;
941 	default:
942 		return 0;
943 	}
944 	p += cl->info.size;
945 
946 	/* it all worked out in the end */
947 	*pmagic = magic;
948 	return p - bp;
949 }
950 
/*
 * All ClumpInfos seen in this arena.
 * Kept in binary tree so we can look up by score.
 */
typedef struct Cit Cit;
struct Cit
{
	int left;	/* index in cibuf of the left child, or -1 */
	int right;	/* index in cibuf of the right child, or -1 */
	vlong corrupt;	/* nonzero for entries added by addcicorrupt, which are kept out of the tree */
	ClumpInfo ci;
};
Cit *cibuf;		/* all entries, in the order they were added */
int ciroot;		/* index in cibuf of the tree root, or -1 when empty */
int ncibuf, mcibuf;	/* entries used and entries allocated in cibuf */
966 
967 void
resetcibuf(void)968 resetcibuf(void)
969 {
970 	ncibuf = 0;
971 	ciroot = -1;
972 }
973 
974 int*
ltreewalk(int * p,uchar * score)975 ltreewalk(int *p, uchar *score)
976 {
977 	int i;
978 
979 	for(;;){
980 		if(*p == -1)
981 			return p;
982 		i = scorecmp(cibuf[*p].ci.score, score);
983 		if(i == 0)
984 			return p;
985 		if(i < 0)
986 			p = &cibuf[*p].right;
987 		else
988 			p = &cibuf[*p].left;
989 	}
990 }
991 
992 void
addcibuf(ClumpInfo * ci,vlong corrupt)993 addcibuf(ClumpInfo *ci, vlong corrupt)
994 {
995 	Cit *cit;
996 
997 	if(ncibuf == mcibuf){
998 		mcibuf += 131072;
999 		cibuf = vtrealloc(cibuf, mcibuf*sizeof cibuf[0]);
1000 	}
1001 	cit = &cibuf[ncibuf];
1002 	cit->ci = *ci;
1003 	cit->left = -1;
1004 	cit->right = -1;
1005 	cit->corrupt = corrupt;
1006 	if(!corrupt)
1007 		*ltreewalk(&ciroot, ci->score) = ncibuf;
1008 	ncibuf++;
1009 }
1010 
1011 void
addcicorrupt(vlong len)1012 addcicorrupt(vlong len)
1013 {
1014 	static ClumpInfo zci;
1015 
1016 	addcibuf(&zci, len);
1017 }
1018 
1019 int
haveclump(uchar * score)1020 haveclump(uchar *score)
1021 {
1022 	int i;
1023 	int p;
1024 
1025 	p = ciroot;
1026 	for(;;){
1027 		if(p == -1)
1028 			return 0;
1029 		i = scorecmp(cibuf[p].ci.score, score);
1030 		if(i == 0)
1031 			return 1;
1032 		if(i < 0)
1033 			p = cibuf[p].right;
1034 		else
1035 			p = cibuf[p].left;
1036 	}
1037 }
1038 
1039 int
matchci(ClumpInfo * ci,uchar * p)1040 matchci(ClumpInfo *ci, uchar *p)
1041 {
1042 	if(ci->type != vtfromdisktype(p[0]))
1043 		return 0;
1044 	if(ci->size != u16(p+1))
1045 		return 0;
1046 	if(ci->uncsize != u16(p+3))
1047 		return 0;
1048 	if(scorecmp(ci->score, p+5) != 0)
1049 		return 0;
1050 	return 1;
1051 }
1052 
1053 int
sealedarena(uchar * p,int blocksize)1054 sealedarena(uchar *p, int blocksize)
1055 {
1056 	int v, n;
1057 
1058 	v = u32(p+4);
1059 	switch(v){
1060 	default:
1061 		return 0;
1062 	case ArenaVersion4:
1063 		n = ArenaSize4;
1064 		break;
1065 	case ArenaVersion5:
1066 		n = ArenaSize5;
1067 		break;
1068 	}
1069 	if(p[n-1] != 1){
1070 		print("arena tail says not sealed\n");
1071 		return 0;
1072 	}
1073 	if(memcmp(p+n, zero, blocksize-VtScoreSize-n) != 0){
1074 		print("arena tail followed by non-zero data\n");
1075 		return 0;
1076 	}
1077 	if(memcmp(p+blocksize-VtScoreSize, zero, VtScoreSize) == 0){
1078 		print("arena score zero\n");
1079 		return 0;
1080 	}
1081 	return 1;
1082 }
1083 
/*
 * Is name acceptable as the name of arena number n?
 * It must pass nameok and end with n written in decimal.
 * For n == 0 no particular ending is required.
 */
int
okayname(char *name, int n)
{
	char suffix[20];
	int nlen, slen;

	if(nameok(name) < 0)
		return 0;
	sprint(suffix, "%d", n);
	if(n == 0)
		suffix[0] = 0;
	nlen = strlen(name);
	slen = strlen(suffix);
	if(nlen < slen)
		return 0;
	return strcmp(name+nlen-slen, suffix) == 0;
}
1099 
1100 int
clumpinfocmp(ClumpInfo * a,ClumpInfo * b)1101 clumpinfocmp(ClumpInfo *a, ClumpInfo *b)
1102 {
1103 	if(a->type != b->type)
1104 		return a->type - b->type;
1105 	if(a->size != b->size)
1106 		return a->size - b->size;
1107 	if(a->uncsize != b->uncsize)
1108 		return a->uncsize - b->uncsize;
1109 	return scorecmp(a->score, b->score);
1110 }
1111 
1112 ClumpInfo*
loadci(vlong offset,Arena * arena,int nci)1113 loadci(vlong offset, Arena *arena, int nci)
1114 {
1115 	int i, j, per;
1116 	uchar *p, *sp;
1117 	ClumpInfo *bci, *ci;
1118 
1119 	per = arena->blocksize/ClumpInfoSize;
1120 	bci = vtmalloc(nci*sizeof bci[0]);
1121 	ci = bci;
1122 	offset += arena->size - arena->blocksize;
1123 	p = sp = nil;
1124 	for(i=0; i<nci; i+=per){
1125 		if(p == sp){
1126 			sp = pagein(offset-4*M, 4*M);
1127 			p = sp+4*M;
1128 		}
1129 		p -= arena->blocksize;
1130 		offset -= arena->blocksize;
1131 		for(j=0; j<per && i+j<nci; j++)
1132 			unpackclumpinfo(ci++, p+j*ClumpInfoSize);
1133 	}
1134 	return bci;
1135 }
1136 
1137 vlong
writeci(vlong offset,Arena * arena,ClumpInfo * ci,int nci)1138 writeci(vlong offset, Arena *arena, ClumpInfo *ci, int nci)
1139 {
1140 	int i, j, per;
1141 	uchar *p, *sp;
1142 
1143 	per = arena->blocksize/ClumpInfoSize;
1144 	offset += arena->size - arena->blocksize;
1145 	p = sp = nil;
1146 	for(i=0; i<nci; i+=per){
1147 		if(p == sp){
1148 			sp = pagein(offset-4*M, 4*M);
1149 			p = sp+4*M;
1150 		}
1151 		p -= arena->blocksize;
1152 		offset -= arena->blocksize;
1153 		memset(p, 0, arena->blocksize);
1154 		for(j=0; j<per && i+j<nci; j++)
1155 			packclumpinfo(ci++, p+j*ClumpInfoSize);
1156 	}
1157 	pageout();
1158 	return offset;
1159 }
1160 
1161 void
loadarenabasics(vlong offset0,int anum,ArenaHead * head,Arena * arena)1162 loadarenabasics(vlong offset0, int anum, ArenaHead *head, Arena *arena)
1163 {
1164 	char dname[ANameSize];
1165 	static char lastbase[ANameSize];
1166 	uchar *p;
1167 	Arena oarena;
1168 	ArenaHead ohead;
1169 
1170 	/*
1171 	 * Fmtarenas makes all arenas the same size
1172 	 * except the last, which may be smaller.
1173 	 * It uses the same block size for arenas as for
1174 	 * the arena partition blocks.
1175 	 */
1176 	arena->size = arenasize;
1177 	if(offset0+arena->size > partend)
1178 		arena->size = partend - offset0;
1179 	head->size = arena->size;
1180 
1181 	arena->blocksize = ap.blocksize;
1182 	head->blocksize = arena->blocksize;
1183 
1184 	/*
1185 	 * Look for clump magic and name in head/tail blocks.
1186 	 * All the other info we will reconstruct just in case.
1187 	 */
1188 	p = pagein(offset0, arena->blocksize);
1189 	memset(&ohead, 0, sizeof ohead);
1190 	if(unpackarenahead(&ohead, p) >= 0){
1191 		head->version = ohead.version;
1192 		head->clumpmagic = ohead.clumpmagic;
1193 		if(okayname(ohead.name, anum))
1194 			strcpy(head->name, ohead.name);
1195 	}
1196 
1197 	p = pagein(offset0+arena->size-arena->blocksize,
1198 		arena->blocksize);
1199 	memset(&oarena, 0, sizeof oarena);
1200 	if(unpackarena(&oarena, p) >= 0){
1201 		arena->version = oarena.version;
1202 		arena->clumpmagic = oarena.clumpmagic;
1203 		if(okayname(oarena.name, anum))
1204 			strcpy(arena->name, oarena.name);
1205 		arena->diskstats.clumps = oarena.diskstats.clumps;
1206 print("old arena: sealed=%d\n", oarena.diskstats.sealed);
1207 		arena->diskstats.sealed = oarena.diskstats.sealed;
1208 	}
1209 
1210 	/* Head trumps arena. */
1211 	if(head->version){
1212 		arena->version = head->version;
1213 		arena->clumpmagic = head->clumpmagic;
1214 	}
1215 	if(arena->version == 0)
1216 		arena->version = ArenaVersion5;
1217 	if(basename){
1218 		if(anum == -1)
1219 			snprint(arena->name, ANameSize, "%s", basename);
1220 		else
1221 			snprint(arena->name, ANameSize, "%s%d", basename, anum);
1222 	}else if(lastbase[0])
1223 		snprint(arena->name, ANameSize, "%s%d", lastbase, anum);
1224 	else if(head->name[0])
1225 		strcpy(arena->name, head->name);
1226 	else if(arena->name[0] == 0)
1227 		sysfatal("cannot determine base name for arena; use -n");
1228 	strcpy(lastbase, arena->name);
1229 	sprint(dname, "%d", anum);
1230 	lastbase[strlen(lastbase)-strlen(dname)] = 0;
1231 
1232 	/* Was working in arena, now copy to head. */
1233 	head->version = arena->version;
1234 	memmove(head->name, arena->name, sizeof head->name);
1235 	head->blocksize = arena->blocksize;
1236 	head->size = arena->size;
1237 }
1238 
1239 void
shahead(Shabuf * sb,vlong offset0,ArenaHead * head)1240 shahead(Shabuf *sb, vlong offset0, ArenaHead *head)
1241 {
1242 	uchar headbuf[MaxDiskBlock];
1243 
1244 	sb->offset = offset0;
1245 	memset(headbuf, 0, sizeof headbuf);
1246 	packarenahead(head, headbuf);
1247 	sbupdate(sb, headbuf, offset0, head->blocksize);
1248 }
1249 
1250 u32int
newclumpmagic(int version)1251 newclumpmagic(int version)
1252 {
1253 	u32int m;
1254 
1255 	if(version == ArenaVersion4)
1256 		return _ClumpMagic;
1257 	do{
1258 		m = fastrand();
1259 	}while(m==0 || m == _ClumpMagic);
1260 	return m;
1261 }
1262 
1263 /*
1264  * Poke around in the arena to find the clump data
1265  * and compute the relevant statistics.
1266  */
1267 void
guessarena(vlong offset0,int anum,ArenaHead * head,Arena * arena,uchar * oldscore,uchar * score)1268 guessarena(vlong offset0, int anum, ArenaHead *head, Arena *arena,
1269 	uchar *oldscore, uchar *score)
1270 {
1271 	uchar dbuf[MaxDiskBlock];
1272 	int needtozero, clumps, nb1, nb2, minclumps;
1273 	int inbad, n, ncib, printed, sealing, smart;
1274 	u32int magic;
1275 	uchar *sp, *ep, *p;
1276 	vlong boffset, eoffset, lastclumpend, leaked;
1277 	vlong offset, toffset, totalcorrupt, v;
1278 	Clump cl;
1279 	ClumpInfo *bci, *ci, *eci, *xci;
1280 	Cit *bcit, *cit, *ecit;
1281 	Shabuf oldsha, newsha;
1282 
1283 	/*
1284 	 * We expect to find an arena, with data, between offset
1285 	 * and offset+arenasize.  With any luck, the data starts at
1286 	 * offset+ap.blocksize.  The blocks have variable size and
1287 	 * aren't padded at all, which doesn't give us any alignment
1288 	 * constraints.  The blocks are compressed or high entropy,
1289 	 * but the headers are pretty low entropy (except the score):
1290 	 *
1291 	 *	type[1] (range 0 thru 9, 13)
1292 	 *	size[2]
1293 	 *	uncsize[2] (<= size)
1294 	 *
1295 	 * so we can look for these.  We check the scores as we go,
1296 	 * so we can't make any wrong turns.  If we find ourselves
1297 	 * in a dead end, scan forward looking for a new start.
1298 	 */
1299 
1300 	resetcibuf();
1301 	memset(head, 0, sizeof *head);
1302 	memset(arena, 0, sizeof *arena);
1303 	memset(oldscore, 0, VtScoreSize);
1304 	memset(score, 0, VtScoreSize);
1305 	memset(&oldsha, 0, sizeof oldsha);
1306 	memset(&newsha, 0, sizeof newsha);
1307 	newsha.rollback = 1;
1308 
1309 	if(0){
1310 		sbdebug(&oldsha, "old.sha");
1311 		sbdebug(&newsha, "new.sha");
1312 	}
1313 
1314 	loadarenabasics(offset0, anum, head, arena);
1315 
1316 	/* start the clump hunt */
1317 
1318 	clumps = 0;
1319 	totalcorrupt = 0;
1320 	sealing = 1;
1321 	boffset = offset0 + arena->blocksize;
1322 	offset = boffset;
1323 	eoffset = offset0+arena->size - arena->blocksize;
1324 	toffset = eoffset;
1325 	sp = pagein(offset0, 4*M);
1326 
1327 	if(arena->diskstats.sealed){
1328 		oldsha.offset = offset0;
1329 		sbupdate(&oldsha, sp, offset0, 4*M);
1330 	}
1331 	ep = sp+4*M;
1332 	p = sp + (boffset - offset0);
1333 	ncib = arena->blocksize / ClumpInfoSize;	/* ci per block in index */
1334 	lastclumpend = offset;
1335 	nbad = 0;
1336 	inbad = 0;
1337 	needtozero = 0;
1338 	minclumps = 0;
1339 	while(offset < eoffset){
1340 		/*
1341 		 * Shift buffer if we're running out of room.
1342 		 */
1343 		if(p+70*K >= ep){
1344 			/*
1345 			 * Start the post SHA1 buffer.   By now we should know the
1346 			 * clumpmagic and arena version, so we can create a
1347 			 * correct head block to get things going.
1348 			 */
1349 			if(sealing && fix && newsha.offset == 0){
1350 				newsha.offset = offset0;
1351 				if(arena->clumpmagic == 0){
1352 					if(arena->version == 0)
1353 						arena->version = ArenaVersion5;
1354 					arena->clumpmagic = newclumpmagic(arena->version);
1355 				}
1356 				head->clumpmagic = arena->clumpmagic;
1357 				shahead(&newsha, offset0, head);
1358 			}
1359 			n = 4*M-256*K;
1360 			if(sealing && fix){
1361 				sbdiskhash(&newsha, bufoffset);
1362 				sbupdate(&newsha, buf, bufoffset, 4*M-256*K);
1363 			}
1364 			pagein(bufoffset+n, 4*M);
1365 			p -= n;
1366 			if(arena->diskstats.sealed)
1367 				sbupdate(&oldsha, buf, bufoffset, 4*M);
1368 		}
1369 
1370 		/*
1371 		 * Check for a clump at p, which is at offset in the disk.
1372 		 * Duplicate clumps happen in corrupted disks
1373 		 * (the same pattern gets written many times in a row)
1374 		 * and should never happen during regular use.
1375 		 */
1376 		magic = 0;
1377 		if((n = isclump(p, &cl, &magic)) > 0){
1378 			/*
1379 			 * If we were in the middle of some corrupted data,
1380 			 * flush a warning about it and then add any clump
1381 			 * info blocks as necessary.
1382 			 */
1383 			if(inbad){
1384 				inbad = 0;
1385 				v = offset-lastclumpend;
1386 				if(needtozero){
1387 					zerorange(lastclumpend, v);
1388 					sbrollback(&newsha, lastclumpend);
1389 					print("corrupt clump data - %#llux+%#llux (%,llud bytes)\n",
1390 						lastclumpend, v, v);
1391 				}
1392 				addcicorrupt(v);
1393 				totalcorrupt += v;
1394 				nb1 = (minclumps+ncib-1)/ncib;
1395 				minclumps += (v+ClumpSize+VtMaxLumpSize-1)/(ClumpSize+VtMaxLumpSize);
1396 				nb2 = (minclumps+ncib-1)/ncib;
1397 				eoffset -= (nb2-nb1)*arena->blocksize;
1398 			}
1399 
1400 			if(haveclump(cl.info.score))
1401 				print("warning: duplicate clump %d %V at %#llux+%#d\n", cl.info.type, cl.info.score, offset, n);
1402 
1403 			/*
1404 			 * If clumps use different magic numbers, we don't care.
1405 			 * We'll just use the first one we find and make the others
1406 			 * follow suit.
1407 			 */
1408 			if(arena->clumpmagic == 0){
1409 				print("clump type %d size %d score %V magic %x\n",
1410 					cl.info.type, cl.info.size, cl.info.score, magic);
1411 				arena->clumpmagic = magic;
1412 				if(magic == _ClumpMagic)
1413 					arena->version = ArenaVersion4;
1414 				else
1415 					arena->version = ArenaVersion5;
1416 			}
1417 			if(magic != arena->clumpmagic)
1418 				p32(p, arena->clumpmagic);
1419 			if(clumps == 0)
1420 				arena->ctime = cl.time;
1421 
1422 			/*
1423 			 * Record the clump, update arena stats,
1424 			 * grow clump info blocks if needed.
1425 			 */
1426 			if(verbose > 1)
1427 				print("\tclump %d: %d %V at %#llux+%#ux (%d)\n",
1428 					clumps, cl.info.type, cl.info.score, offset, n, n);
1429 			addcibuf(&cl.info, 0);
1430 			if(minclumps%ncib == 0)
1431 				eoffset -= arena->blocksize;
1432 			minclumps++;
1433 			clumps++;
1434 			if(cl.encoding != ClumpENone)
1435 				arena->diskstats.cclumps++;
1436 			arena->diskstats.uncsize += cl.info.uncsize;
1437 			arena->wtime = cl.time;
1438 
1439 			/*
1440 			 * Move to next clump.
1441 			 */
1442 			offset += n;
1443 			p += n;
1444 			lastclumpend = offset;
1445 		}else{
1446 			/*
1447 			 * Overwrite malformed clump data with zeros later.
1448 			 * For now, just record whether it needs to be overwritten.
1449 			 * Bad regions must be of size at least ClumpSize.
1450 			 * Postponing the overwriting keeps us from writing past
1451 			 * the end of the arena data (which might be directory data)
1452 			 * with zeros.
1453 			 */
1454 			if(!inbad){
1455 				inbad = 1;
1456 				needtozero = 0;
1457 				if(memcmp(p, zero, ClumpSize) != 0)
1458 					needtozero = 1;
1459 				p += ClumpSize;
1460 				offset += ClumpSize;
1461 				nbad++;
1462 			}else{
1463 				if(*p != 0)
1464 					needtozero = 1;
1465 				p++;
1466 				offset++;
1467 			}
1468 		}
1469 	}
1470 	pageout();
1471 
1472 	if(verbose)
1473 		print("readable clumps: %d; min. directory entries: %d\n",
1474 			clumps, minclumps);
1475 	arena->diskstats.used = lastclumpend - boffset;
1476 	leaked = eoffset - lastclumpend;
1477 	if(verbose)
1478 		print("used from %#llux to %#llux = %,lld (%,lld unused)\n",
1479 			boffset, lastclumpend, arena->diskstats.used, leaked);
1480 
1481 	/*
1482 	 * Finish the SHA1 of the old data.
1483 	 */
1484 	if(arena->diskstats.sealed){
1485 		sbdiskhash(&oldsha, toffset);
1486 		readdisk(dbuf, toffset, arena->blocksize);
1487 		scorecp(dbuf+arena->blocksize-VtScoreSize, zero);
1488 		sbupdate(&oldsha, dbuf, toffset, arena->blocksize);
1489 		sbscore(&oldsha, oldscore);
1490 	}
1491 
1492 	/*
1493 	 * If we still don't know the clump magic, the arena
1494 	 * must be empty.  It still needs a value, so make
1495 	 * something up.
1496 	 */
1497 	if(arena->version == 0)
1498 		arena->version = ArenaVersion5;
1499 	if(arena->clumpmagic == 0){
1500 		if(arena->version == ArenaVersion4)
1501 			arena->clumpmagic = _ClumpMagic;
1502 		else{
1503 			do
1504 				arena->clumpmagic = fastrand();
1505 			while(arena->clumpmagic==_ClumpMagic
1506 				||arena->clumpmagic==0);
1507 		}
1508 		head->clumpmagic = arena->clumpmagic;
1509 	}
1510 
1511 	/*
1512 	 * Guess at number of clumpinfo blocks to load.
1513 	 * If we guess high, it's no big deal.  If we guess low,
1514 	 * we'll be forced into rewriting the whole directory.
1515 	 * Still not such a big deal.
1516 	 */
1517 	if(clumps == 0 || arena->diskstats.used == totalcorrupt)
1518 		goto Nocib;
1519 	if(clumps < arena->diskstats.clumps)
1520 		clumps = arena->diskstats.clumps;
1521 	if(clumps < ncibuf)
1522 		clumps = ncibuf;
1523 	clumps += totalcorrupt/
1524 		((arena->diskstats.used - totalcorrupt)/clumps);
1525 	clumps += totalcorrupt/2000;
1526 	if(clumps < minclumps)
1527 		clumps = minclumps;
1528 	clumps += ncib-1;
1529 	clumps -= clumps%ncib;
1530 
1531 	/*
1532 	 * Can't write into the actual data.
1533 	 */
1534 	v = offset0 + arena->size - arena->blocksize;
1535 	v -= (clumps+ncib-1)/ncib * arena->blocksize;
1536 	if(v < lastclumpend){
1537 		v = offset0 + arena->size - arena->blocksize;
1538 		clumps = (v-lastclumpend)/arena->blocksize * ncib;
1539 	}
1540 
1541 	if(clumps < minclumps)
1542 		print("cannot happen?\n");
1543 
1544 	/*
1545 	 * Check clumpinfo blocks against directory we created.
1546 	 * The tricky part is handling the corrupt sections of arena.
1547 	 * If possible, we remark just the affected directory entries
1548 	 * rather than slide everything down.
1549 	 *
1550 	 * Allocate clumps+1 blocks and check that we don't need
1551 	 * the last one at the end.
1552 	 */
1553 	bci = loadci(offset0, arena, clumps+1);
1554 	eci = bci+clumps+1;
1555 	bcit = cibuf;
1556 	ecit = cibuf+ncibuf;
1557 
1558 	smart = 0;	/* Somehow the smart code doesn't do corrupt clumps right. */
1559 Again:
1560 	nbad = 0;
1561 	ci = bci;
1562 	for(cit=bcit; cit<ecit && ci<eci; cit++){
1563 		if(cit->corrupt){
1564 			vlong n, m;
1565 			if(smart){
1566 				/*
1567 				 * If we can, just mark existing entries as corrupt.
1568 				 */
1569 				n = cit->corrupt;
1570 				for(xci=ci; n>0 && xci<eci; xci++)
1571 					n -= ClumpSize+xci->size;
1572 				if(n > 0 || xci >= eci)
1573 					goto Dumb;
1574 				printed = 0;
1575 				for(; ci<xci; ci++){
1576 					if(verbose && ci->type != VtCorruptType){
1577 						if(!printed){
1578 							print("marking directory %d-%d as corrupt\n",
1579 								(int)(ci-bci), (int)(xci-bci));
1580 							printed = 1;
1581 						}
1582 						print("\ttype=%d size=%d uncsize=%d score=%V\n",
1583 							ci->type, ci->size, ci->uncsize, ci->score);
1584 					}
1585 					ci->type = VtCorruptType;
1586 				}
1587 			}else{
1588 			Dumb:
1589 				print("\trewriting clump directory\n");
1590 				/*
1591 				 * Otherwise, blaze a new trail.
1592 				 */
1593 				n = cit->corrupt;
1594 				while(n > 0 && ci < eci){
1595 					if(n < ClumpSize)
1596 						sysfatal("bad math in clump corrupt");
1597 					if(n <= VtMaxLumpSize+ClumpSize)
1598 						m = n;
1599 					else{
1600 						m = VtMaxLumpSize+ClumpSize;
1601 						if(n-m < ClumpSize)
1602 							m -= ClumpSize;
1603 					}
1604 					ci->type = VtCorruptType;
1605 					ci->size = m-ClumpSize;
1606 					ci->uncsize = m-ClumpSize;
1607 					memset(ci->score, 0, VtScoreSize);
1608 					ci++;
1609 					n -= m;
1610 				}
1611 			}
1612 			continue;
1613 		}
1614 		if(clumpinfocmp(&cit->ci, ci) != 0){
1615 			if(verbose && (smart || verbose>1)){
1616 				print("clumpinfo %d\n", (int)(ci-bci));
1617 				print("\twant: %d %d %d %V\n",
1618 					cit->ci.type, cit->ci.size,
1619 					cit->ci.uncsize, cit->ci.score);
1620 				print("\thave: %d %d %d %V\n",
1621 					ci->type, ci->size,
1622 					ci->uncsize, ci->score);
1623 			}
1624 			*ci = cit->ci;
1625 			nbad++;
1626 		}
1627 		ci++;
1628 	}
1629 	if(ci >= eci || cit < ecit){
1630 		print("ran out of space editing existing directory; rewriting\n");
1631 		print("# eci %ld ci %ld ecit %ld cit %ld\n", eci-bci, ci-bci, ecit-bcit, cit-bcit);
1632 		assert(smart);	/* can't happen second time thru */
1633 		smart = 0;
1634 		goto Again;
1635 	}
1636 
1637 	assert(ci <= eci);
1638 	arena->diskstats.clumps = ci-bci;
1639 	eoffset = writeci(offset0, arena, bci, ci-bci);
1640 	if(sealing && fix)
1641 		sbrollback(&newsha, v);
1642 print("eoffset=%lld lastclumpend=%lld diff=%lld unseal=%d\n", eoffset, lastclumpend, eoffset-lastclumpend, unseal);
1643 	if(lastclumpend > eoffset)
1644 		print("arena directory overwrote blocks!  cannot happen!\n");
1645 	free(bci);
1646 	if(smart && nbad)
1647 		print("arena directory has %d bad or missing entries\n", nbad);
1648 Nocib:
1649 	if(eoffset - lastclumpend > 64*1024 && (!arena->diskstats.sealed || unseal)){
1650 		if(arena->diskstats.sealed)
1651 			print("unsealing arena\n");
1652 		sealing = 0;
1653 		memset(oldscore, 0, VtScoreSize);
1654 	}
1655 
1656 	/*
1657 	 * Finish the SHA1 of the new data - only meaningful
1658 	 * if we've been writing to disk (`fix').
1659 	 */
1660 	arena->diskstats.sealed = sealing;
1661 	arena->memstats = arena->diskstats;
1662 	if(sealing && fix){
1663 		uchar tbuf[MaxDiskBlock];
1664 
1665 		sbdiskhash(&newsha, toffset);
1666 		memset(tbuf, 0, sizeof tbuf);
1667 		packarena(arena, tbuf);
1668 		sbupdate(&newsha, tbuf, toffset, arena->blocksize);
1669 		sbscore(&newsha, score);
1670 	}
1671 }
1672 
1673 void
dumparena(vlong offset,int anum,Arena * arena)1674 dumparena(vlong offset, int anum, Arena *arena)
1675 {
1676 	char buf[1000];
1677 	vlong o, e;
1678 	int fd, n;
1679 
1680 	snprint(buf, sizeof buf, "%s.%d", dumpbase, anum);
1681 	if((fd = create(buf, OWRITE, 0666)) < 0){
1682 		fprint(2, "create %s: %r\n", buf);
1683 		return;
1684 	}
1685 	e = offset+arena->size;
1686 	for(o=offset; o<e; o+=n){
1687 		n = 4*M;
1688 		if(o+n > e)
1689 			n = e-o;
1690 		if(pwrite(fd, pagein(o, n), n, o-offset) != n){
1691 			fprint(2, "write %s at %#llux: %r\n", buf, o-offset);
1692 			return;
1693 		}
1694 	}
1695 }
1696 
1697 void
checkarena(vlong offset,int anum)1698 checkarena(vlong offset, int anum)
1699 {
1700 	uchar dbuf[MaxDiskBlock];
1701 	uchar *p, oldscore[VtScoreSize], score[VtScoreSize];
1702 	Arena arena, oarena;
1703 	ArenaHead head;
1704 	Info *fmt, *fmta;
1705 	int sz;
1706 
1707 	print("# arena %d: offset %#llux\n", anum, offset);
1708 
1709 	if(offset >= partend){
1710 		print("arena offset out of bounds\n");
1711 		return;
1712 	}
1713 
1714 	guessarena(offset, anum, &head, &arena, oldscore, score);
1715 
1716 	if(verbose){
1717 		print("#\tversion=%d name=%s blocksize=%d size=%z",
1718 			head.version, head.name, head.blocksize, head.size);
1719 		if(head.clumpmagic)
1720 			print(" clumpmagic=%#.8ux", head.clumpmagic);
1721 		print("\n#\tclumps=%d cclumps=%d used=%,lld uncsize=%,lld\n",
1722 			arena.diskstats.clumps, arena.diskstats.cclumps,
1723 			arena.diskstats.used, arena.diskstats.uncsize);
1724 		print("#\tctime=%t\n", arena.ctime);
1725 		print("#\twtime=%t\n", arena.wtime);
1726 		if(arena.diskstats.sealed)
1727 			print("#\tsealed score=%V\n", score);
1728 	}
1729 
1730 	if(dumpbase){
1731 		dumparena(offset, anum, &arena);
1732 		return;
1733 	}
1734 
1735 	memset(dbuf, 0, sizeof dbuf);
1736 	packarenahead(&head, dbuf);
1737 	p = pagein(offset, arena.blocksize);
1738 	if(memcmp(dbuf, p, arena.blocksize) != 0){
1739 		print("on-disk arena header incorrect\n");
1740 		showdiffs(dbuf, p, arena.blocksize,
1741 			arena.version==ArenaVersion4 ? headinfo4 : headinfo5);
1742 	}
1743 	memmove(p, dbuf, arena.blocksize);
1744 
1745 	memset(dbuf, 0, sizeof dbuf);
1746 	packarena(&arena, dbuf);
1747 	if(arena.diskstats.sealed)
1748 		scorecp(dbuf+arena.blocksize-VtScoreSize, score);
1749 	p = pagein(offset+arena.size-arena.blocksize, arena.blocksize);
1750 	memset(&oarena, 0, sizeof oarena);
1751 	unpackarena(&oarena, p);
1752 	if(arena.version == ArenaVersion4){
1753 		sz = ArenaSize4;
1754 		fmt = tailinfo4;
1755 		fmta = tailinfo4a;
1756 	}else{
1757 		sz = ArenaSize5;
1758 		fmt = tailinfo5;
1759 		fmta = tailinfo5a;
1760 	}
1761 	if(p[sz] == 1){
1762 		fmt = fmta;
1763 		if(oarena.diskstats.sealed){
1764 			/*
1765 			 * some arenas were sealed with the extension
1766 			 * before we adopted the convention that if it didn't
1767 			 * add new information it gets dropped.
1768 			 */
1769 			_packarena(&arena, dbuf, 1);
1770 		}
1771 	}
1772 	if(memcmp(dbuf, p, arena.blocksize-VtScoreSize) != 0){
1773 		print("on-disk arena tail incorrect\n");
1774 		showdiffs(dbuf, p, arena.blocksize-VtScoreSize, fmt);
1775 	}
1776 	if(arena.diskstats.sealed){
1777 		if(oarena.diskstats.sealed)
1778 		if(scorecmp(p+arena.blocksize-VtScoreSize, oldscore) != 0){
1779 			print("on-disk arena seal score incorrect\n");
1780 			print("\tcorrect=%V\n", oldscore);
1781 			print("\t   disk=%V\n", p+arena.blocksize-VtScoreSize);
1782 		}
1783 		if(fix && scorecmp(p+arena.blocksize-VtScoreSize, score) != 0){
1784 			print("%ssealing arena%s: %V\n",
1785 				oarena.diskstats.sealed ? "re" : "",
1786 				scorecmp(oldscore, score) == 0 ?
1787 					"" : " after changes", score);
1788 		}
1789 	}
1790 	memmove(p, dbuf, arena.blocksize);
1791 
1792 	pageout();
1793 }
1794 
1795 AMapN*
buildamap(void)1796 buildamap(void)
1797 {
1798 	uchar *p;
1799 	vlong o;
1800 	ArenaHead h;
1801 	AMapN *an;
1802 	AMap *m;
1803 
1804 	an = vtmallocz(sizeof *an);
1805 	for(o=ap.arenabase; o<partend; o+=arenasize){
1806 		p = pagein(o, Block);
1807 		if(unpackarenahead(&h, p) >= 0){
1808 			an->map = vtrealloc(an->map, (an->n+1)*sizeof an->map[0]);
1809 			m = &an->map[an->n++];
1810 			m->start = o;
1811 			m->stop = o+h.size;
1812 			strcpy(m->name, h.name);
1813 		}
1814 	}
1815 	return an;
1816 }
1817 
1818 void
checkmap(void)1819 checkmap(void)
1820 {
1821 	char *s;
1822 	uchar *p;
1823 	int i, len;
1824 	AMapN *an;
1825 	Fmt fmt;
1826 
1827 	an = buildamap();
1828 	fmtstrinit(&fmt);
1829 	fmtprint(&fmt, "%ud\n", an->n);
1830 	for(i=0; i<an->n; i++)
1831 		fmtprint(&fmt, "%s\t%lld\t%lld\n",
1832 			an->map[i].name, an->map[i].start, an->map[i].stop);
1833 	s = fmtstrflush(&fmt);
1834 	len = strlen(s);
1835 	if(len > ap.tabsize){
1836 		print("arena partition map too long: need %z bytes have %z\n",
1837 			(vlong)len, (vlong)ap.tabsize);
1838 		len = ap.tabsize;
1839 	}
1840 
1841 	if(ap.tabsize >= 4*M){	/* can't happen - max arenas is 2000 */
1842 		print("arena partition map *way* too long\n");
1843 		return;
1844 	}
1845 
1846 	p = pagein(ap.tabbase, ap.tabsize);
1847 	if(memcmp(p, s, len) != 0){
1848 		print("arena partition map incorrect; rewriting.\n");
1849 		memmove(p, s, len);
1850 	}
1851 	pageout();
1852 }
1853 
1854 int mainstacksize = 512*1024;
1855 
1856 void
threadmain(int argc,char ** argv)1857 threadmain(int argc, char **argv)
1858 {
1859 	int mode;
1860 
1861 	mode = OREAD;
1862 	readonly = 1;
1863 	ARGBEGIN{
1864 	case 'U':
1865 		unseal = 1;
1866 		break;
1867 	case 'a':
1868 		arenasize = unittoull(EARGF(usage()));
1869 		break;
1870 	case 'b':
1871 		ap.blocksize = unittoull(EARGF(usage()));
1872 		break;
1873 	case 'f':
1874 		fix = 1;
1875 		mode = ORDWR;
1876 		readonly = 0;
1877 		break;
1878 	case 'n':
1879 		basename = EARGF(usage());
1880 		break;
1881 	case 'v':
1882 		verbose++;
1883 		break;
1884 	case 'x':
1885 		dumpbase = EARGF(usage());
1886 		break;
1887 	default:
1888 		usage();
1889 	}ARGEND
1890 
1891 	if(argc != 1 && argc != 2)
1892 		usage();
1893 
1894 	file = argv[0];
1895 
1896 	ventifmtinstall();
1897 	fmtinstall('z', zfmt);
1898 	fmtinstall('t', tfmt);
1899 	quotefmtinstall();
1900 
1901 	part = initpart(file, mode|ODIRECT);
1902 	if(part == nil)
1903 		sysfatal("can't open %s: %r", file);
1904 	partend = part->size;
1905 
1906 	if(isonearena()){
1907 		checkarena(0, -1);
1908 		threadexitsall(nil);
1909 	}
1910 	checkarenas(argc > 1 ? argv[1] : nil);
1911 	checkmap();
1912 	threadexitsall(nil);
1913 }
1914