1 /*
2  * Disk cache.
3  *
4  * Caches raw disk blocks.  Getdblock() gets a block, putdblock puts it back.
5  * Getdblock has a mode parameter that determines i/o and access to a block:
6  * if mode is OREAD or ORDWR, it is read from disk if not already in memory.
7  * If mode is ORDWR or OWRITE, it is locked for exclusive use before being returned.
8  * It is *not* marked dirty -- once changes have been made, they should be noted
9  * by using dirtydblock() before putdblock().
10  *
11  * There is a global cache lock as well as a lock on each block.
12  * Within a thread, the cache lock can be acquired while holding a block lock,
13  * but not vice versa; and a block cannot be locked if you already hold the lock
14  * on another block.
15  *
16  * The flush proc writes out dirty blocks in batches, one batch per dirty tag.
17  * For example, the DirtyArena blocks are all written to disk before any of the
18  * DirtyArenaCib blocks.
19  *
20  * This code used to be in charge of flushing the dirty index blocks out to
21  * disk, but updating the index turned out to benefit from extra care.
22  * Now cached index blocks are never marked dirty.  The index.c code takes
23  * care of updating them behind our back, and uses _getdblock to update any
24  * cached copies of the blocks as it changes them on disk.
25  */
26 
27 #include "stdinc.h"
28 #include "dat.h"
29 #include "fns.h"
30 
typedef struct DCache	DCache;

enum
{
	HashLog		= 9,			/* log2 of the number of hash buckets */
	HashSize	= 1<<HashLog,		/* number of hash chains in dcache.heads */
	HashMask	= HashSize - 1,		/* reduces a hash value to a bucket index */
};
39 
/*
 * The global disk cache.  See the comment at the top of the file for
 * the lock ordering rules between lock and the per-block locks.
 */
struct DCache
{
	QLock		lock;			/* global cache lock */
	RWLock		dirtylock;		/* must be held to inspect or set b->dirty */
	Rendez		full;			/* where threads sleep waiting for a free block */
	Round		round;			/* drives the periodic flush of dirty blocks */
	DBlock		*free;			/* list of available lumps */
	u32int		now;			/* ticks for usage timestamps */
	int		size;			/* max. size of any block; allocated to each block */
	DBlock		**heads;		/* hash table for finding address */
	int		nheap;			/* number of available victims */
	DBlock		**heap;			/* heap for locating victims */
	int		nblocks;		/* number of blocks allocated */
	DBlock		*blocks;		/* array of block descriptors */
	DBlock		**write;		/* array of block pointers to be written */
	u8int		*mem;			/* memory for all block descriptors */
	int		ndirty;			/* number of dirty blocks */
	int		maxdirty;		/* max. number of dirty blocks */
};
59 
/*
 * A (partition, address) pair.
 * NOTE(review): Ra is not referenced anywhere in this part of the file --
 * possibly a leftover from removed read-ahead code; confirm before removing.
 */
typedef struct Ra Ra;
struct Ra
{
	Part *part;		/* partition the block lives on */
	u64int addr;		/* block address within part */
};
66 
static DCache	dcache;			/* the single global disk cache */

static int	downheap(int i, DBlock *b);
static int	upheap(int i, DBlock *b);
static DBlock	*bumpdblock(void);
static void	delheap(DBlock *db);
static void	fixheap(int i, DBlock *b);
static void	flushproc(void*);
static void	writeproc(void*);
76 
/*
 * Initialize the global disk cache with mem bytes of block storage.
 * Every slot is maxblocksize bytes so any partition's blocks fit;
 * at most 2/3 of the blocks may be dirty at once.  Also starts the
 * flush proc and the delayed-kick proc for the flush round.
 */
void
initdcache(u32int mem)
{
	DBlock *b, *last;
	u32int nblocks, blocksize;
	int i;
	u8int *p;

	if(mem < maxblocksize * 2)
		sysfatal("need at least %d bytes for the disk cache", maxblocksize * 2);
	if(maxblocksize == 0)
		sysfatal("no max. block size given for disk cache");
	blocksize = maxblocksize;
	nblocks = mem / blocksize;
	dcache.full.l = &dcache.lock;	/* rendezvous sleeps under the cache lock */
	dcache.nblocks = nblocks;
	dcache.maxdirty = (nblocks * 2) / 3;
	trace(TraceProc, "initialize disk cache with %d blocks of %d bytes, maximum %d dirty blocks\n",
			nblocks, blocksize, dcache.maxdirty);
	dcache.size = blocksize;
	dcache.heads = MKNZ(DBlock*, HashSize);
	dcache.heap = MKNZ(DBlock*, nblocks);
	dcache.blocks = MKNZ(DBlock, nblocks);
	dcache.write = MKNZ(DBlock*, nblocks);
	/*
	 * +1 block covers the alignment rounding below.
	 * NOTE(review): the extra 128 blocks look like headroom -- confirm why.
	 */
	dcache.mem = MKNZ(u8int, (nblocks+1+128) * blocksize);

	last = nil;
	/* round up to a blocksize boundary before carving out block buffers */
	p = (u8int*)(((uintptr)dcache.mem+blocksize-1)&~(uintptr)(blocksize-1));
	for(i = 0; i < nblocks; i++){
		b = &dcache.blocks[i];
		b->data = &p[i * blocksize];
		b->heap = TWID32;	/* TWID32 means "not in the victim heap" */
		b->writedonechan = chancreate(sizeof(void*), 1);
		/* thread every block onto the free list */
		b->next = last;
		last = b;
	}

	dcache.free = last;
	dcache.nheap = 0;
	setstat(StatDcacheSize, nblocks);
	initround(&dcache.round, "dcache", 120*1000);	/* 120*1000 -- presumably ms; confirm */

	vtproc(flushproc, nil);
	vtproc(delaykickroundproc, &dcache.round);
}
122 
123 static u32int
pbhash(u64int addr)124 pbhash(u64int addr)
125 {
126 	u32int h;
127 
128 #define hashit(c)	((((c) * 0x6b43a9b5) >> (32 - HashLog)) & HashMask)
129 	h = (addr >> 32) ^ addr;
130 	return hashit(h);
131 }
132 
133 DBlock*
getdblock(Part * part,u64int addr,int mode)134 getdblock(Part *part, u64int addr, int mode)
135 {
136 	DBlock *b;
137 
138 	b = _getdblock(part, addr, mode, 1);
139 	if(mode == OREAD || mode == ORDWR)
140 		addstat(StatDcacheRead, 1);
141 	if(mode == OWRITE || mode == ORDWR)
142 		addstat(StatDcacheWrite, 1);
143 	return b;
144 }
145 
/*
 * Look up (and optionally load) the cache block for addr on part.
 *
 * mode selects locking and i/o: OREAD takes b->lock as reader and fills
 * the block from disk if it is not already cached; ORDWR and OWRITE take
 * the lock as writer.  OWRITE never reads from disk -- the unloaded tail
 * is zeroed instead.  If load is 0 only an in-cache lookup is done; on a
 * miss nil is returned and nothing is evicted.
 *
 * Returns nil (with seterr) if the partition's block size exceeds the
 * cache block size, or if the disk read fails.
 */
DBlock*
_getdblock(Part *part, u64int addr, int mode, int load)
{
	DBlock *b;
	u32int h, size, ms;

	ms = 0;
	trace(TraceBlock, "getdblock enter %s 0x%llux", part->name, addr);
	size = part->blocksize;
	if(size > dcache.size){
		seterr(EAdmin, "block size %d too big for cache with size %d", size, dcache.size);
		if(load)
			addstat(StatDcacheLookup, 1);
		return nil;
	}
	h = pbhash(addr);

	/*
	 * look for the block in the cache
	 */
	qlock(&dcache.lock);
again:
	for(b = dcache.heads[h]; b != nil; b = b->next){
		if(b->part == part && b->addr == addr){
			if(load)
				addstat2(StatDcacheHit, 1, StatDcacheLookup, 1);
			goto found;
		}
	}

	/*
	 * missed: locate the block with the oldest second to last use.
	 * remove it from the heap, and fix up the heap.
	 */
	if(!load){
		qunlock(&dcache.lock);
		return nil;
	}

	/*
	 * Only start timer here, on cache miss - calling msec() on plain cache hits
	 * makes cache hits system-call bound.
	 */
	ms = msec();
	addstat2(StatDcacheLookup, 1, StatDcacheMiss, 1);

	b = bumpdblock();
	if(b == nil){
		/* every block is referenced or dirty: wait for one to be freed */
		trace(TraceBlock, "all disk cache blocks in use");
		addstat(StatDcacheStall, 1);
		rsleep(&dcache.full);
		addstat(StatDcacheStall, -1);
		goto again;
	}

	assert(!b->dirty);

	/*
	 * the new block has no last use, so assume it happens sometime in the middle
ZZZ this is not reasonable
	 */
	b->used = (b->used2 + dcache.now) / 2;

	/*
	 * rechain the block on the correct hash chain
	 */
	b->next = dcache.heads[h];
	dcache.heads[h] = b;
	if(b->next != nil)
		b->next->prev = b;
	b->prev = nil;

	b->addr = addr;
	b->part = part;
	b->size = 0;	/* no valid bytes in b->data yet; forces the fill below */

found:
	b->ref++;
	b->used2 = b->used;
	b->used = dcache.now++;
	if(b->heap != TWID32)
		fixheap(b->heap, b);

	/* first writer for this partition: start its write proc lazily */
	if((mode == ORDWR || mode == OWRITE) && part->writechan == nil){
		trace(TraceBlock, "getdblock allocwriteproc %s", part->name);
		part->writechan = chancreate(sizeof(DBlock*), dcache.nblocks);
		vtproc(writeproc, part);
	}
	qunlock(&dcache.lock);

	trace(TraceBlock, "getdblock lock");
	addstat(StatDblockStall, 1);
	if(mode == OREAD)
		rlock(&b->lock);
	else
		wlock(&b->lock);
	addstat(StatDblockStall, -1);
	trace(TraceBlock, "getdblock locked");

	/*
	 * If the cached prefix is smaller than the partition block size,
	 * fill in the rest.  A reader must upgrade to the write lock for
	 * the fill and downgrade afterwards; another thread may complete
	 * the fill during the upgrade gap, which is why b->size is
	 * re-tested with b->size < size below.
	 */
	if(b->size != size){
		if(mode == OREAD){
			addstat(StatDblockStall, 1);
			runlock(&b->lock);
			wlock(&b->lock);
			addstat(StatDblockStall, -1);
		}
		if(b->size < size){
			if(mode == OWRITE)
				/* caller will overwrite the block: zero the tail instead of reading it */
				memset(&b->data[b->size], 0, size - b->size);
			else{
				trace(TraceBlock, "getdblock readpart %s 0x%llux", part->name, addr);
				diskaccess(0);
				if(readpart(part, addr + b->size, &b->data[b->size], size - b->size) < 0){
					b->mode = ORDWR;	/* so putdblock wunlocks */
					putdblock(b);
					return nil;
				}
				trace(TraceBlock, "getdblock readpartdone");
				addstat(StatApartRead, 1);
				addstat(StatApartReadBytes, size-b->size);
			}
		}
		b->size = size;
		if(mode == OREAD){
			addstat(StatDblockStall, 1);
			wunlock(&b->lock);
			rlock(&b->lock);
			addstat(StatDblockStall, -1);
		}
	}

	b->mode = mode;	/* remembered so putdblock knows which unlock to use */
	trace(TraceBlock, "getdblock exit");
	if(ms)
		addstat(StatDcacheLookupTime, msec() - ms);
	return b;
}
283 
284 void
putdblock(DBlock * b)285 putdblock(DBlock *b)
286 {
287 	if(b == nil)
288 		return;
289 
290 	trace(TraceBlock, "putdblock %s 0x%llux", b->part->name, b->addr);
291 
292 	if(b->mode == OREAD)
293 		runlock(&b->lock);
294 	else
295 		wunlock(&b->lock);
296 
297 	qlock(&dcache.lock);
298 	if(--b->ref == 0 && !b->dirty){
299 		if(b->heap == TWID32)
300 			upheap(dcache.nheap++, b);
301 		rwakeupall(&dcache.full);
302 	}
303 	qunlock(&dcache.lock);
304 }
305 
306 void
dirtydblock(DBlock * b,int dirty)307 dirtydblock(DBlock *b, int dirty)
308 {
309 	int odirty;
310 
311 	trace(TraceBlock, "dirtydblock enter %s 0x%llux %d from 0x%lux",
312 		b->part->name, b->addr, dirty, getcallerpc(&b));
313 	assert(b->ref != 0);
314 	assert(b->mode==ORDWR || b->mode==OWRITE);
315 
316 	odirty = b->dirty;
317 	if(b->dirty)
318 		assert(b->dirty == dirty);
319 	else
320 		b->dirty = dirty;
321 
322 	qlock(&dcache.lock);
323 	if(!odirty){
324 		dcache.ndirty++;
325 		setstat(StatDcacheDirty, dcache.ndirty);
326 		if(dcache.ndirty >= dcache.maxdirty)
327 			kickround(&dcache.round, 0);
328 		else
329 			delaykickround(&dcache.round);
330 	}
331 	qunlock(&dcache.lock);
332 }
333 
334 static void
unchain(DBlock * b)335 unchain(DBlock *b)
336 {
337 	ulong h;
338 
339 	/*
340 	 * unchain the block
341 	 */
342 	if(b->prev == nil){
343 		h = pbhash(b->addr);
344 		if(dcache.heads[h] != b)
345 			sysfatal("bad hash chains in disk cache");
346 		dcache.heads[h] = b->next;
347 	}else
348 		b->prev->next = b->next;
349 	if(b->next != nil)
350 		b->next->prev = b->prev;
351 }
352 
353 /*
354  * remove some block from use and update the free list and counters
355  */
356 static DBlock*
bumpdblock(void)357 bumpdblock(void)
358 {
359 	DBlock *b;
360 
361 	trace(TraceBlock, "bumpdblock enter");
362 	b = dcache.free;
363 	if(b != nil){
364 		dcache.free = b->next;
365 		return b;
366 	}
367 
368 	if(dcache.ndirty >= dcache.maxdirty)
369 		kickdcache();
370 
371 	/*
372 	 * remove blocks until we find one that is unused
373 	 * referenced blocks are left in the heap even though
374 	 * they can't be scavenged; this is simple a speed optimization
375 	 */
376 	for(;;){
377 		if(dcache.nheap == 0){
378 			kickdcache();
379 			trace(TraceBlock, "bumpdblock gotnothing");
380 			return nil;
381 		}
382 		b = dcache.heap[0];
383 		delheap(b);
384 		if(!b->ref && !b->dirty)
385 			break;
386 	}
387 
388 	trace(TraceBlock, "bumpdblock bumping %s 0x%llux", b->part->name, b->addr);
389 
390 	unchain(b);
391 	return b;
392 }
393 
394 void
emptydcache(void)395 emptydcache(void)
396 {
397 	DBlock *b;
398 
399 	qlock(&dcache.lock);
400 	while(dcache.nheap > 0){
401 		b = dcache.heap[0];
402 		delheap(b);
403 		if(!b->ref && !b->dirty){
404 			unchain(b);
405 			b->next = dcache.free;
406 			dcache.free = b;
407 		}
408 	}
409 	qunlock(&dcache.lock);
410 }
411 
412 /*
413  * delete an arbitrary block from the heap
414  */
415 static void
delheap(DBlock * db)416 delheap(DBlock *db)
417 {
418 	if(db->heap == TWID32)
419 		return;
420 	fixheap(db->heap, dcache.heap[--dcache.nheap]);
421 	db->heap = TWID32;
422 }
423 
424 /*
425  * push an element up or down to it's correct new location
426  */
427 static void
fixheap(int i,DBlock * b)428 fixheap(int i, DBlock *b)
429 {
430 	if(upheap(i, b) == i)
431 		downheap(i, b);
432 }
433 
434 static int
upheap(int i,DBlock * b)435 upheap(int i, DBlock *b)
436 {
437 	DBlock *bb;
438 	u32int now;
439 	int p;
440 
441 	now = dcache.now;
442 	for(; i != 0; i = p){
443 		p = (i - 1) >> 1;
444 		bb = dcache.heap[p];
445 		if(b->used2 - now >= bb->used2 - now)
446 			break;
447 		dcache.heap[i] = bb;
448 		bb->heap = i;
449 	}
450 
451 	dcache.heap[i] = b;
452 	b->heap = i;
453 	return i;
454 }
455 
456 static int
downheap(int i,DBlock * b)457 downheap(int i, DBlock *b)
458 {
459 	DBlock *bb;
460 	u32int now;
461 	int k;
462 
463 	now = dcache.now;
464 	for(; ; i = k){
465 		k = (i << 1) + 1;
466 		if(k >= dcache.nheap)
467 			break;
468 		if(k + 1 < dcache.nheap && dcache.heap[k]->used2 - now > dcache.heap[k + 1]->used2 - now)
469 			k++;
470 		bb = dcache.heap[k];
471 		if(b->used2 - now <= bb->used2 - now)
472 			break;
473 		dcache.heap[i] = bb;
474 		bb->heap = i;
475 	}
476 
477 	dcache.heap[i] = b;
478 	b->heap = i;
479 	return i;
480 }
481 
482 static void
findblock(DBlock * bb)483 findblock(DBlock *bb)
484 {
485 	DBlock *b, *last;
486 	int h;
487 
488 	last = nil;
489 	h = pbhash(bb->addr);
490 	for(b = dcache.heads[h]; b != nil; b = b->next){
491 		if(last != b->prev)
492 			sysfatal("bad prev link");
493 		if(b == bb)
494 			return;
495 		last = b;
496 	}
497 	sysfatal("block missing from hash table");
498 }
499 
/*
 * Consistency check over the whole cache, for debugging.
 * Verifies the heap ordering and indices, each block's data pointer,
 * membership of addressed blocks in their hash chains, the free list,
 * and that every block is accounted for as heaped, free, or referenced.
 * Calls sysfatal on any violation.
 */
void
checkdcache(void)
{
	DBlock *b;
	u32int size, now;
	int i, k, refed, nfree;

	qlock(&dcache.lock);
	size = dcache.size;
	now = dcache.now;
	/* heap invariant: parent used2 <= both children's (modulo tick wraparound) */
	for(i = 0; i < dcache.nheap; i++){
		if(dcache.heap[i]->heap != i)
			sysfatal("dc: mis-heaped at %d: %d", i, dcache.heap[i]->heap);
		if(i > 0 && dcache.heap[(i - 1) >> 1]->used2 - now > dcache.heap[i]->used2 - now)
			sysfatal("dc: bad heap ordering");
		k = (i << 1) + 1;
		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
			sysfatal("dc: bad heap ordering");
		k++;
		if(k < dcache.nheap && dcache.heap[i]->used2 - now > dcache.heap[k]->used2 - now)
			sysfatal("dc: bad heap ordering");
	}

	refed = 0;
	for(i = 0; i < dcache.nblocks; i++){
		b = &dcache.blocks[i];
		/*
		 * NOTE(review): initdcache rounds dcache.mem up to a blocksize
		 * boundary before assigning b->data, so this comparison against
		 * the unaligned dcache.mem looks stale -- confirm.
		 */
		if(b->data != &dcache.mem[i * size])
			sysfatal("dc: mis-blocked at %d", i);
		if(b->ref && b->heap == TWID32)
			refed++;
		if(b->addr)
			findblock(b);
		if(b->heap != TWID32
		&& dcache.heap[b->heap] != b)
			sysfatal("dc: spurious heap value");
	}

	nfree = 0;
	for(b = dcache.free; b != nil; b = b->next){
		if(b->addr != 0 || b->heap != TWID32)
			sysfatal("dc: bad free list");
		nfree++;
	}

	/* every block must be exactly one of: heaped, free, or referenced */
	if(dcache.nheap + nfree + refed != dcache.nblocks)
		sysfatal("dc: missing blocks: %d %d %d", dcache.nheap, refed, dcache.nblocks);
	qunlock(&dcache.lock);
}
548 
/*
 * Force a flush of all dirty blocks and wait for the round to complete
 * (kickround with wait==1).
 */
void
flushdcache(void)
{
	trace(TraceProc, "flushdcache enter");
	kickround(&dcache.round, 1);
	trace(TraceProc, "flushdcache exit");
}
556 
/*
 * Prod the flush round without waiting for it to finish.
 */
void
kickdcache(void)
{
	kickround(&dcache.round, 0);
}
562 
563 static int
parallelwrites(DBlock ** b,DBlock ** eb,int dirty)564 parallelwrites(DBlock **b, DBlock **eb, int dirty)
565 {
566 	DBlock **p, **q;
567 	Part *part;
568 
569 	for(p=b; p<eb && (*p)->dirty == dirty; p++){
570 		assert(b<=p && p<eb);
571 		sendp((*p)->part->writechan, *p);
572 	}
573 	q = p;
574 	for(p=b; p<q; p++){
575 		assert(b<=p && p<eb);
576 		recvp((*p)->writedonechan);
577 	}
578 
579 	/*
580 	 * Flush the partitions that have been written to.
581 	 */
582 	part = nil;
583 	for(p=b; p<q; p++){
584 		if(part != (*p)->part){
585 			part = (*p)->part;
586 			flushpart(part);	/* what if it fails? */
587 		}
588 	}
589 
590 	return p-b;
591 }
592 
593 /*
594  * Sort first by dirty flag, then by partition, then by address in partition.
595  */
596 static int
writeblockcmp(const void * va,const void * vb)597 writeblockcmp(const void *va, const void *vb)
598 {
599 	DBlock *a, *b;
600 
601 	a = *(DBlock**)va;
602 	b = *(DBlock**)vb;
603 
604 	if(a->dirty != b->dirty)
605 		return a->dirty - b->dirty;
606 	if(a->part != b->part){
607 		if(a->part < b->part)
608 			return -1;
609 		if(a->part > b->part)
610 			return 1;
611 	}
612 	if(a->addr < b->addr)
613 		return -1;
614 	return 1;
615 }
616 
/*
 * Flush proc: runs once per flush round.  Collects every dirty block,
 * sorts by (dirty tag, partition, address), and writes the blocks out
 * one dirty tag at a time, so that e.g. all DirtyArena blocks reach
 * disk before any DirtyArenaCib block (see the file header comment).
 */
static void
flushproc(void *v)
{
	int i, j, n;
	ulong t0;
	DBlock *b, **write;

	USED(v);
	threadsetname("flushproc");
	for(;;){
		waitforkick(&dcache.round);

		trace(TraceWork, "start");
		t0 = nsec()/1000;
		trace(TraceProc, "build t=%lud", (ulong)(nsec()/1000)-t0);

		/* gather the dirty blocks into the scratch write array */
		write = dcache.write;
		n = 0;
		for(i=0; i<dcache.nblocks; i++){
			b = &dcache.blocks[i];
			if(b->dirty)
				write[n++] = b;
		}

		qsort(write, n, sizeof(write[0]), writeblockcmp);

		/* Write each stage of blocks out. */
		trace(TraceProc, "writeblocks t=%lud", (ulong)(nsec()/1000)-t0);
		i = 0;
		for(j=1; j<DirtyMax; j++){
			trace(TraceProc, "writeblocks.%d t=%lud",
				j, (ulong)(nsec()/1000)-t0);
			i += parallelwrites(write+i, write+n, j);
		}
		/* all blocks must be consumed; leftovers mean a bogus dirty tag */
		if(i != n){
			fprint(2, "in flushproc i=%d n=%d\n", i, n);
			for(i=0; i<n; i++)
				fprint(2, "\tblock %d: dirty=%d\n",
					i, write[i]->dirty);
			abort();
		}

		/*
		 * b->dirty is protected by b->lock while ndirty is protected
		 * by dcache.lock, so the --ndirty below is the delayed one
		 * from clearing b->dirty in the write proc.  It may happen
		 * that some other proc has come along and redirtied b since
		 * the write.  That's okay, it just means that ndirty may be
		 * one too high until we catch up and do the decrement.
		 */
		/* note: j equals DirtyMax here -- the trace below uses the loop's final value */
		trace(TraceProc, "undirty.%d t=%lud", j, (ulong)(nsec()/1000)-t0);
		qlock(&dcache.lock);
		for(i=0; i<n; i++){
			b = write[i];
			--dcache.ndirty;
			/* now clean and unreferenced: return it to the victim heap */
			if(b->ref == 0 && b->heap == TWID32){
				upheap(dcache.nheap++, b);
				rwakeupall(&dcache.full);
			}
		}
		setstat(StatDcacheDirty, dcache.ndirty);
		qunlock(&dcache.lock);
		addstat(StatDcacheFlush, 1);
		trace(TraceWork, "finish");
	}
}
683 
/*
 * Per-partition write proc.  Receives blocks on p->writechan, writes
 * each to disk under the block's write lock, clears b->dirty, and
 * acknowledges completion on b->writedonechan.
 *
 * NOTE(review): on a write error the block is still marked clean after
 * logging, so the dirty data will not be retried -- confirm intended.
 */
static void
writeproc(void *v)
{
	DBlock *b;
	Part *p;

	p = v;

	threadsetname("writeproc:%s", p->name);
	for(;;){
		b = recvp(p->writechan);
		trace(TraceWork, "start");
		assert(b->part == p);
		trace(TraceProc, "wlock %s 0x%llux", p->name, b->addr);
		wlock(&b->lock);
		trace(TraceProc, "writepart %s 0x%llux", p->name, b->addr);
		diskaccess(0);
		if(writepart(p, b->addr, b->data, b->size) < 0)
			fprint(2, "%s: writeproc: part %s addr 0x%llux: write error: %r\n",
				argv0, p->name, b->addr);
		addstat(StatApartWrite, 1);
		addstat(StatApartWriteBytes, b->size);
		b->dirty = 0;
		wunlock(&b->lock);
		trace(TraceProc, "finish %s 0x%llux", p->name, b->addr);
		trace(TraceWork, "finish");
		sendp(b->writedonechan, b);
	}
}
713