1 /* $NetBSD: rcache.c,v 1.11 2002/02/19 23:11:28 lukem Exp $ */ 2 3 /*- 4 * Copyright (c) 1999 The NetBSD Foundation, Inc. 5 * All rights reserved. 6 * 7 * This code is derived from software contributed to The NetBSD Foundation 8 * by Martin J. Laubach <mjl@emsi.priv.at> and 9 * Manuel Bouyer <Manuel.Bouyer@lip6.fr>. 10 * 11 * Redistribution and use in source and binary forms, with or without 12 * modification, are permitted provided that the following conditions 13 * are met: 14 * 1. Redistributions of source code must retain the above copyright 15 * notice, this list of conditions and the following disclaimer. 16 * 2. Redistributions in binary form must reproduce the above copyright 17 * notice, this list of conditions and the following disclaimer in the 18 * documentation and/or other materials provided with the distribution. 19 * 3. All advertising materials mentioning features or use of this software 20 * must display the following acknowledgement: 21 * This product includes software developed by the NetBSD 22 * Foundation, Inc. and its contributors. 23 * 4. Neither the name of The NetBSD Foundation nor the names of its 24 * contributors may be used to endorse or promote products derived 25 * from this software without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS 28 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 29 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 30 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS 31 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 32 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 33 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 34 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 35 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 36 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 37 * POSSIBILITY OF SUCH DAMAGE. 38 */ 39 40 #include <sys/cdefs.h> 41 #ifndef lint 42 __RCSID("$NetBSD: rcache.c,v 1.11 2002/02/19 23:11:28 lukem Exp $"); 43 #endif /* not lint */ 44 45 #include <sys/types.h> 46 #include <sys/uio.h> 47 #include <sys/mman.h> 48 #include <sys/param.h> 49 #include <sys/sysctl.h> 50 #include <ufs/ufs/dinode.h> 51 52 #include <stdio.h> 53 #include <stdlib.h> 54 #include <unistd.h> 55 #include <fcntl.h> 56 #include <errno.h> 57 #include <string.h> 58 59 #include "dump.h" 60 61 /*-----------------------------------------------------------------------*/ 62 #define MAXCACHEBUFS 512 /* max 512 buffers */ 63 #define MAXMEMPART 6 /* max 15% of the user mem */ 64 65 /*-----------------------------------------------------------------------*/ 66 struct cheader { 67 volatile size_t count; 68 }; 69 70 struct cdesc { 71 volatile daddr_t blkstart; 72 volatile daddr_t blkend;/* start + nblksread */ 73 volatile daddr_t blocksRead; 74 volatile size_t time; 75 #ifdef DIAGNOSTICS 76 volatile pid_t owner; 77 #endif 78 }; 79 80 static int findlru(void); 81 82 static void *shareBuffer = NULL; 83 static struct cheader *cheader; 84 static struct cdesc *cdesc; 85 static char *cdata; 86 static int cachebufs; 87 static int nblksread; 88 89 #ifdef STATS 90 static int nreads; 91 static int nphysread; 92 static int64_t readsize; 93 static int64_t physreadsize; 94 #endif 95 96 #define CDATA(i) (cdata + ((i) * nblksread * dev_bsize)) 97 98 void 99 initcache(int cachesize, int readblksize) 100 { 101 size_t len; 102 size_t sharedSize; 103 104 nblksread = (readblksize + ufsib->ufs_bsize - 1) / ufsib->ufs_bsize; 105 if(cachesize == -1) { /* Compute from memory available */ 106 int usermem; 107 int mib[2] = { CTL_HW, HW_USERMEM }; 108 109 len = sizeof(usermem); 110 if (sysctl(mib, 2, &usermem, &len, NULL, 0) < 0) { 111 msg("sysctl(hw.usermem) failed: %s\n", strerror(errno)); 112 return; 113 } 114 cachebufs = (usermem / MAXMEMPART) / (nblksread * dev_bsize); 115 } else { /* User specified */ 116 cachebufs = cachesize; 117 } 118 119 if(cachebufs) { /* Don't allocate if zero --> no caching */ 120 if (cachebufs > MAXCACHEBUFS) 121 cachebufs = MAXCACHEBUFS; 122 123 sharedSize = sizeof(struct cheader) + 124 sizeof(struct cdesc) * cachebufs + 125 nblksread * cachebufs * dev_bsize; 126 #ifdef STATS 127 fprintf(stderr, "Using %d buffers (%d bytes)\n", cachebufs, 128 sharedSize); 129 #endif 130 shareBuffer = mmap(NULL, sharedSize, PROT_READ | PROT_WRITE, 131 MAP_ANON | MAP_SHARED, -1, 0); 132 if (shareBuffer == (void *)-1) { 133 msg("can't mmap shared memory for buffer: %s\n", 134 strerror(errno)); 135 return; 136 } 137 cheader = shareBuffer; 138 cdesc = (struct cdesc *) (((char *) shareBuffer) + 139 sizeof(struct cheader)); 140 cdata = ((char *) shareBuffer) + sizeof(struct cheader) + 141 sizeof(struct cdesc) * cachebufs; 142 143 memset(shareBuffer, '\0', sharedSize); 144 } 145 } 146 147 /* 148 * Find the cache buffer descriptor that shows the minimal access time 149 */ 150 static int 151 findlru(void) 152 { 153 int i; 154 size_t minTime = cdesc[0].time; 155 int minIdx = 0; 156 157 for (i = 0; i < cachebufs; i++) { 158 if (cdesc[i].time < minTime) { 159 minIdx = i; 160 minTime = cdesc[i].time; 161 } 162 } 163 164 return minIdx; 165 } 166 167 /* 168 * Read data directly from disk, with smart error handling. 169 * Try to recover from hard errors by reading in sector sized pieces. 170 * Error recovery is attempted at most BREADEMAX times before seeking 171 * consent from the operator to continue. 172 */ 173 174 static int breaderrors = 0; 175 #define BREADEMAX 32 176 177 void 178 rawread(daddr_t blkno, char *buf, int size) 179 { 180 int cnt, i; 181 #ifdef STATS 182 nphysread++; 183 physreadsize += size; 184 #endif 185 186 loop: 187 if (lseek(diskfd, ((off_t) blkno << dev_bshift), 0) < 0) { 188 msg("rawread: lseek fails\n"); 189 goto err; 190 } 191 if ((cnt = read(diskfd, buf, size)) == size) 192 return; 193 if (blkno + (size / dev_bsize) > ufsib->ufs_dsize) { 194 /* 195 * Trying to read the final fragment. 196 * 197 * NB - dump only works in TP_BSIZE blocks, hence 198 * rounds `dev_bsize' fragments up to TP_BSIZE pieces. 199 * It should be smarter about not actually trying to 200 * read more than it can get, but for the time being 201 * we punt and scale back the read only when it gets 202 * us into trouble. (mkm 9/25/83) 203 */ 204 size -= dev_bsize; 205 goto loop; 206 } 207 if (cnt == -1) 208 msg("read error from %s: %s: [block %d]: count=%d\n", 209 disk, strerror(errno), blkno, size); 210 else 211 msg("short read error from %s: [block %d]: count=%d, got=%d\n", 212 disk, blkno, size, cnt); 213 err: 214 if (++breaderrors > BREADEMAX) { 215 msg("More than %d block read errors from %s\n", 216 BREADEMAX, disk); 217 broadcast("DUMP IS AILING!\n"); 218 msg("This is an unrecoverable error.\n"); 219 if (!query("Do you want to attempt to continue?")){ 220 dumpabort(0); 221 /*NOTREACHED*/ 222 } else 223 breaderrors = 0; 224 } 225 /* 226 * Zero buffer, then try to read each sector of buffer separately. 227 */ 228 memset(buf, 0, size); 229 for (i = 0; i < size; i += dev_bsize, buf += dev_bsize, blkno++) { 230 if (lseek(diskfd, ((off_t)blkno << dev_bshift), 0) < 0) { 231 msg("rawread: lseek2 fails: %s!\n", 232 strerror(errno)); 233 continue; 234 } 235 if ((cnt = read(diskfd, buf, (int)dev_bsize)) == dev_bsize) 236 continue; 237 if (cnt == -1) { 238 msg("read error from %s: %s: [sector %d]: count=%ld: " 239 "%s\n", disk, strerror(errno), blkno, dev_bsize, 240 strerror(errno)); 241 continue; 242 } 243 msg("short read error from %s: [sector %d]: count=%ld, got=%d\n", 244 disk, blkno, dev_bsize, cnt); 245 } 246 } 247 248 void 249 bread(daddr_t blkno, char *buf, int size) 250 { 251 int osize = size; 252 daddr_t oblkno = blkno; 253 char *obuf = buf; 254 daddr_t numBlocks = (size + dev_bsize -1) / dev_bsize; 255 256 #ifdef STATS 257 nreads++; 258 readsize += size; 259 #endif 260 261 if (!shareBuffer) { 262 rawread(blkno, buf, size); 263 return; 264 } 265 266 if (flock(diskfd, LOCK_EX)) { 267 msg("flock(LOCK_EX) failed: %s\n", 268 strerror(errno)); 269 rawread(blkno, buf, size); 270 return; 271 } 272 273 retry: 274 while(size > 0) { 275 int i; 276 277 for (i = 0; i < cachebufs; i++) { 278 struct cdesc *curr = &cdesc[i]; 279 280 #ifdef DIAGNOSTICS 281 if (curr->owner) { 282 fprintf(stderr, "Owner is set (%d, me=%d), can" 283 "not happen.\n", curr->owner, getpid()); 284 } 285 #endif 286 287 if (curr->blkend == 0) 288 continue; 289 /* 290 * If we find a bit of the read in the buffers, 291 * now compute how many blocks we can copy, 292 * copy them out, adjust blkno, buf and size, 293 * and restart 294 */ 295 if (curr->blkstart <= blkno && 296 blkno < curr->blkend) { 297 /* Number of data blocks to be copied */ 298 int toCopy = MIN(size, 299 (curr->blkend - blkno) * dev_bsize); 300 #ifdef DIAGNOSTICS 301 if (toCopy <= 0 || 302 toCopy > nblksread * dev_bsize) { 303 fprintf(stderr, "toCopy %d !\n", 304 toCopy); 305 dumpabort(0); 306 } 307 if (CDATA(i) + (blkno - curr->blkstart) * 308 dev_bsize < CDATA(i) || 309 CDATA(i) + (blkno - curr->blkstart) * 310 dev_bsize > 311 CDATA(i) + nblksread * dev_bsize) { 312 fprintf(stderr, "%p < %p !!!\n", 313 CDATA(i) + (blkno - 314 curr->blkstart) * dev_bsize, 315 CDATA(i)); 316 fprintf(stderr, "cdesc[i].blkstart %d " 317 "blkno %d dev_bsize %ld\n", 318 curr->blkstart, blkno, dev_bsize); 319 dumpabort(0); 320 } 321 #endif 322 memcpy(buf, CDATA(i) + 323 (blkno - curr->blkstart) * dev_bsize, 324 toCopy); 325 326 buf += toCopy; 327 size -= toCopy; 328 blkno += (toCopy + dev_bsize - 1) / dev_bsize; 329 numBlocks -= 330 (toCopy + dev_bsize - 1) / dev_bsize; 331 332 curr->time = cheader->count++; 333 334 /* 335 * If all data of a cache block have been 336 * read, chances are good no more reads 337 * will occur, so expire the cache immediately 338 */ 339 340 curr->blocksRead += 341 (toCopy + dev_bsize -1) / dev_bsize; 342 if (curr->blocksRead >= nblksread) 343 curr->time = 0; 344 345 goto retry; 346 } 347 } 348 349 /* No more to do? */ 350 if (size == 0) 351 break; 352 353 /* 354 * This does actually not happen if fs blocks are not greater 355 * than nblksread. 356 */ 357 if (numBlocks > nblksread || blkno >= ufsib->ufs_dsize) { 358 rawread(oblkno, obuf, osize); 359 break; 360 } else { 361 int idx; 362 ssize_t rsize; 363 daddr_t blockBlkNo; 364 365 blockBlkNo = (blkno / nblksread) * nblksread; 366 idx = findlru(); 367 rsize = MIN(nblksread, 368 ufsib->ufs_dsize - blockBlkNo) * 369 dev_bsize; 370 371 #ifdef DIAGNOSTICS 372 if (cdesc[idx].owner) 373 fprintf(stderr, "Owner is set (%d, me=%d), can" 374 "not happen(2).\n", cdesc[idx].owner, 375 getpid()); 376 cdesc[idx].owner = getpid(); 377 #endif 378 cdesc[idx].time = cheader->count++; 379 cdesc[idx].blkstart = blockBlkNo; 380 cdesc[idx].blocksRead = 0; 381 382 if (lseek(diskfd, 383 ((off_t) (blockBlkNo) << dev_bshift), 0) < 0) { 384 msg("readBlocks: lseek fails: %s\n", 385 strerror(errno)); 386 rsize = -1; 387 } else { 388 rsize = read(diskfd, CDATA(idx), rsize); 389 if (rsize < 0) { 390 msg("readBlocks: read fails: %s\n", 391 strerror(errno)); 392 } 393 } 394 395 /* On errors, panic, punt, try to read without 396 * cache and let raw read routine do the rest. 397 */ 398 399 if (rsize <= 0) { 400 rawread(oblkno, obuf, osize); 401 #ifdef DIAGNOSTICS 402 if (cdesc[idx].owner != getpid()) 403 fprintf(stderr, "Owner changed from " 404 "%d to %d, can't happen\n", 405 getpid(), cdesc[idx].owner); 406 cdesc[idx].owner = 0; 407 #endif 408 break; 409 } 410 411 /* On short read, just note the fact and go on */ 412 cdesc[idx].blkend = blockBlkNo + rsize / dev_bsize; 413 414 #ifdef STATS 415 nphysread++; 416 physreadsize += rsize; 417 #endif 418 #ifdef DIAGNOSTICS 419 if (cdesc[idx].owner != getpid()) 420 fprintf(stderr, "Owner changed from " 421 "%d to %d, can't happen\n", 422 getpid(), cdesc[idx].owner); 423 cdesc[idx].owner = 0; 424 #endif 425 /* 426 * We swapped some of data in, let the loop fetch 427 * them from cache 428 */ 429 } 430 } 431 432 if (flock(diskfd, LOCK_UN)) 433 msg("flock(LOCK_UN) failed: %s\n", 434 strerror(errno)); 435 return; 436 } 437 438 void 439 printcachestats(void) 440 { 441 #ifdef STATS 442 fprintf(stderr, "Pid %d: %d reads (%u bytes) " 443 "%d physical reads (%u bytes) %d%% hits, %d%% overhead\n", 444 getpid(), nreads, (u_int) readsize, nphysread, 445 (u_int) physreadsize, (nreads - nphysread) * 100 / nreads, 446 (int) (((physreadsize - readsize) * 100) / readsize)); 447 #endif 448 } 449