1 /* $NetBSD: dev-io.c,v 1.6 2009/12/02 01:53:25 haad Exp $ */ 2 3 /* 4 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved. 5 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 6 * 7 * This file is part of LVM2. 8 * 9 * This copyrighted material is made available to anyone wishing to use, 10 * modify, copy, or redistribute it subject to the terms and conditions 11 * of the GNU Lesser General Public License v.2.1. 12 * 13 * You should have received a copy of the GNU Lesser General Public License 14 * along with this program; if not, write to the Free Software Foundation, 15 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 */ 17 18 #include "lib.h" 19 #include "lvm-types.h" 20 #include "device.h" 21 #include "metadata.h" 22 #include "lvmcache.h" 23 #include "memlock.h" 24 #include "locking.h" 25 26 #include <limits.h> 27 #include <sys/stat.h> 28 #include <fcntl.h> 29 #include <unistd.h> 30 #include <sys/ioctl.h> 31 32 #ifdef linux 33 # define u64 uint64_t /* Missing without __KERNEL__ */ 34 # undef WNOHANG /* Avoid redefinition */ 35 # undef WUNTRACED /* Avoid redefinition */ 36 # include <linux/fs.h> /* For block ioctl definitions */ 37 # define BLKSIZE_SHIFT SECTOR_SHIFT 38 # ifndef BLKGETSIZE64 /* fs.h out-of-date */ 39 # define BLKGETSIZE64 _IOR(0x12, 114, size_t) 40 # endif /* BLKGETSIZE64 */ 41 #elif __NetBSD__ 42 # include <sys/disk.h> 43 # include <sys/disklabel.h> 44 # include <sys/param.h> 45 #elif __DragonFly__ 46 # include <sys/diskslice.h> 47 # include <sys/param.h> 48 #else 49 # include <sys/disk.h> 50 # define BLKBSZGET DKIOCGETBLOCKSIZE 51 # define BLKSSZGET DKIOCGETBLOCKSIZE 52 # define BLKGETSIZE64 DKIOCGETBLOCKCOUNT 53 # define BLKFLSBUF DKIOCSYNCHRONIZECACHE 54 # define BLKSIZE_SHIFT 0 55 #endif 56 57 #ifdef O_DIRECT_SUPPORT 58 # ifndef O_DIRECT 59 # error O_DIRECT support configured but O_DIRECT definition not found in headers 60 # endif 61 #endif 62 63 static DM_LIST_INIT(_open_devices); 64 65 /*----------------------------------------------------------------- 66 * The standard io loop that keeps submitting an io until it's 67 * all gone. 68 *---------------------------------------------------------------*/ 69 static int _io(struct device_area *where, void *buffer, int should_write) 70 { 71 int fd = dev_fd(where->dev); 72 ssize_t n = 0; 73 size_t total = 0; 74 75 if (fd < 0) { 76 log_error("Attempt to read an unopened device (%s).", 77 dev_name(where->dev)); 78 return 0; 79 } 80 81 /* 82 * Skip all writes in test mode. 83 */ 84 if (should_write && test_mode()) 85 return 1; 86 87 if (where->size > SSIZE_MAX) { 88 log_error("Read size too large: %" PRIu64, where->size); 89 return 0; 90 } 91 92 if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) { 93 log_error("%s: lseek %" PRIu64 " failed: %s", 94 dev_name(where->dev), (uint64_t) where->start, 95 strerror(errno)); 96 return 0; 97 } 98 99 while (total < (size_t) where->size) { 100 do 101 n = should_write ? 102 write(fd, buffer, (size_t) where->size - total) : 103 read(fd, buffer, (size_t) where->size - total); 104 while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN))); 105 106 if (n < 0) 107 log_error("%s: %s failed after %" PRIu64 " of %" PRIu64 108 " at %" PRIu64 ": %s", dev_name(where->dev), 109 should_write ? "write" : "read", 110 (uint64_t) total, 111 (uint64_t) where->size, 112 (uint64_t) where->start, strerror(errno)); 113 114 if (n <= 0) 115 break; 116 117 total += n; 118 buffer += n; 119 } 120 121 return (total == (size_t) where->size); 122 } 123 124 /*----------------------------------------------------------------- 125 * LVM2 uses O_DIRECT when performing metadata io, which requires 126 * block size aligned accesses. If any io is not aligned we have 127 * to perform the io via a bounce buffer, obviously this is quite 128 * inefficient. 129 *---------------------------------------------------------------*/ 130 131 /* 132 * Get the sector size from an _open_ device. 133 */ 134 static int _get_block_size(struct device *dev, unsigned int *size) 135 { 136 const char *name = dev_name(dev); 137 #ifdef __NetBSD__ 138 struct disklabel lab; 139 #elif __DragonFly__ 140 struct partinfo pinfo; 141 #endif 142 143 if ((dev->block_size == -1)) { 144 #ifdef __NetBSD__ 145 if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) { 146 dev->block_size = DEV_BSIZE; 147 } else 148 dev->block_size = lab.d_secsize; 149 #elif __DragonFly__ 150 if (ioctl(dev_fd(dev), DIOCGPART, &pinfo) < 0) { 151 dev->block_size = DEV_BSIZE; 152 } else 153 dev->block_size = pinfo.media_blksize; 154 #else 155 if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) { 156 log_sys_error("ioctl BLKBSZGET", name); 157 return 0; 158 } 159 #endif 160 log_debug("%s: block size is %u bytes", name, dev->block_size); 161 } 162 163 *size = (unsigned int) dev->block_size; 164 165 return 1; 166 } 167 168 /* 169 * Widens a region to be an aligned region. 170 */ 171 static void _widen_region(unsigned int block_size, struct device_area *region, 172 struct device_area *result) 173 { 174 uint64_t mask = block_size - 1, delta; 175 memcpy(result, region, sizeof(*result)); 176 177 /* adjust the start */ 178 delta = result->start & mask; 179 if (delta) { 180 result->start -= delta; 181 result->size += delta; 182 } 183 184 /* adjust the end */ 185 delta = (result->start + result->size) & mask; 186 if (delta) 187 result->size += block_size - delta; 188 } 189 190 static int _aligned_io(struct device_area *where, void *buffer, 191 int should_write) 192 { 193 void *bounce; 194 unsigned int block_size = 0; 195 uintptr_t mask; 196 struct device_area widened; 197 198 if (!(where->dev->flags & DEV_REGULAR) && 199 !_get_block_size(where->dev, &block_size)) 200 return_0; 201 202 if (!block_size) 203 block_size = lvm_getpagesize(); 204 205 _widen_region(block_size, where, &widened); 206 207 /* Do we need to use a bounce buffer? */ 208 mask = block_size - 1; 209 if (!memcmp(where, &widened, sizeof(widened)) && 210 !((uintptr_t) buffer & mask)) 211 return _io(where, buffer, should_write); 212 213 /* Allocate a bounce buffer with an extra block */ 214 if (!(bounce = alloca((size_t) widened.size + block_size))) { 215 log_error("Bounce buffer alloca failed"); 216 return 0; 217 } 218 219 /* 220 * Realign start of bounce buffer (using the extra sector) 221 */ 222 if (((uintptr_t) bounce) & mask) 223 bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask); 224 225 /* channel the io through the bounce buffer */ 226 if (!_io(&widened, bounce, 0)) { 227 if (!should_write) 228 return_0; 229 /* FIXME pre-extend the file */ 230 memset(bounce, '\n', widened.size); 231 } 232 233 if (should_write) { 234 memcpy(bounce + (where->start - widened.start), buffer, 235 (size_t) where->size); 236 237 /* ... then we write */ 238 return _io(&widened, bounce, 1); 239 } 240 241 memcpy(buffer, bounce + (where->start - widened.start), 242 (size_t) where->size); 243 244 return 1; 245 } 246 247 static int _dev_get_size_file(const struct device *dev, uint64_t *size) 248 { 249 const char *name = dev_name(dev); 250 struct stat info; 251 252 if (stat(name, &info)) { 253 log_sys_error("stat", name); 254 return 0; 255 } 256 257 *size = info.st_size; 258 *size >>= SECTOR_SHIFT; /* Convert to sectors */ 259 260 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); 261 262 return 1; 263 } 264 265 static int _dev_get_size_dev(const struct device *dev, uint64_t *size) 266 { 267 int fd; 268 const char *name = dev_name(dev); 269 #ifdef __NetBSD__ 270 struct disklabel lab; 271 struct dkwedge_info dkw; 272 #elif __DragonFly__ 273 struct partinfo pinfo; 274 #endif 275 276 if ((fd = open(name, O_RDONLY)) < 0) { 277 #ifndef __NetBSD__ 278 log_sys_error("open", name); 279 #endif 280 return 0; 281 } 282 283 #ifdef __NetBSD__ 284 if ((*size = lseek (fd, 0, SEEK_END)) < 0) { 285 log_sys_error("lseek SEEK_END", name); 286 close(fd); 287 return 0; 288 } 289 290 if (ioctl(fd, DIOCGDINFO, &lab) < 0) { 291 if (ioctl(fd, DIOCGWEDGEINFO, &dkw) < 0) { 292 log_debug("ioctl DIOCGWEDGEINFO", name); 293 close(fd); 294 return 0; 295 } else 296 if (dkw.dkw_size) 297 *size = dkw.dkw_size; 298 } else 299 if (lab.d_secsize) 300 *size /= lab.d_secsize; 301 #elif __DragonFly__ 302 if ((*size = lseek (fd, 0, SEEK_END)) < 0) { 303 log_sys_error("lseek SEEK_END", name); 304 close(fd); 305 return 0; 306 } 307 308 if (ioctl(fd, DIOCGPART, &pinfo) < 0) { 309 log_debug("ioctl DIOCGPART", name); 310 close(fd); 311 return 0; 312 } else { 313 #if 0 314 /* XXX: we could also get the size this way, instead of lseek */ 315 if (pinfo.media_blocks) 316 *size = pinfo.media_blocks; 317 #endif 318 if (pinfo.media_blksize) 319 *size /= pinfo.media_blksize; 320 } 321 #else 322 if (ioctl(fd, BLKGETSIZE64, size) < 0) { 323 log_sys_error("ioctl BLKGETSIZE64", name); 324 if (close(fd)) 325 log_sys_error("close", name); 326 return 0; 327 } 328 329 *size >>= BLKSIZE_SHIFT; /* Convert to sectors */ 330 #endif 331 if (close(fd)) 332 log_sys_error("close", name); 333 334 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size); 335 336 return 1; 337 } 338 339 static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead) 340 { 341 #ifdef linux 342 long read_ahead_long; 343 344 if (dev->read_ahead != -1) { 345 *read_ahead = (uint32_t) dev->read_ahead; 346 return 1; 347 } 348 349 if (!dev_open(dev)) 350 return_0; 351 352 if (ioctl(dev->fd, BLKRAGET, &read_ahead_long) < 0) { 353 log_sys_error("ioctl BLKRAGET", dev_name(dev)); 354 if (!dev_close(dev)) 355 stack; 356 return 0; 357 } 358 359 if (!dev_close(dev)) 360 stack; 361 362 *read_ahead = (uint32_t) read_ahead_long; 363 dev->read_ahead = read_ahead_long; 364 365 log_very_verbose("%s: read_ahead is %u sectors", 366 dev_name(dev), *read_ahead); 367 #endif 368 return 1; 369 } 370 371 /*----------------------------------------------------------------- 372 * Public functions 373 *---------------------------------------------------------------*/ 374 375 int dev_get_size(const struct device *dev, uint64_t *size) 376 { 377 if (!dev) 378 return 0; 379 380 if ((dev->flags & DEV_REGULAR)) 381 return _dev_get_size_file(dev, size); 382 else 383 return _dev_get_size_dev(dev, size); 384 } 385 386 int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead) 387 { 388 if (!dev) 389 return 0; 390 391 if (dev->flags & DEV_REGULAR) { 392 *read_ahead = 0; 393 return 1; 394 } 395 396 return _dev_read_ahead_dev(dev, read_ahead); 397 } 398 399 /* FIXME Unused 400 int dev_get_sectsize(struct device *dev, uint32_t *size) 401 { 402 int fd; 403 int s; 404 const char *name = dev_name(dev); 405 406 if ((fd = open(name, O_RDONLY)) < 0) { 407 log_sys_error("open", name); 408 return 0; 409 } 410 411 if (ioctl(fd, BLKSSZGET, &s) < 0) { 412 log_sys_error("ioctl BLKSSZGET", name); 413 if (close(fd)) 414 log_sys_error("close", name); 415 return 0; 416 } 417 418 if (close(fd)) 419 log_sys_error("close", name); 420 421 *size = (uint32_t) s; 422 423 log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size); 424 425 return 1; 426 } 427 */ 428 429 void dev_flush(struct device *dev) 430 { 431 #ifdef __linux__ 432 if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0) 433 return; 434 #endif 435 436 if (fsync(dev->fd) >= 0) 437 return; 438 439 sync(); 440 } 441 442 int dev_open_flags(struct device *dev, int flags, int direct, int quiet) 443 { 444 struct stat buf; 445 const char *name; 446 int need_excl = 0, need_rw = 0; 447 448 if ((flags & O_ACCMODE) == O_RDWR) 449 need_rw = 1; 450 451 if ((flags & O_EXCL)) 452 need_excl = 1; 453 454 if (dev->fd >= 0) { 455 if (((dev->flags & DEV_OPENED_RW) || !need_rw) && 456 ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) { 457 dev->open_count++; 458 return 1; 459 } 460 461 if (dev->open_count && !need_excl) { 462 /* FIXME Ensure we never get here */ 463 log_debug("WARNING: %s already opened read-only", 464 dev_name(dev)); 465 dev->open_count++; 466 } 467 468 dev_close_immediate(dev); 469 } 470 471 if (memlock()) 472 log_error("WARNING: dev_open(%s) called while suspended", 473 dev_name(dev)); 474 475 if (dev->flags & DEV_REGULAR) 476 name = dev_name(dev); 477 else if (!(name = dev_name_confirmed(dev, quiet))) 478 return_0; 479 480 if (!(dev->flags & DEV_REGULAR)) { 481 if (stat(name, &buf) < 0) { 482 log_sys_error("%s: stat failed", name); 483 return 0; 484 } 485 if (buf.st_rdev != dev->dev) { 486 log_error("%s: device changed", name); 487 return 0; 488 } 489 } 490 491 #ifdef O_DIRECT_SUPPORT 492 if (direct) { 493 if (!(dev->flags & DEV_O_DIRECT_TESTED)) 494 dev->flags |= DEV_O_DIRECT; 495 496 if ((dev->flags & DEV_O_DIRECT)) 497 flags |= O_DIRECT; 498 } 499 #endif 500 501 #ifdef O_NOATIME 502 /* Don't update atime on device inodes */ 503 if (!(dev->flags & DEV_REGULAR)) 504 flags |= O_NOATIME; 505 #endif 506 507 if ((dev->fd = open(name, flags, 0777)) < 0) { 508 #ifdef O_DIRECT_SUPPORT 509 if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) { 510 flags &= ~O_DIRECT; 511 if ((dev->fd = open(name, flags, 0777)) >= 0) { 512 dev->flags &= ~DEV_O_DIRECT; 513 log_debug("%s: Not using O_DIRECT", name); 514 goto opened; 515 } 516 } 517 #endif 518 if (quiet) 519 log_sys_debug("open", name); 520 else 521 log_sys_error("open", name); 522 523 return 0; 524 } 525 526 #ifdef O_DIRECT_SUPPORT 527 opened: 528 if (direct) 529 dev->flags |= DEV_O_DIRECT_TESTED; 530 #endif 531 dev->open_count++; 532 dev->flags &= ~DEV_ACCESSED_W; 533 534 if (need_rw) 535 dev->flags |= DEV_OPENED_RW; 536 else 537 dev->flags &= ~DEV_OPENED_RW; 538 539 if (need_excl) 540 dev->flags |= DEV_OPENED_EXCL; 541 else 542 dev->flags &= ~DEV_OPENED_EXCL; 543 544 if (!(dev->flags & DEV_REGULAR) && 545 ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) { 546 log_error("%s: fstat failed: Has device name changed?", name); 547 dev_close_immediate(dev); 548 return 0; 549 } 550 551 #ifndef O_DIRECT_SUPPORT 552 if (!(dev->flags & DEV_REGULAR)) 553 dev_flush(dev); 554 #endif 555 556 if ((flags & O_CREAT) && !(flags & O_TRUNC)) 557 dev->end = lseek(dev->fd, (off_t) 0, SEEK_END); 558 559 dm_list_add(&_open_devices, &dev->open_list); 560 561 log_debug("Opened %s %s%s%s", dev_name(dev), 562 dev->flags & DEV_OPENED_RW ? "RW" : "RO", 563 dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "", 564 dev->flags & DEV_O_DIRECT ? " O_DIRECT" : ""); 565 566 return 1; 567 } 568 569 int dev_open_quiet(struct device *dev) 570 { 571 int flags; 572 573 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 574 575 return dev_open_flags(dev, flags, 1, 1); 576 } 577 578 int dev_open(struct device *dev) 579 { 580 int flags; 581 582 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 583 584 return dev_open_flags(dev, flags, 1, 0); 585 } 586 587 int dev_test_excl(struct device *dev) 588 { 589 int flags; 590 int r; 591 592 flags = vg_write_lock_held() ? O_RDWR : O_RDONLY; 593 flags |= O_EXCL; 594 595 r = dev_open_flags(dev, flags, 1, 1); 596 if (r) 597 dev_close_immediate(dev); 598 599 return r; 600 } 601 602 static void _close(struct device *dev) 603 { 604 if (close(dev->fd)) 605 log_sys_error("close", dev_name(dev)); 606 dev->fd = -1; 607 dev->block_size = -1; 608 dm_list_del(&dev->open_list); 609 610 log_debug("Closed %s", dev_name(dev)); 611 612 if (dev->flags & DEV_ALLOCED) { 613 dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)-> 614 str); 615 dm_free(dev->aliases.n); 616 dm_free(dev); 617 } 618 } 619 620 static int _dev_close(struct device *dev, int immediate) 621 { 622 struct lvmcache_info *info; 623 624 if (dev->fd < 0) { 625 log_error("Attempt to close device '%s' " 626 "which is not open.", dev_name(dev)); 627 return 0; 628 } 629 630 #ifndef O_DIRECT_SUPPORT 631 if (dev->flags & DEV_ACCESSED_W) 632 dev_flush(dev); 633 #endif 634 635 if (dev->open_count > 0) 636 dev->open_count--; 637 638 if (immediate && dev->open_count) 639 log_debug("%s: Immediate close attempt while still referenced", 640 dev_name(dev)); 641 642 /* Close unless device is known to belong to a locked VG */ 643 if (immediate || 644 (dev->open_count < 1 && 645 (!(info = info_from_pvid(dev->pvid, 0)) || 646 !info->vginfo || 647 !vgname_is_locked(info->vginfo->vgname)))) 648 _close(dev); 649 650 return 1; 651 } 652 653 int dev_close(struct device *dev) 654 { 655 return _dev_close(dev, 0); 656 } 657 658 int dev_close_immediate(struct device *dev) 659 { 660 return _dev_close(dev, 1); 661 } 662 663 void dev_close_all(void) 664 { 665 struct dm_list *doh, *doht; 666 struct device *dev; 667 668 dm_list_iterate_safe(doh, doht, &_open_devices) { 669 dev = dm_list_struct_base(doh, struct device, open_list); 670 if (dev->open_count < 1) 671 _close(dev); 672 } 673 } 674 675 int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer) 676 { 677 struct device_area where; 678 679 if (!dev->open_count) 680 return_0; 681 682 where.dev = dev; 683 where.start = offset; 684 where.size = len; 685 686 return _aligned_io(&where, buffer, 0); 687 } 688 689 /* 690 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted 691 * by (offset,len) and (offset2,len2). Thus, the total size of 692 * 'buf' should be len+len2. 693 */ 694 int dev_read_circular(struct device *dev, uint64_t offset, size_t len, 695 uint64_t offset2, size_t len2, void *buf) 696 { 697 if (!dev_read(dev, offset, len, buf)) { 698 log_error("Read from %s failed", dev_name(dev)); 699 return 0; 700 } 701 702 /* 703 * The second region is optional, and allows for 704 * a circular buffer on the device. 705 */ 706 if (!len2) 707 return 1; 708 709 if (!dev_read(dev, offset2, len2, buf + len)) { 710 log_error("Circular read from %s failed", 711 dev_name(dev)); 712 return 0; 713 } 714 715 return 1; 716 } 717 718 /* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after. 719 * But fails if concurrent processes writing 720 */ 721 722 /* FIXME pre-extend the file */ 723 int dev_append(struct device *dev, size_t len, void *buffer) 724 { 725 int r; 726 727 if (!dev->open_count) 728 return_0; 729 730 r = dev_write(dev, dev->end, len, buffer); 731 dev->end += (uint64_t) len; 732 733 #ifndef O_DIRECT_SUPPORT 734 dev_flush(dev); 735 #endif 736 return r; 737 } 738 739 int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer) 740 { 741 struct device_area where; 742 743 if (!dev->open_count) 744 return_0; 745 746 where.dev = dev; 747 where.start = offset; 748 where.size = len; 749 750 dev->flags |= DEV_ACCESSED_W; 751 752 return _aligned_io(&where, buffer, 1); 753 } 754 755 int dev_set(struct device *dev, uint64_t offset, size_t len, int value) 756 { 757 size_t s; 758 char buffer[4096] __attribute((aligned(8))); 759 760 if (!dev_open(dev)) 761 return_0; 762 763 if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE)) 764 log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t, 765 dev_name(dev), offset, len); 766 else 767 log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t 768 " sectors", dev_name(dev), offset >> SECTOR_SHIFT, 769 len >> SECTOR_SHIFT); 770 771 memset(buffer, value, sizeof(buffer)); 772 while (1) { 773 s = len > sizeof(buffer) ? sizeof(buffer) : len; 774 if (!dev_write(dev, offset, s, buffer)) 775 break; 776 777 len -= s; 778 if (!len) 779 break; 780 781 offset += s; 782 } 783 784 dev->flags |= DEV_ACCESSED_W; 785 786 if (!dev_close(dev)) 787 stack; 788 789 return (len == 0); 790 } 791