1 /* $NetBSD: dev-io.c,v 1.10 2010/12/29 23:14:21 haad Exp $ */
2
3 /*
4 * Copyright (C) 2001-2004 Sistina Software, Inc. All rights reserved.
5 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved.
6 *
7 * This file is part of LVM2.
8 *
9 * This copyrighted material is made available to anyone wishing to use,
10 * modify, copy, or redistribute it subject to the terms and conditions
11 * of the GNU Lesser General Public License v.2.1.
12 *
13 * You should have received a copy of the GNU Lesser General Public License
14 * along with this program; if not, write to the Free Software Foundation,
15 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
16 */
17
18 #include "lib.h"
19 #include "lvm-types.h"
20 #include "device.h"
21 #include "metadata.h"
22 #include "lvmcache.h"
23 #include "memlock.h"
24 #include "locking.h"
25
26 #include <limits.h>
27 #include <sys/stat.h>
28 #include <fcntl.h>
29 #include <unistd.h>
30 #include <sys/ioctl.h>
31
32 #ifdef linux
33 # define u64 uint64_t /* Missing without __KERNEL__ */
34 # undef WNOHANG /* Avoid redefinition */
35 # undef WUNTRACED /* Avoid redefinition */
36 # include <linux/fs.h> /* For block ioctl definitions */
37 # define BLKSIZE_SHIFT SECTOR_SHIFT
38 # ifndef BLKGETSIZE64 /* fs.h out-of-date */
39 # define BLKGETSIZE64 _IOR(0x12, 114, size_t)
40 # endif /* BLKGETSIZE64 */
41 #elif __NetBSD__
42 # include <sys/disk.h>
43 # include <sys/disklabel.h>
44 # include <prop/proplib.h>
45 # include <sys/param.h>
46 #else
47 # include <sys/disk.h>
48 # define BLKBSZGET DKIOCGETBLOCKSIZE
49 # define BLKSSZGET DKIOCGETBLOCKSIZE
50 # define BLKGETSIZE64 DKIOCGETBLOCKCOUNT
51 # define BLKFLSBUF DKIOCSYNCHRONIZECACHE
52 # define BLKSIZE_SHIFT 0
53 #endif
54
55 #ifdef O_DIRECT_SUPPORT
56 # ifndef O_DIRECT
57 # error O_DIRECT support configured but O_DIRECT definition not found in headers
58 # endif
59 #endif
60
61 static DM_LIST_INIT(_open_devices);
62
63 /*-----------------------------------------------------------------
64 * The standard io loop that keeps submitting an io until it's
65 * all gone.
66 *---------------------------------------------------------------*/
_io(struct device_area * where,void * buffer,int should_write)67 static int _io(struct device_area *where, void *buffer, int should_write)
68 {
69 int fd = dev_fd(where->dev);
70 ssize_t n = 0;
71 size_t total = 0;
72
73 if (fd < 0) {
74 log_error("Attempt to read an unopened device (%s).",
75 dev_name(where->dev));
76 return 0;
77 }
78
79 /*
80 * Skip all writes in test mode.
81 */
82 if (should_write && test_mode())
83 return 1;
84
85 if (where->size > SSIZE_MAX) {
86 log_error("Read size too large: %" PRIu64, where->size);
87 return 0;
88 }
89
90 if (lseek(fd, (off_t) where->start, SEEK_SET) < 0) {
91 log_error("%s: lseek %" PRIu64 " failed: %s",
92 dev_name(where->dev), (uint64_t) where->start,
93 strerror(errno));
94 return 0;
95 }
96
97 while (total < (size_t) where->size) {
98 do
99 n = should_write ?
100 write(fd, buffer, (size_t) where->size - total) :
101 read(fd, buffer, (size_t) where->size - total);
102 while ((n < 0) && ((errno == EINTR) || (errno == EAGAIN)));
103
104 if (n < 0)
105 log_error("%s: %s failed after %" PRIu64 " of %" PRIu64
106 " at %" PRIu64 ": %s", dev_name(where->dev),
107 should_write ? "write" : "read",
108 (uint64_t) total,
109 (uint64_t) where->size,
110 (uint64_t) where->start, strerror(errno));
111
112 if (n <= 0)
113 break;
114
115 total += n;
116 buffer += n;
117 }
118
119 return (total == (size_t) where->size);
120 }
121
122 /*-----------------------------------------------------------------
123 * LVM2 uses O_DIRECT when performing metadata io, which requires
124 * block size aligned accesses. If any io is not aligned we have
125 * to perform the io via a bounce buffer, obviously this is quite
126 * inefficient.
127 *---------------------------------------------------------------*/
128
129 /*
130 * Get the sector size from an _open_ device.
131 */
/*
 * Get the sector size from an _open_ device.
 * The result is cached in dev->block_size (-1 means "not yet queried");
 * *size is always set from the cache on success.
 * Returns 1 on success, 0 if the size could not be determined.
 */
static int _get_block_size(struct device *dev, unsigned int *size)
{
	const char *name = dev_name(dev);
#ifdef __NetBSD__
	struct disklabel lab;
	prop_dictionary_t disk_dict, geom_dict;
	uint32_t secsize;
#endif

	if ((dev->block_size == -1)) {
#ifdef __NetBSD__
		/*
		 * Prefer the proplib DIOCGDISKINFO ioctl (returns 0 on
		 * success); fall back to the classic disklabel, and
		 * finally to DEV_BSIZE if neither ioctl works.
		 */
		if (prop_dictionary_recv_ioctl(dev_fd(dev), DIOCGDISKINFO, &disk_dict)) {
			if (ioctl(dev_fd(dev), DIOCGDINFO, &lab) < 0) {
				dev->block_size = DEV_BSIZE;
			} else
				dev->block_size = lab.d_secsize;
		} else {
			geom_dict = prop_dictionary_get(disk_dict, "geometry");
			prop_dictionary_get_uint32(geom_dict, "sector-size", &secsize);
			dev->block_size = secsize;
		}
#else
		if (ioctl(dev_fd(dev), BLKBSZGET, &dev->block_size) < 0) {
			log_sys_error("ioctl BLKBSZGET", name);
			return 0;
		}
#endif
		log_debug("%s: block size is %u bytes", name, dev->block_size);
	}

	*size = (unsigned int) dev->block_size;

	return 1;
}
166
167 /*
168 * Widens a region to be an aligned region.
169 */
_widen_region(unsigned int block_size,struct device_area * region,struct device_area * result)170 static void _widen_region(unsigned int block_size, struct device_area *region,
171 struct device_area *result)
172 {
173 uint64_t mask = block_size - 1, delta;
174 memcpy(result, region, sizeof(*result));
175
176 /* adjust the start */
177 delta = result->start & mask;
178 if (delta) {
179 result->start -= delta;
180 result->size += delta;
181 }
182
183 /* adjust the end */
184 delta = (result->start + result->size) & mask;
185 if (delta)
186 result->size += block_size - delta;
187 }
188
_aligned_io(struct device_area * where,void * buffer,int should_write)189 static int _aligned_io(struct device_area *where, void *buffer,
190 int should_write)
191 {
192 void *bounce;
193 unsigned int block_size = 0;
194 uintptr_t mask;
195 struct device_area widened;
196
197 if (!(where->dev->flags & DEV_REGULAR) &&
198 !_get_block_size(where->dev, &block_size))
199 return_0;
200
201 if (!block_size)
202 block_size = lvm_getpagesize();
203
204 _widen_region(block_size, where, &widened);
205
206 /* Do we need to use a bounce buffer? */
207 mask = block_size - 1;
208 if (!memcmp(where, &widened, sizeof(widened)) &&
209 !((uintptr_t) buffer & mask))
210 return _io(where, buffer, should_write);
211
212 /* Allocate a bounce buffer with an extra block */
213 if (!(bounce = alloca((size_t) widened.size + block_size))) {
214 log_error("Bounce buffer alloca failed");
215 return 0;
216 }
217
218 /*
219 * Realign start of bounce buffer (using the extra sector)
220 */
221 if (((uintptr_t) bounce) & mask)
222 bounce = (void *) ((((uintptr_t) bounce) + mask) & ~mask);
223
224 /* channel the io through the bounce buffer */
225 if (!_io(&widened, bounce, 0)) {
226 if (!should_write)
227 return_0;
228 /* FIXME pre-extend the file */
229 memset(bounce, '\n', widened.size);
230 }
231
232 if (should_write) {
233 memcpy(bounce + (where->start - widened.start), buffer,
234 (size_t) where->size);
235
236 /* ... then we write */
237 return _io(&widened, bounce, 1);
238 }
239
240 memcpy(buffer, bounce + (where->start - widened.start),
241 (size_t) where->size);
242
243 return 1;
244 }
245
_dev_get_size_file(const struct device * dev,uint64_t * size)246 static int _dev_get_size_file(const struct device *dev, uint64_t *size)
247 {
248 const char *name = dev_name(dev);
249 struct stat info;
250
251 if (stat(name, &info)) {
252 log_sys_error("stat", name);
253 return 0;
254 }
255
256 *size = info.st_size;
257 *size >>= SECTOR_SHIFT; /* Convert to sectors */
258
259 log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);
260
261 return 1;
262 }
263
/*
 * Size of a block device in sectors, obtained by briefly opening it
 * read-only.  On NetBSD a wedge is preferred over the disklabel
 * partition table; elsewhere BLKGETSIZE64 is used.
 * Returns 1 with *size set on success, 0 on failure.
 */
static int _dev_get_size_dev(const struct device *dev, uint64_t *size)
{
	int fd;
	const char *name = dev_name(dev);
#ifdef __NetBSD__
	struct disklabel lab;
	struct dkwedge_info dkw;
	struct stat stat;	/* NOTE(review): shadows stat(2)/struct stat */
#endif

	if ((fd = open(name, O_RDONLY)) < 0) {
#ifndef __NetBSD__
		/* NetBSD stays quiet here: scans routinely probe
		 * devices that cannot be opened */
		log_sys_error("open", name);
#endif
		return 0;
	}

#ifdef __NetBSD__
	/* Get info about partition/wedge */
	if (ioctl(fd, DIOCGWEDGEINFO, &dkw) == -1) {
		if (ioctl(fd, DIOCGDINFO, &lab) == -1) {
			log_debug("Please implement DIOCGWEDGEINFO or "
				  "DIOCGDINFO for disk device %s", name);
			close(fd);
			return 0;
		} else {
			/* st_rdev identifies which disklabel partition
			 * this node refers to */
			if (fstat(fd, &stat) < 0)
				log_debug("fstat on device %s failure", name);

			*size = lab.d_partitions[DISKPART(stat.st_rdev)].p_size;
		}
	} else
		*size = dkw.dkw_size;
#else
	if (ioctl(fd, BLKGETSIZE64, size) < 0) {
		log_sys_error("ioctl BLKGETSIZE64", name);
		if (close(fd))
			log_sys_error("close", name);
		return 0;
	}

	*size >>= BLKSIZE_SHIFT;	/* Convert to sectors */
#endif
	if (close(fd))
		log_sys_error("close", name);

	log_very_verbose("%s: size is %" PRIu64 " sectors", name, *size);

	return 1;
}
314
/*
 * Query the device's read-ahead setting (in sectors), caching it in
 * dev->read_ahead (-1 means "not yet queried").  Platforms without the
 * BLKRAGET ioctl report 0.
 * Returns 1 on success, 0 on failure.
 */
static int _dev_read_ahead_dev(struct device *dev, uint32_t *read_ahead)
{
#ifdef linux
	long read_ahead_long;

	if (dev->read_ahead != -1) {
		*read_ahead = (uint32_t) dev->read_ahead;
		return 1;
	}

	if (!dev_open(dev))
		return_0;

	if (ioctl(dev->fd, BLKRAGET, &read_ahead_long) < 0) {
		log_sys_error("ioctl BLKRAGET", dev_name(dev));
		if (!dev_close(dev))
			stack;
		return 0;
	}

	if (!dev_close(dev))
		stack;

	*read_ahead = (uint32_t) read_ahead_long;
	dev->read_ahead = read_ahead_long;

	log_very_verbose("%s: read_ahead is %u sectors",
			 dev_name(dev), *read_ahead);
#else
	/* No read-ahead ioctl on this platform: report 0 rather than
	 * leaving *read_ahead uninitialized for the caller to read. */
	*read_ahead = 0;
#endif
	return 1;
}
346
347 /*-----------------------------------------------------------------
348 * Public functions
349 *---------------------------------------------------------------*/
350
dev_get_size(const struct device * dev,uint64_t * size)351 int dev_get_size(const struct device *dev, uint64_t *size)
352 {
353 if (!dev)
354 return 0;
355
356 if ((dev->flags & DEV_REGULAR))
357 return _dev_get_size_file(dev, size);
358 else
359 return _dev_get_size_dev(dev, size);
360 }
361
dev_get_read_ahead(struct device * dev,uint32_t * read_ahead)362 int dev_get_read_ahead(struct device *dev, uint32_t *read_ahead)
363 {
364 if (!dev)
365 return 0;
366
367 if (dev->flags & DEV_REGULAR) {
368 *read_ahead = 0;
369 return 1;
370 }
371
372 return _dev_read_ahead_dev(dev, read_ahead);
373 }
374
375 /* FIXME Unused
376 int dev_get_sectsize(struct device *dev, uint32_t *size)
377 {
378 int fd;
379 int s;
380 const char *name = dev_name(dev);
381
382 if ((fd = open(name, O_RDONLY)) < 0) {
383 log_sys_error("open", name);
384 return 0;
385 }
386
387 if (ioctl(fd, BLKSSZGET, &s) < 0) {
388 log_sys_error("ioctl BLKSSZGET", name);
389 if (close(fd))
390 log_sys_error("close", name);
391 return 0;
392 }
393
394 if (close(fd))
395 log_sys_error("close", name);
396
397 *size = (uint32_t) s;
398
399 log_very_verbose("%s: sector size is %" PRIu32 " bytes", name, *size);
400
401 return 1;
402 }
403 */
404
/*
 * Flush pending writes on dev to stable storage.  Tries, in order:
 * the block-layer flush ioctl (Linux block devices only), fsync() of
 * the fd, and finally a global sync() as a last resort.
 */
void dev_flush(struct device *dev)
{
#ifdef __linux__
	/* NOTE(review): the rest of this file tests "#ifdef linux";
	 * confirm both macros are defined in all build configurations. */
	if (!(dev->flags & DEV_REGULAR) && ioctl(dev->fd, BLKFLSBUF, 0) >= 0)
		return;
#endif

	if (fsync(dev->fd) >= 0)
		return;

	sync();
}
417
/*
 * Open (or re-open) a device, tracking the open in _open_devices.
 *
 * flags  - open(2) flags; O_RDWR and O_EXCL are inspected to decide
 *          whether an existing open can be reused.
 * direct - request O_DIRECT when the build supports it (with a
 *          one-time fallback if the open fails with O_DIRECT).
 * quiet  - demote open-failure logging to debug level.
 *
 * Returns 1 on success (open_count incremented), 0 on failure.
 */
int dev_open_flags(struct device *dev, int flags, int direct, int quiet)
{
	struct stat buf;
	const char *name;
	int need_excl = 0, need_rw = 0;

	if ((flags & O_ACCMODE) == O_RDWR)
		need_rw = 1;

	if ((flags & O_EXCL))
		need_excl = 1;

	if (dev->fd >= 0) {
		/* Reuse the existing open if it is at least as capable
		 * as what the caller asks for */
		if (((dev->flags & DEV_OPENED_RW) || !need_rw) &&
		    ((dev->flags & DEV_OPENED_EXCL) || !need_excl)) {
			dev->open_count++;
			return 1;
		}

		if (dev->open_count && !need_excl) {
			/* FIXME Ensure we never get here */
			log_debug("WARNING: %s already opened read-only",
				  dev_name(dev));
			dev->open_count++;
		}

		/* Existing open is insufficient: close and re-open below */
		dev_close_immediate(dev);
	}

	if (memlock())
		log_error("WARNING: dev_open(%s) called while suspended",
			  dev_name(dev));

	if (dev->flags & DEV_REGULAR)
		name = dev_name(dev);
	else if (!(name = dev_name_confirmed(dev, quiet)))
		return_0;

	/* Guard against the device node changing identity under us */
	if (!(dev->flags & DEV_REGULAR)) {
		if (stat(name, &buf) < 0) {
			log_sys_error("%s: stat failed", name);
			return 0;
		}
		if (buf.st_rdev != dev->dev) {
			log_error("%s: device changed", name);
			return 0;
		}
	}

#ifdef O_DIRECT_SUPPORT
	if (direct) {
		/* Optimistically assume O_DIRECT works until tested */
		if (!(dev->flags & DEV_O_DIRECT_TESTED))
			dev->flags |= DEV_O_DIRECT;

		if ((dev->flags & DEV_O_DIRECT))
			flags |= O_DIRECT;
	}
#endif

#ifdef O_NOATIME
	/* Don't update atime on device inodes */
	if (!(dev->flags & DEV_REGULAR))
		flags |= O_NOATIME;
#endif

	if ((dev->fd = open(name, flags, 0777)) < 0) {
#ifdef O_DIRECT_SUPPORT
		/* One-time retry without O_DIRECT; remember the result */
		if (direct && !(dev->flags & DEV_O_DIRECT_TESTED)) {
			flags &= ~O_DIRECT;
			if ((dev->fd = open(name, flags, 0777)) >= 0) {
				dev->flags &= ~DEV_O_DIRECT;
				log_debug("%s: Not using O_DIRECT", name);
				goto opened;
			}
		}
#endif
		if (quiet)
			log_sys_debug("open", name);
		else
			log_sys_error("open", name);

		return 0;
	}

#ifdef O_DIRECT_SUPPORT
      opened:
	if (direct)
		dev->flags |= DEV_O_DIRECT_TESTED;
#endif
	dev->open_count++;
	dev->flags &= ~DEV_ACCESSED_W;

	if (need_rw)
		dev->flags |= DEV_OPENED_RW;
	else
		dev->flags &= ~DEV_OPENED_RW;

	if (need_excl)
		dev->flags |= DEV_OPENED_EXCL;
	else
		dev->flags &= ~DEV_OPENED_EXCL;

	/* Re-verify identity on the open fd (name may have been
	 * re-pointed at a different device meanwhile) */
	if (!(dev->flags & DEV_REGULAR) &&
	    ((fstat(dev->fd, &buf) < 0) || (buf.st_rdev != dev->dev))) {
		log_error("%s: fstat failed: Has device name changed?", name);
		dev_close_immediate(dev);
		return 0;
	}

#ifndef O_DIRECT_SUPPORT
	if (!(dev->flags & DEV_REGULAR))
		dev_flush(dev);
#endif

	/* Appending to an existing file: remember current end offset */
	if ((flags & O_CREAT) && !(flags & O_TRUNC))
		dev->end = lseek(dev->fd, (off_t) 0, SEEK_END);

	dm_list_add(&_open_devices, &dev->open_list);

	log_debug("Opened %s %s%s%s", dev_name(dev),
		  dev->flags & DEV_OPENED_RW ? "RW" : "RO",
		  dev->flags & DEV_OPENED_EXCL ? " O_EXCL" : "",
		  dev->flags & DEV_O_DIRECT ? " O_DIRECT" : "");

	return 1;
}
544
/*
 * Open a device without logging open failures as errors.
 * Writable open only when the caller holds a VG write lock.
 */
int dev_open_quiet(struct device *dev)
{
	return dev_open_flags(dev, vg_write_lock_held() ? O_RDWR : O_RDONLY,
			      1, 1);
}
553
/*
 * Open a device, logging failures.
 * Writable open only when the caller holds a VG write lock.
 */
int dev_open(struct device *dev)
{
	return dev_open_flags(dev, vg_write_lock_held() ? O_RDWR : O_RDONLY,
			      1, 0);
}
562
/*
 * Probe whether the device can be opened exclusively (i.e. nobody
 * else holds it).  The probe open is closed again immediately.
 * Returns non-zero when the exclusive open succeeded.
 */
int dev_test_excl(struct device *dev)
{
	int open_flags = O_EXCL | (vg_write_lock_held() ? O_RDWR : O_RDONLY);
	int opened;

	opened = dev_open_flags(dev, open_flags, 1, 1);
	if (opened)
		dev_close_immediate(dev);

	return opened;
}
577
/*
 * Unconditionally close dev's fd, invalidate the cached block size and
 * remove it from _open_devices.  If the device struct was allocated on
 * the fly (DEV_ALLOCED), its single alias string, the alias list node
 * and the struct itself are freed here as well.
 */
static void _close(struct device *dev)
{
	if (close(dev->fd))
		log_sys_error("close", dev_name(dev));
	dev->fd = -1;
	dev->block_size = -1;
	dm_list_del(&dev->open_list);

	log_debug("Closed %s", dev_name(dev));

	if (dev->flags & DEV_ALLOCED) {
		/* Free the alias string, the list node, then the struct */
		dm_free((void *) dm_list_item(dev->aliases.n, struct str_list)->
			 str);
		dm_free(dev->aliases.n);
		dm_free(dev);
	}
}
595
/*
 * Drop one reference on dev and close the fd when appropriate.
 * immediate forces the close even while still referenced; otherwise
 * the fd is kept open if the device is known (via lvmcache) to belong
 * to a VG that is currently locked.
 * Returns 1 on success, 0 if the device was not open at all.
 */
static int _dev_close(struct device *dev, int immediate)
{
	struct lvmcache_info *info;

	if (dev->fd < 0) {
		log_error("Attempt to close device '%s' "
			  "which is not open.", dev_name(dev));
		return 0;
	}

#ifndef O_DIRECT_SUPPORT
	/* Without O_DIRECT, written data must be flushed explicitly */
	if (dev->flags & DEV_ACCESSED_W)
		dev_flush(dev);
#endif

	if (dev->open_count > 0)
		dev->open_count--;

	if (immediate && dev->open_count)
		log_debug("%s: Immediate close attempt while still referenced",
			  dev_name(dev));

	/* Close unless device is known to belong to a locked VG */
	if (immediate ||
	    (dev->open_count < 1 &&
	     (!(info = info_from_pvid(dev->pvid, 0)) ||
	      !info->vginfo ||
	      !vgname_is_locked(info->vginfo->vgname))))
		_close(dev);

	return 1;
}
628
/* Release one reference; the fd is closed lazily when safe. */
int dev_close(struct device *dev)
{
	return _dev_close(dev, /* immediate = */ 0);
}
633
/* Release one reference and close the fd right away. */
int dev_close_immediate(struct device *dev)
{
	return _dev_close(dev, /* immediate = */ 1);
}
638
dev_close_all(void)639 void dev_close_all(void)
640 {
641 struct dm_list *doh, *doht;
642 struct device *dev;
643
644 dm_list_iterate_safe(doh, doht, &_open_devices) {
645 dev = dm_list_struct_base(doh, struct device, open_list);
646 if (dev->open_count < 1)
647 _close(dev);
648 }
649 }
650
dev_read(struct device * dev,uint64_t offset,size_t len,void * buffer)651 int dev_read(struct device *dev, uint64_t offset, size_t len, void *buffer)
652 {
653 struct device_area where;
654
655 if (!dev->open_count)
656 return_0;
657
658 where.dev = dev;
659 where.start = offset;
660 where.size = len;
661
662 return _aligned_io(&where, buffer, 0);
663 }
664
665 /*
666 * Read from 'dev' into 'buf', possibly in 2 distinct regions, denoted
667 * by (offset,len) and (offset2,len2). Thus, the total size of
668 * 'buf' should be len+len2.
669 */
int dev_read_circular(struct device *dev, uint64_t offset, size_t len,
		      uint64_t offset2, size_t len2, void *buf)
{
	/* First region: always present */
	if (!dev_read(dev, offset, len, buf)) {
		log_error("Read from %s failed", dev_name(dev));
		return 0;
	}

	/*
	 * Second region: optional wrap-around of the on-disk circular
	 * buffer, appended directly after the first in 'buf'.
	 */
	if (len2 && !dev_read(dev, offset2, len2, buf + len)) {
		log_error("Circular read from %s failed",
			  dev_name(dev));
		return 0;
	}

	return 1;
}
693
694 /* FIXME If O_DIRECT can't extend file, dev_extend first; dev_truncate after.
695 * But fails if concurrent processes writing
696 */
697
698 /* FIXME pre-extend the file */
dev_append(struct device * dev,size_t len,void * buffer)699 int dev_append(struct device *dev, size_t len, void *buffer)
700 {
701 int r;
702
703 if (!dev->open_count)
704 return_0;
705
706 r = dev_write(dev, dev->end, len, buffer);
707 dev->end += (uint64_t) len;
708
709 #ifndef O_DIRECT_SUPPORT
710 dev_flush(dev);
711 #endif
712 return r;
713 }
714
dev_write(struct device * dev,uint64_t offset,size_t len,void * buffer)715 int dev_write(struct device *dev, uint64_t offset, size_t len, void *buffer)
716 {
717 struct device_area where;
718
719 if (!dev->open_count)
720 return_0;
721
722 where.dev = dev;
723 where.start = offset;
724 where.size = len;
725
726 dev->flags |= DEV_ACCESSED_W;
727
728 return _aligned_io(&where, buffer, 1);
729 }
730
dev_set(struct device * dev,uint64_t offset,size_t len,int value)731 int dev_set(struct device *dev, uint64_t offset, size_t len, int value)
732 {
733 size_t s;
734 char buffer[4096] __attribute((aligned(8)));
735
736 if (!dev_open(dev))
737 return_0;
738
739 if ((offset % SECTOR_SIZE) || (len % SECTOR_SIZE))
740 log_debug("Wiping %s at %" PRIu64 " length %" PRIsize_t,
741 dev_name(dev), offset, len);
742 else
743 log_debug("Wiping %s at sector %" PRIu64 " length %" PRIsize_t
744 " sectors", dev_name(dev), offset >> SECTOR_SHIFT,
745 len >> SECTOR_SHIFT);
746
747 memset(buffer, value, sizeof(buffer));
748 while (1) {
749 s = len > sizeof(buffer) ? sizeof(buffer) : len;
750 if (!dev_write(dev, offset, s, buffer))
751 break;
752
753 len -= s;
754 if (!len)
755 break;
756
757 offset += s;
758 }
759
760 dev->flags |= DEV_ACCESSED_W;
761
762 if (!dev_close(dev))
763 stack;
764
765 return (len == 0);
766 }
767