xref: /linux/fs/nfs/blocklayout/dev.c (revision 450b4b3b)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2014-2016 Christoph Hellwig.
4  */
5 #include <linux/sunrpc/svc.h>
6 #include <linux/blkdev.h>
7 #include <linux/nfs4.h>
8 #include <linux/nfs_fs.h>
9 #include <linux/nfs_xdr.h>
10 #include <linux/pr.h>
11 
12 #include "blocklayout.h"
13 
14 #define NFSDBG_FACILITY		NFSDBG_PNFS_LD
15 
16 static void bl_unregister_scsi(struct pnfs_block_dev *dev)
17 {
18 	struct block_device *bdev = file_bdev(dev->bdev_file);
19 	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
20 
21 	if (!test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags))
22 		return;
23 
24 	if (ops->pr_register(bdev, dev->pr_key, 0, false))
25 		pr_err("failed to unregister PR key.\n");
26 }
27 
28 static bool bl_register_scsi(struct pnfs_block_dev *dev)
29 {
30 	struct block_device *bdev = file_bdev(dev->bdev_file);
31 	const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
32 	int status;
33 
34 	if (test_and_set_bit(PNFS_BDEV_REGISTERED, &dev->flags))
35 		return true;
36 
37 	status = ops->pr_register(bdev, 0, dev->pr_key, true);
38 	if (status) {
39 		pr_err("pNFS: failed to register key for block device %s.",
40 		       bdev->bd_disk->disk_name);
41 		return false;
42 	}
43 	return true;
44 }
45 
46 static void bl_unregister_dev(struct pnfs_block_dev *dev)
47 {
48 	u32 i;
49 
50 	if (dev->nr_children) {
51 		for (i = 0; i < dev->nr_children; i++)
52 			bl_unregister_dev(&dev->children[i]);
53 		return;
54 	}
55 
56 	if (dev->type == PNFS_BLOCK_VOLUME_SCSI)
57 		bl_unregister_scsi(dev);
58 }
59 
60 bool bl_register_dev(struct pnfs_block_dev *dev)
61 {
62 	u32 i;
63 
64 	if (dev->nr_children) {
65 		for (i = 0; i < dev->nr_children; i++) {
66 			if (!bl_register_dev(&dev->children[i])) {
67 				while (i > 0)
68 					bl_unregister_dev(&dev->children[--i]);
69 				return false;
70 			}
71 		}
72 		return true;
73 	}
74 
75 	if (dev->type == PNFS_BLOCK_VOLUME_SCSI)
76 		return bl_register_scsi(dev);
77 	return true;
78 }
79 
80 static void
81 bl_free_device(struct pnfs_block_dev *dev)
82 {
83 	bl_unregister_dev(dev);
84 
85 	if (dev->nr_children) {
86 		int i;
87 
88 		for (i = 0; i < dev->nr_children; i++)
89 			bl_free_device(&dev->children[i]);
90 		kfree(dev->children);
91 	} else {
92 		if (dev->bdev_file)
93 			fput(dev->bdev_file);
94 	}
95 }
96 
97 void
98 bl_free_deviceid_node(struct nfs4_deviceid_node *d)
99 {
100 	struct pnfs_block_dev *dev =
101 		container_of(d, struct pnfs_block_dev, node);
102 
103 	bl_free_device(dev);
104 	kfree_rcu(dev, node.rcu);
105 }
106 
107 static int
108 nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
109 {
110 	__be32 *p;
111 	int i;
112 
113 	p = xdr_inline_decode(xdr, 4);
114 	if (!p)
115 		return -EIO;
116 	b->type = be32_to_cpup(p++);
117 
118 	switch (b->type) {
119 	case PNFS_BLOCK_VOLUME_SIMPLE:
120 		p = xdr_inline_decode(xdr, 4);
121 		if (!p)
122 			return -EIO;
123 		b->simple.nr_sigs = be32_to_cpup(p++);
124 		if (!b->simple.nr_sigs || b->simple.nr_sigs > PNFS_BLOCK_MAX_UUIDS) {
125 			dprintk("Bad signature count: %d\n", b->simple.nr_sigs);
126 			return -EIO;
127 		}
128 
129 		b->simple.len = 4 + 4;
130 		for (i = 0; i < b->simple.nr_sigs; i++) {
131 			p = xdr_inline_decode(xdr, 8 + 4);
132 			if (!p)
133 				return -EIO;
134 			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
135 			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
136 			if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
137 				pr_info("signature too long: %d\n",
138 					b->simple.sigs[i].sig_len);
139 				return -EIO;
140 			}
141 
142 			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
143 			if (!p)
144 				return -EIO;
145 			memcpy(&b->simple.sigs[i].sig, p,
146 				b->simple.sigs[i].sig_len);
147 
148 			b->simple.len += 8 + 4 + \
149 				(XDR_QUADLEN(b->simple.sigs[i].sig_len) << 2);
150 		}
151 		break;
152 	case PNFS_BLOCK_VOLUME_SLICE:
153 		p = xdr_inline_decode(xdr, 8 + 8 + 4);
154 		if (!p)
155 			return -EIO;
156 		p = xdr_decode_hyper(p, &b->slice.start);
157 		p = xdr_decode_hyper(p, &b->slice.len);
158 		b->slice.volume = be32_to_cpup(p++);
159 		break;
160 	case PNFS_BLOCK_VOLUME_CONCAT:
161 		p = xdr_inline_decode(xdr, 4);
162 		if (!p)
163 			return -EIO;
164 
165 		b->concat.volumes_count = be32_to_cpup(p++);
166 		if (b->concat.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
167 			dprintk("Too many volumes: %d\n", b->concat.volumes_count);
168 			return -EIO;
169 		}
170 
171 		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
172 		if (!p)
173 			return -EIO;
174 		for (i = 0; i < b->concat.volumes_count; i++)
175 			b->concat.volumes[i] = be32_to_cpup(p++);
176 		break;
177 	case PNFS_BLOCK_VOLUME_STRIPE:
178 		p = xdr_inline_decode(xdr, 8 + 4);
179 		if (!p)
180 			return -EIO;
181 
182 		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
183 		b->stripe.volumes_count = be32_to_cpup(p++);
184 		if (b->stripe.volumes_count > PNFS_BLOCK_MAX_DEVICES) {
185 			dprintk("Too many volumes: %d\n", b->stripe.volumes_count);
186 			return -EIO;
187 		}
188 
189 		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
190 		if (!p)
191 			return -EIO;
192 		for (i = 0; i < b->stripe.volumes_count; i++)
193 			b->stripe.volumes[i] = be32_to_cpup(p++);
194 		break;
195 	case PNFS_BLOCK_VOLUME_SCSI:
196 		p = xdr_inline_decode(xdr, 4 + 4 + 4);
197 		if (!p)
198 			return -EIO;
199 		b->scsi.code_set = be32_to_cpup(p++);
200 		b->scsi.designator_type = be32_to_cpup(p++);
201 		b->scsi.designator_len = be32_to_cpup(p++);
202 		p = xdr_inline_decode(xdr, b->scsi.designator_len);
203 		if (!p)
204 			return -EIO;
205 		if (b->scsi.designator_len > 256)
206 			return -EIO;
207 		memcpy(&b->scsi.designator, p, b->scsi.designator_len);
208 		p = xdr_inline_decode(xdr, 8);
209 		if (!p)
210 			return -EIO;
211 		p = xdr_decode_hyper(p, &b->scsi.pr_key);
212 		break;
213 	default:
214 		dprintk("unknown volume type!\n");
215 		return -EIO;
216 	}
217 
218 	return 0;
219 }
220 
221 static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
222 		struct pnfs_block_dev_map *map)
223 {
224 	map->start = dev->start;
225 	map->len = dev->len;
226 	map->disk_offset = dev->disk_offset;
227 	map->bdev = file_bdev(dev->bdev_file);
228 	return true;
229 }
230 
231 static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
232 		struct pnfs_block_dev_map *map)
233 {
234 	int i;
235 
236 	for (i = 0; i < dev->nr_children; i++) {
237 		struct pnfs_block_dev *child = &dev->children[i];
238 
239 		if (child->start > offset ||
240 		    child->start + child->len <= offset)
241 			continue;
242 
243 		child->map(child, offset - child->start, map);
244 		return true;
245 	}
246 
247 	dprintk("%s: ran off loop!\n", __func__);
248 	return false;
249 }
250 
251 static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
252 		struct pnfs_block_dev_map *map)
253 {
254 	struct pnfs_block_dev *child;
255 	u64 chunk;
256 	u32 chunk_idx;
257 	u64 disk_offset;
258 
259 	chunk = div_u64(offset, dev->chunk_size);
260 	div_u64_rem(chunk, dev->nr_children, &chunk_idx);
261 
262 	if (chunk_idx >= dev->nr_children) {
263 		dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
264 			__func__, chunk_idx, offset, dev->chunk_size);
265 		/* error, should not happen */
266 		return false;
267 	}
268 
269 	/* truncate offset to the beginning of the stripe */
270 	offset = chunk * dev->chunk_size;
271 
272 	/* disk offset of the stripe */
273 	disk_offset = div_u64(offset, dev->nr_children);
274 
275 	child = &dev->children[chunk_idx];
276 	child->map(child, disk_offset, map);
277 
278 	map->start += offset;
279 	map->disk_offset += disk_offset;
280 	map->len = dev->chunk_size;
281 	return true;
282 }
283 
284 static int
285 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
286 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);
287 
288 
289 static int
290 bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
291 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
292 {
293 	struct pnfs_block_volume *v = &volumes[idx];
294 	struct file *bdev_file;
295 	dev_t dev;
296 
297 	dev = bl_resolve_deviceid(server, v, gfp_mask);
298 	if (!dev)
299 		return -EIO;
300 
301 	bdev_file = bdev_file_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
302 				       NULL, NULL);
303 	if (IS_ERR(bdev_file)) {
304 		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
305 			MAJOR(dev), MINOR(dev), PTR_ERR(bdev_file));
306 		return PTR_ERR(bdev_file);
307 	}
308 	d->bdev_file = bdev_file;
309 	d->len = bdev_nr_bytes(file_bdev(bdev_file));
310 	d->map = bl_map_simple;
311 
312 	printk(KERN_INFO "pNFS: using block device %s\n",
313 		file_bdev(bdev_file)->bd_disk->disk_name);
314 	return 0;
315 }
316 
317 static bool
318 bl_validate_designator(struct pnfs_block_volume *v)
319 {
320 	switch (v->scsi.designator_type) {
321 	case PS_DESIGNATOR_EUI64:
322 		if (v->scsi.code_set != PS_CODE_SET_BINARY)
323 			return false;
324 
325 		if (v->scsi.designator_len != 8 &&
326 		    v->scsi.designator_len != 10 &&
327 		    v->scsi.designator_len != 16)
328 			return false;
329 
330 		return true;
331 	case PS_DESIGNATOR_NAA:
332 		if (v->scsi.code_set != PS_CODE_SET_BINARY)
333 			return false;
334 
335 		if (v->scsi.designator_len != 8 &&
336 		    v->scsi.designator_len != 16)
337 			return false;
338 
339 		return true;
340 	case PS_DESIGNATOR_T10:
341 	case PS_DESIGNATOR_NAME:
342 		pr_err("pNFS: unsupported designator "
343 			"(code set %d, type %d, len %d.\n",
344 			v->scsi.code_set,
345 			v->scsi.designator_type,
346 			v->scsi.designator_len);
347 		return false;
348 	default:
349 		pr_err("pNFS: invalid designator "
350 			"(code set %d, type %d, len %d.\n",
351 			v->scsi.code_set,
352 			v->scsi.designator_type,
353 			v->scsi.designator_len);
354 		return false;
355 	}
356 }
357 
358 static struct file *
359 bl_open_path(struct pnfs_block_volume *v, const char *prefix)
360 {
361 	struct file *bdev_file;
362 	const char *devname;
363 
364 	devname = kasprintf(GFP_KERNEL, "/dev/disk/by-id/%s%*phN",
365 			prefix, v->scsi.designator_len, v->scsi.designator);
366 	if (!devname)
367 		return ERR_PTR(-ENOMEM);
368 
369 	bdev_file = bdev_file_open_by_path(devname, BLK_OPEN_READ | BLK_OPEN_WRITE,
370 					NULL, NULL);
371 	if (IS_ERR(bdev_file)) {
372 		dprintk("failed to open device %s (%ld)\n",
373 			devname, PTR_ERR(bdev_file));
374 	}
375 
376 	kfree(devname);
377 	return bdev_file;
378 }
379 
380 static int
381 bl_parse_scsi(struct nfs_server *server, struct pnfs_block_dev *d,
382 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
383 {
384 	struct pnfs_block_volume *v = &volumes[idx];
385 	struct file *bdev_file;
386 	const struct pr_ops *ops;
387 	int error;
388 
389 	if (!bl_validate_designator(v))
390 		return -EINVAL;
391 
392 	/*
393 	 * Try to open the RH/Fedora specific dm-mpath udev path first, as the
394 	 * wwn- links will only point to the first discovered SCSI device there.
395 	 * On other distributions like Debian, the default SCSI by-id path will
396 	 * point to the dm-multipath device if one exists.
397 	 */
398 	bdev_file = bl_open_path(v, "dm-uuid-mpath-0x");
399 	if (IS_ERR(bdev_file))
400 		bdev_file = bl_open_path(v, "wwn-0x");
401 	if (IS_ERR(bdev_file)) {
402 		pr_warn("pNFS: no device found for volume %*phN\n",
403 			v->scsi.designator_len, v->scsi.designator);
404 		return PTR_ERR(bdev_file);
405 	}
406 	d->bdev_file = bdev_file;
407 
408 	d->len = bdev_nr_bytes(file_bdev(d->bdev_file));
409 	d->map = bl_map_simple;
410 	d->pr_key = v->scsi.pr_key;
411 
412 	if (d->len == 0)
413 		return -ENODEV;
414 
415 	pr_info("pNFS: using block device %s (reservation key 0x%llx)\n",
416 		file_bdev(d->bdev_file)->bd_disk->disk_name, d->pr_key);
417 
418 	ops = file_bdev(d->bdev_file)->bd_disk->fops->pr_ops;
419 	if (!ops) {
420 		pr_err("pNFS: block device %s does not support reservations.",
421 				file_bdev(d->bdev_file)->bd_disk->disk_name);
422 		error = -EINVAL;
423 		goto out_blkdev_put;
424 	}
425 
426 	return 0;
427 
428 out_blkdev_put:
429 	fput(d->bdev_file);
430 	return error;
431 }
432 
433 static int
434 bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
435 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
436 {
437 	struct pnfs_block_volume *v = &volumes[idx];
438 	int ret;
439 
440 	ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
441 	if (ret)
442 		return ret;
443 
444 	d->disk_offset = v->slice.start;
445 	d->len = v->slice.len;
446 	return 0;
447 }
448 
449 static int
450 bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
451 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
452 {
453 	struct pnfs_block_volume *v = &volumes[idx];
454 	u64 len = 0;
455 	int ret, i;
456 
457 	d->children = kcalloc(v->concat.volumes_count,
458 			sizeof(struct pnfs_block_dev), gfp_mask);
459 	if (!d->children)
460 		return -ENOMEM;
461 
462 	for (i = 0; i < v->concat.volumes_count; i++) {
463 		ret = bl_parse_deviceid(server, &d->children[i],
464 				volumes, v->concat.volumes[i], gfp_mask);
465 		if (ret)
466 			return ret;
467 
468 		d->nr_children++;
469 		d->children[i].start += len;
470 		len += d->children[i].len;
471 	}
472 
473 	d->len = len;
474 	d->map = bl_map_concat;
475 	return 0;
476 }
477 
478 static int
479 bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
480 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
481 {
482 	struct pnfs_block_volume *v = &volumes[idx];
483 	u64 len = 0;
484 	int ret, i;
485 
486 	d->children = kcalloc(v->stripe.volumes_count,
487 			sizeof(struct pnfs_block_dev), gfp_mask);
488 	if (!d->children)
489 		return -ENOMEM;
490 
491 	for (i = 0; i < v->stripe.volumes_count; i++) {
492 		ret = bl_parse_deviceid(server, &d->children[i],
493 				volumes, v->stripe.volumes[i], gfp_mask);
494 		if (ret)
495 			return ret;
496 
497 		d->nr_children++;
498 		len += d->children[i].len;
499 	}
500 
501 	d->len = len;
502 	d->chunk_size = v->stripe.chunk_size;
503 	d->map = bl_map_stripe;
504 	return 0;
505 }
506 
507 static int
508 bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
509 		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
510 {
511 	d->type = volumes[idx].type;
512 
513 	switch (d->type) {
514 	case PNFS_BLOCK_VOLUME_SIMPLE:
515 		return bl_parse_simple(server, d, volumes, idx, gfp_mask);
516 	case PNFS_BLOCK_VOLUME_SLICE:
517 		return bl_parse_slice(server, d, volumes, idx, gfp_mask);
518 	case PNFS_BLOCK_VOLUME_CONCAT:
519 		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
520 	case PNFS_BLOCK_VOLUME_STRIPE:
521 		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
522 	case PNFS_BLOCK_VOLUME_SCSI:
523 		return bl_parse_scsi(server, d, volumes, idx, gfp_mask);
524 	default:
525 		dprintk("unsupported volume type: %d\n", d->type);
526 		return -EIO;
527 	}
528 }
529 
530 struct nfs4_deviceid_node *
531 bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
532 		gfp_t gfp_mask)
533 {
534 	struct nfs4_deviceid_node *node = NULL;
535 	struct pnfs_block_volume *volumes;
536 	struct pnfs_block_dev *top;
537 	struct xdr_stream xdr;
538 	struct xdr_buf buf;
539 	struct page *scratch;
540 	int nr_volumes, ret, i;
541 	__be32 *p;
542 
543 	scratch = alloc_page(gfp_mask);
544 	if (!scratch)
545 		goto out;
546 
547 	xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
548 	xdr_set_scratch_page(&xdr, scratch);
549 
550 	p = xdr_inline_decode(&xdr, sizeof(__be32));
551 	if (!p)
552 		goto out_free_scratch;
553 	nr_volumes = be32_to_cpup(p++);
554 
555 	volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
556 			  gfp_mask);
557 	if (!volumes)
558 		goto out_free_scratch;
559 
560 	for (i = 0; i < nr_volumes; i++) {
561 		ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
562 		if (ret < 0)
563 			goto out_free_volumes;
564 	}
565 
566 	top = kzalloc(sizeof(*top), gfp_mask);
567 	if (!top)
568 		goto out_free_volumes;
569 
570 	ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
571 
572 	node = &top->node;
573 	nfs4_init_deviceid_node(node, server, &pdev->dev_id);
574 	if (ret)
575 		nfs4_mark_deviceid_unavailable(node);
576 
577 out_free_volumes:
578 	kfree(volumes);
579 out_free_scratch:
580 	__free_page(scratch);
581 out:
582 	return node;
583 }
584