xref: /linux/drivers/infiniband/sw/rxe/rxe_mr.c (revision dd093fb0)
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/*
 * Copyright (c) 2016 Mellanox Technologies Ltd. All rights reserved.
 * Copyright (c) 2015 System Fabric Works, Inc. All rights reserved.
 */

#include <linux/libnvdimm.h>

#include "rxe.h"
#include "rxe_loc.h"

/* Return a random 8 bit key value that is
 * different from the previous key. Pass last_key as -1
 * if this is the first key for an MR or MW.
 */
u8 rxe_get_next_key(u32 last_key)
{
	u8 key;

	do {
		get_random_bytes(&key, 1);
	} while (key == last_key);

	return key;
}

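/* Check that the range [iova, iova + length - 1] lies within the
 * registered memory region. DMA MRs address all of memory and always
 * pass the check.
 */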
int mr_check_range(struct rxe_mr *mr, u64 iova, size_t length)
{
	switch (mr->ibmr.type) {
	case IB_MR_TYPE_DMA:
		return 0;

	case IB_MR_TYPE_USER:
	case IB_MR_TYPE_MEM_REG:
		if (iova < mr->ibmr.iova || length > mr->ibmr.length ||
		    iova > mr->ibmr.iova + mr->ibmr.length - length)
			return -EFAULT;
		return 0;

	default:
		rxe_dbg_mr(mr, "type (%d) not supported\n", mr->ibmr.type);
		return -EFAULT;
	}
}

#define IB_ACCESS_REMOTE	(IB_ACCESS_REMOTE_READ		\
				| IB_ACCESS_REMOTE_WRITE	\
				| IB_ACCESS_REMOTE_ATOMIC)

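/* Assign the initial lkey (and rkey if remote access was requested)
 * from the MR's pool index and a random 8 bit key, and leave the MR
 * in the invalid state.
 */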
static void rxe_mr_init(int access, struct rxe_mr *mr)
{
	u32 lkey = mr->elem.index << 8 | rxe_get_next_key(-1);
	u32 rkey = (access & IB_ACCESS_REMOTE) ? lkey : 0;

	/* Set ibmr->lkey/rkey and also copy them into the private
	 * lkey/rkey. For user MRs these are always the same. For cases
	 * where the caller 'owns' the key portion they may differ until
	 * the REG_MR WQE is executed.
	 */
	mr->lkey = mr->ibmr.lkey = lkey;
	mr->rkey = mr->ibmr.rkey = rkey;

	mr->state = RXE_MR_STATE_INVALID;
}

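/* Allocate the two level map table used to hold the physical buffer
 * entries for the MR. num_buf is the number of pages to be mapped.
 */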
static int rxe_mr_alloc(struct rxe_mr *mr, int num_buf)
{
	int i;
	int num_map;
	struct rxe_map **map = mr->map;

	num_map = (num_buf + RXE_BUF_PER_MAP - 1) / RXE_BUF_PER_MAP;

	mr->map = kmalloc_array(num_map, sizeof(*map), GFP_KERNEL);
	if (!mr->map)
		goto err1;

	for (i = 0; i < num_map; i++) {
		mr->map[i] = kmalloc(sizeof(**map), GFP_KERNEL);
		if (!mr->map[i])
			goto err2;
	}

	BUILD_BUG_ON(!is_power_of_2(RXE_BUF_PER_MAP));

	mr->map_shift = ilog2(RXE_BUF_PER_MAP);
	mr->map_mask = RXE_BUF_PER_MAP - 1;

	mr->num_buf = num_buf;
	mr->num_map = num_map;
	mr->max_buf = num_map * RXE_BUF_PER_MAP;

	return 0;

err2:
	for (i--; i >= 0; i--)
		kfree(mr->map[i]);

	kfree(mr->map);
	mr->map = NULL;
err1:
	return -ENOMEM;
}

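/* Initialize an MR of type IB_MR_TYPE_DMA which addresses memory
 * directly by iova and does not use a map table.
 */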
void rxe_mr_init_dma(int access, struct rxe_mr *mr)
{
	rxe_mr_init(access, mr);

	mr->access = access;
	mr->state = RXE_MR_STATE_VALID;
	mr->ibmr.type = IB_MR_TYPE_DMA;
}

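/* Return true if the page lies in a persistent memory region */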
static bool is_pmem_page(struct page *pg)
{
	unsigned long paddr = page_to_phys(pg);

	return REGION_INTERSECTS ==
	       region_intersects(paddr, PAGE_SIZE, IORESOURCE_MEM,
				 IORES_DESC_PERSISTENT_MEMORY);
}

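/* Initialize a user MR by pinning the user address range with
 * ib_umem_get() and recording the kernel virtual address of each
 * page in the MR's map table.
 */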
int rxe_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
		     int access, struct rxe_mr *mr)
{
	struct rxe_map		**map;
	struct rxe_phys_buf	*buf = NULL;
	struct ib_umem		*umem;
	struct sg_page_iter	sg_iter;
	int			num_buf;
	void			*vaddr;
	int			err;

	umem = ib_umem_get(&rxe->ib_dev, start, length, access);
	if (IS_ERR(umem)) {
		rxe_dbg_mr(mr, "Unable to pin memory region err = %d\n",
			   (int)PTR_ERR(umem));
		err = PTR_ERR(umem);
		goto err_out;
	}

	num_buf = ib_umem_num_pages(umem);

	rxe_mr_init(access, mr);

	err = rxe_mr_alloc(mr, num_buf);
	if (err) {
		rxe_dbg_mr(mr, "Unable to allocate memory for map\n");
		goto err_release_umem;
	}

	mr->page_shift = PAGE_SHIFT;
	mr->page_mask = PAGE_SIZE - 1;

	num_buf = 0;
	map = mr->map;
	if (length > 0) {
		bool persistent_access = access & IB_ACCESS_FLUSH_PERSISTENT;

		buf = map[0]->buf;
		for_each_sgtable_page(&umem->sgt_append.sgt, &sg_iter, 0) {
			struct page *pg = sg_page_iter_page(&sg_iter);

			if (persistent_access && !is_pmem_page(pg)) {
				rxe_dbg_mr(mr, "Unable to register persistent access to non-pmem device\n");
				err = -EINVAL;
				goto err_release_umem;
			}

			if (num_buf >= RXE_BUF_PER_MAP) {
				map++;
				buf = map[0]->buf;
				num_buf = 0;
			}

			vaddr = page_address(pg);
			if (!vaddr) {
				rxe_dbg_mr(mr, "Unable to get virtual address\n");
				err = -ENOMEM;
				goto err_release_umem;
			}
			buf->addr = (uintptr_t)vaddr;
			buf->size = PAGE_SIZE;
			num_buf++;
			buf++;
		}
	}

	mr->umem = umem;
	mr->access = access;
	mr->offset = ib_umem_offset(umem);
	mr->state = RXE_MR_STATE_VALID;
	mr->ibmr.type = IB_MR_TYPE_USER;
	mr->ibmr.page_size = PAGE_SIZE;

	return 0;

err_release_umem:
	ib_umem_release(umem);
err_out:
	return err;
}

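/* Initialize a fast-reg MR (IB_MR_TYPE_MEM_REG). The map table is
 * allocated here and the MR stays in the free state until it is
 * registered by a REG_MR WQE.
 */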
int rxe_mr_init_fast(int max_pages, struct rxe_mr *mr)
{
	int err;

	/* always allow remote access for FMRs */
	rxe_mr_init(IB_ACCESS_REMOTE, mr);

	err = rxe_mr_alloc(mr, max_pages);
	if (err)
		goto err1;

	mr->max_buf = max_pages;
	mr->state = RXE_MR_STATE_FREE;
	mr->ibmr.type = IB_MR_TYPE_MEM_REG;

	return 0;

err1:
	return err;
}

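/* Convert an iova within the MR into a map index, buffer index and
 * byte offset within that buffer.
 */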
static void lookup_iova(struct rxe_mr *mr, u64 iova, int *m_out, int *n_out,
			size_t *offset_out)
{
	size_t offset = iova - mr->ibmr.iova + mr->offset;
	int			map_index;
	int			buf_index;
	u64			length;

	if (likely(mr->page_shift)) {
		*offset_out = offset & mr->page_mask;
		offset >>= mr->page_shift;
		*n_out = offset & mr->map_mask;
		*m_out = offset >> mr->map_shift;
	} else {
		map_index = 0;
		buf_index = 0;

		length = mr->map[map_index]->buf[buf_index].size;

		while (offset >= length) {
			offset -= length;
			buf_index++;

			if (buf_index == RXE_BUF_PER_MAP) {
				map_index++;
				buf_index = 0;
			}
			length = mr->map[map_index]->buf[buf_index].size;
		}

		*m_out = map_index;
		*n_out = buf_index;
		*offset_out = offset;
	}
}

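/* Return the kernel virtual address for iova if the MR is valid, the
 * range check passes and [iova, iova + length - 1] does not cross a
 * buffer boundary, otherwise return NULL.
 */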
void *iova_to_vaddr(struct rxe_mr *mr, u64 iova, int length)
{
	size_t offset;
	int m, n;
	void *addr;

	if (mr->state != RXE_MR_STATE_VALID) {
		rxe_dbg_mr(mr, "Not in valid state\n");
		addr = NULL;
		goto out;
	}

	if (!mr->map) {
		addr = (void *)(uintptr_t)iova;
		goto out;
	}

	if (mr_check_range(mr, iova, length)) {
		rxe_dbg_mr(mr, "Range violation\n");
		addr = NULL;
		goto out;
	}

	lookup_iova(mr, iova, &m, &n, &offset);

	if (offset + length > mr->map[m]->buf[n].size) {
		rxe_dbg_mr(mr, "Crosses page boundary\n");
		addr = NULL;
		goto out;
	}

	addr = (void *)(uintptr_t)mr->map[m]->buf[n].addr + offset;

out:
	return addr;
}

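/* Write back the CPU cache for [iova, iova + length - 1] so the data
 * reaches persistent memory, working one page at a time.
 */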
int rxe_flush_pmem_iova(struct rxe_mr *mr, u64 iova, int length)
{
	size_t offset;

	if (length == 0)
		return 0;

	if (mr->ibmr.type == IB_MR_TYPE_DMA)
		return -EFAULT;

	offset = (iova - mr->ibmr.iova + mr->offset) & mr->page_mask;
	while (length > 0) {
		u8 *va;
		int bytes;

		bytes = mr->ibmr.page_size - offset;
		if (bytes > length)
			bytes = length;

		/* map only the bytes flushed in this iteration so the
		 * lookup does not fail when the range spans pages
		 */
		va = iova_to_vaddr(mr, iova, bytes);
		if (!va)
			return -EFAULT;

		arch_wb_cache_pmem(va, bytes);

		length -= bytes;
		iova += bytes;
		offset = 0;
	}

	return 0;
}

/* copy data between a buffer (addr, addr+length-1) and an mr object
 * starting at iova; dir gives the direction of the copy.
 */
int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
		enum rxe_mr_copy_dir dir)
{
	int			err;
	int			bytes;
	u8			*va;
	struct rxe_map		**map;
	struct rxe_phys_buf	*buf;
	int			m;
	int			i;
	size_t			offset;

	if (length == 0)
		return 0;

	if (mr->ibmr.type == IB_MR_TYPE_DMA) {
		u8 *src, *dest;

		src = (dir == RXE_TO_MR_OBJ) ? addr : ((void *)(uintptr_t)iova);
		dest = (dir == RXE_TO_MR_OBJ) ? ((void *)(uintptr_t)iova) : addr;

		memcpy(dest, src, length);

		return 0;
	}

	WARN_ON_ONCE(!mr->map);

	err = mr_check_range(mr, iova, length);
	if (err) {
		err = -EFAULT;
		goto err1;
	}

	lookup_iova(mr, iova, &m, &i, &offset);

	map = mr->map + m;
	buf = map[0]->buf + i;

	while (length > 0) {
		u8 *src, *dest;

		va = (u8 *)(uintptr_t)buf->addr + offset;
		src = (dir == RXE_TO_MR_OBJ) ? addr : va;
		dest = (dir == RXE_TO_MR_OBJ) ? va : addr;

		bytes = buf->size - offset;

		if (bytes > length)
			bytes = length;

		memcpy(dest, src, bytes);

		length -= bytes;
		addr += bytes;

		offset = 0;
		buf++;
		i++;

		if (i == RXE_BUF_PER_MAP) {
			i = 0;
			map++;
			buf = map[0]->buf;
		}
	}

	return 0;

err1:
	return err;
}

/* copy data in or out of a wqe's sg list
 * under the control of a dma descriptor
 */
int copy_data(
	struct rxe_pd		*pd,
	int			access,
	struct rxe_dma_info	*dma,
	void			*addr,
	int			length,
	enum rxe_mr_copy_dir	dir)
{
	int			bytes;
	struct rxe_sge		*sge	= &dma->sge[dma->cur_sge];
	int			offset	= dma->sge_offset;
	int			resid	= dma->resid;
	struct rxe_mr		*mr	= NULL;
	u64			iova;
	int			err;

	if (length == 0)
		return 0;

	if (length > resid) {
		err = -EINVAL;
		goto err2;
	}

	if (sge->length && (offset < sge->length)) {
		mr = lookup_mr(pd, access, sge->lkey, RXE_LOOKUP_LOCAL);
		if (!mr) {
			err = -EINVAL;
			goto err1;
		}
	}

	while (length > 0) {
		bytes = length;

		if (offset >= sge->length) {
			if (mr) {
				rxe_put(mr);
				mr = NULL;
			}
			sge++;
			dma->cur_sge++;
			offset = 0;

			if (dma->cur_sge >= dma->num_sge) {
				err = -ENOSPC;
				goto err2;
			}

			if (sge->length) {
				mr = lookup_mr(pd, access, sge->lkey,
					       RXE_LOOKUP_LOCAL);
				if (!mr) {
					err = -EINVAL;
					goto err1;
				}
			} else {
				continue;
			}
		}

		if (bytes > sge->length - offset)
			bytes = sge->length - offset;

		if (bytes > 0) {
			iova = sge->addr + offset;

			err = rxe_mr_copy(mr, iova, addr, bytes, dir);
			if (err)
				goto err2;

			offset	+= bytes;
			resid	-= bytes;
			length	-= bytes;
			addr	+= bytes;
		}
	}

	dma->sge_offset = offset;
	dma->resid	= resid;

	if (mr)
		rxe_put(mr);

	return 0;

err2:
	if (mr)
		rxe_put(mr);
err1:
	return err;
}

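/* Advance the sg list in the dma descriptor by length bytes without
 * copying any data.
 */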
int advance_dma_data(struct rxe_dma_info *dma, unsigned int length)
{
	struct rxe_sge		*sge	= &dma->sge[dma->cur_sge];
	int			offset	= dma->sge_offset;
	int			resid	= dma->resid;

	while (length) {
		unsigned int bytes;

		if (offset >= sge->length) {
			sge++;
			dma->cur_sge++;
			offset = 0;
			if (dma->cur_sge >= dma->num_sge)
				return -ENOSPC;
		}

		bytes = length;

		if (bytes > sge->length - offset)
			bytes = sge->length - offset;

		offset	+= bytes;
		resid	-= bytes;
		length	-= bytes;
	}

	dma->sge_offset = offset;
	dma->resid	= resid;

	return 0;
}

/* (1) find the mr corresponding to lkey/rkey
 *     depending on lookup_type
 * (2) verify that the (qp) pd matches the mr pd
 * (3) verify that the mr can support the requested access
 * (4) verify that mr state is valid
 */
struct rxe_mr *lookup_mr(struct rxe_pd *pd, int access, u32 key,
			 enum rxe_mr_lookup_type type)
{
	struct rxe_mr *mr;
	struct rxe_dev *rxe = to_rdev(pd->ibpd.device);
	int index = key >> 8;

	mr = rxe_pool_get_index(&rxe->mr_pool, index);
	if (!mr)
		return NULL;

	if (unlikely((type == RXE_LOOKUP_LOCAL && mr->lkey != key) ||
		     (type == RXE_LOOKUP_REMOTE && mr->rkey != key) ||
		     mr_pd(mr) != pd || ((access & mr->access) != access) ||
		     mr->state != RXE_MR_STATE_VALID)) {
		rxe_put(mr);
		mr = NULL;
	}

	return mr;
}

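/* Invalidate an MR in response to a local or remote invalidate
 * operation. The key must match the MR's current rkey (or lkey), the
 * MR must be a fast-reg MR and it must not have memory windows bound
 * to it.
 */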
int rxe_invalidate_mr(struct rxe_qp *qp, u32 key)
{
	struct rxe_dev *rxe = to_rdev(qp->ibqp.device);
	struct rxe_mr *mr;
	int ret;

	mr = rxe_pool_get_index(&rxe->mr_pool, key >> 8);
	if (!mr) {
		rxe_dbg_qp(qp, "No MR for key %#x\n", key);
		ret = -EINVAL;
		goto err;
	}

	if (mr->rkey ? (key != mr->rkey) : (key != mr->lkey)) {
		rxe_dbg_mr(mr, "wr key (%#x) doesn't match mr key (%#x)\n",
			   key, (mr->rkey ? mr->rkey : mr->lkey));
		ret = -EINVAL;
		goto err_drop_ref;
	}

	if (atomic_read(&mr->num_mw) > 0) {
		rxe_dbg_mr(mr, "Attempt to invalidate an MR while bound to MWs\n");
		ret = -EINVAL;
		goto err_drop_ref;
	}

	if (unlikely(mr->ibmr.type != IB_MR_TYPE_MEM_REG)) {
		rxe_dbg_mr(mr, "Type (%d) is wrong\n", mr->ibmr.type);
		ret = -EINVAL;
		goto err_drop_ref;
	}

	mr->state = RXE_MR_STATE_FREE;
	ret = 0;

err_drop_ref:
	rxe_put(mr);
err:
	return ret;
}

/* The user can (re)register a fast MR by executing a REG_MR WQE.
 * The user is expected to hold a reference on the ib mr until the
 * WQE completes.
 * Once a fast MR is created this is the only way to change the
 * private keys. It is the responsibility of the user to keep the
 * ib mr keys in sync with the rxe mr keys.
 */
int rxe_reg_fast_mr(struct rxe_qp *qp, struct rxe_send_wqe *wqe)
{
	struct rxe_mr *mr = to_rmr(wqe->wr.wr.reg.mr);
	u32 key = wqe->wr.wr.reg.key;
	u32 access = wqe->wr.wr.reg.access;

	/* user can only register MR in free state */
	if (unlikely(mr->state != RXE_MR_STATE_FREE)) {
		rxe_dbg_mr(mr, "mr->lkey = 0x%x not free\n", mr->lkey);
		return -EINVAL;
	}

	/* user can only register mr with qp in same protection domain */
	if (unlikely(qp->ibqp.pd != mr->ibmr.pd)) {
		rxe_dbg_mr(mr, "qp->pd and mr->pd don't match\n");
		return -EINVAL;
	}

	/* user is only allowed to change key portion of l/rkey */
	if (unlikely((mr->lkey & ~0xff) != (key & ~0xff))) {
		rxe_dbg_mr(mr, "key = 0x%x has wrong index mr->lkey = 0x%x\n",
			   key, mr->lkey);
		return -EINVAL;
	}

	mr->access = access;
	mr->lkey = key;
	mr->rkey = (access & IB_ACCESS_REMOTE) ? key : 0;
	mr->ibmr.iova = wqe->wr.wr.reg.mr->iova;
	mr->state = RXE_MR_STATE_VALID;

	return 0;
}

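/* Deregister an MR. Fails if memory windows are still bound to it,
 * otherwise releases it via rxe_cleanup().
 */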
int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
{
	struct rxe_mr *mr = to_rmr(ibmr);

	/* See IBA 10.6.7.2.6 */
	if (atomic_read(&mr->num_mw) > 0)
		return -EINVAL;

	rxe_cleanup(mr);

	return 0;
}

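/* Pool element cleanup routine for MRs. Drops the reference on the
 * PD, releases the umem if any and frees the map table.
 */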
void rxe_mr_cleanup(struct rxe_pool_elem *elem)
{
	struct rxe_mr *mr = container_of(elem, typeof(*mr), elem);
	int i;

	rxe_put(mr_pd(mr));
	ib_umem_release(mr->umem);

	if (mr->map) {
		for (i = 0; i < mr->num_map; i++)
			kfree(mr->map[i]);

		kfree(mr->map);
	}
}