xref: /freebsd/contrib/ofed/libibverbs/memory.c (revision d6b92ffa)
1 /*
2  * Copyright (c) 2004, 2005 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2006 Cisco Systems, Inc.  All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <config.h>
35 
36 #include <errno.h>
37 #include <sys/mman.h>
38 #include <unistd.h>
39 #include <stdlib.h>
40 #include <stdint.h>
41 #include <stdio.h>
42 #include <string.h>
43 #include <dirent.h>
44 #include <limits.h>
45 #include <inttypes.h>
46 
47 #include "ibverbs.h"
48 
49 struct ibv_mem_node {
50 	enum {
51 		IBV_RED,
52 		IBV_BLACK
53 	}			color;
54 	struct ibv_mem_node    *parent;
55 	struct ibv_mem_node    *left, *right;
56 	uintptr_t		start, end;
57 	int			refcnt;
58 };
59 
60 static struct ibv_mem_node *mm_root;
61 static pthread_mutex_t mm_mutex = PTHREAD_MUTEX_INITIALIZER;
62 static int page_size;
63 static int huge_page_enabled;
64 static int too_late;
65 
smaps_page_size(FILE * file)66 static unsigned long smaps_page_size(FILE *file)
67 {
68 	int n;
69 	unsigned long size = page_size;
70 	char buf[1024];
71 
72 	while (fgets(buf, sizeof(buf), file) != NULL) {
73 		if (!strstr(buf, "KernelPageSize:"))
74 			continue;
75 
76 		n = sscanf(buf, "%*s %lu", &size);
77 		if (n < 1)
78 			continue;
79 
80 		/* page size is printed in Kb */
81 		size = size * 1024;
82 
83 		break;
84 	}
85 
86 	return size;
87 }
88 
get_page_size(void * base)89 static unsigned long get_page_size(void *base)
90 {
91 	unsigned long ret = page_size;
92 	pid_t pid;
93 	FILE *file;
94 	char buf[1024];
95 
96 	pid = getpid();
97 	snprintf(buf, sizeof(buf), "/proc/%d/smaps", pid);
98 
99 	file = fopen(buf, "r" STREAM_CLOEXEC);
100 	if (!file)
101 		goto out;
102 
103 	while (fgets(buf, sizeof(buf), file) != NULL) {
104 		int n;
105 		uintptr_t range_start, range_end;
106 
107 		n = sscanf(buf, "%" SCNxPTR "-%" SCNxPTR, &range_start, &range_end);
108 
109 		if (n < 2)
110 			continue;
111 
112 		if ((uintptr_t) base >= range_start && (uintptr_t) base < range_end) {
113 			ret = smaps_page_size(file);
114 			break;
115 		}
116 	}
117 
118 	fclose(file);
119 
120 out:
121 	return ret;
122 }
123 
ibv_fork_init(void)124 int ibv_fork_init(void)
125 {
126 	void *tmp, *tmp_aligned;
127 	int ret;
128 	unsigned long size;
129 
130 	if (getenv("RDMAV_HUGEPAGES_SAFE"))
131 		huge_page_enabled = 1;
132 
133 	if (mm_root)
134 		return 0;
135 
136 	if (too_late)
137 		return EINVAL;
138 
139 	page_size = sysconf(_SC_PAGESIZE);
140 	if (page_size < 0)
141 		return errno;
142 
143 	if (posix_memalign(&tmp, page_size, page_size))
144 		return ENOMEM;
145 
146 	if (huge_page_enabled) {
147 		size = get_page_size(tmp);
148 		tmp_aligned = (void *) ((uintptr_t) tmp & ~(size - 1));
149 	} else {
150 		size = page_size;
151 		tmp_aligned = tmp;
152 	}
153 
154 	ret = madvise(tmp_aligned, size, MADV_DONTFORK) ||
155 	      madvise(tmp_aligned, size, MADV_DOFORK);
156 
157 	free(tmp);
158 
159 	if (ret)
160 		return ENOSYS;
161 
162 	mm_root = malloc(sizeof *mm_root);
163 	if (!mm_root)
164 		return ENOMEM;
165 
166 	mm_root->parent = NULL;
167 	mm_root->left   = NULL;
168 	mm_root->right  = NULL;
169 	mm_root->color  = IBV_BLACK;
170 	mm_root->start  = 0;
171 	mm_root->end    = UINTPTR_MAX;
172 	mm_root->refcnt = 0;
173 
174 	return 0;
175 }
176 
__mm_prev(struct ibv_mem_node * node)177 static struct ibv_mem_node *__mm_prev(struct ibv_mem_node *node)
178 {
179 	if (node->left) {
180 		node = node->left;
181 		while (node->right)
182 			node = node->right;
183 	} else {
184 		while (node->parent && node == node->parent->left)
185 			node = node->parent;
186 
187 		node = node->parent;
188 	}
189 
190 	return node;
191 }
192 
__mm_next(struct ibv_mem_node * node)193 static struct ibv_mem_node *__mm_next(struct ibv_mem_node *node)
194 {
195 	if (node->right) {
196 		node = node->right;
197 		while (node->left)
198 			node = node->left;
199 	} else {
200 		while (node->parent && node == node->parent->right)
201 			node = node->parent;
202 
203 		node = node->parent;
204 	}
205 
206 	return node;
207 }
208 
__mm_rotate_right(struct ibv_mem_node * node)209 static void __mm_rotate_right(struct ibv_mem_node *node)
210 {
211 	struct ibv_mem_node *tmp;
212 
213 	tmp = node->left;
214 
215 	node->left = tmp->right;
216 	if (node->left)
217 		node->left->parent = node;
218 
219 	if (node->parent) {
220 		if (node->parent->right == node)
221 			node->parent->right = tmp;
222 		else
223 			node->parent->left = tmp;
224 	} else
225 		mm_root = tmp;
226 
227 	tmp->parent = node->parent;
228 
229 	tmp->right = node;
230 	node->parent = tmp;
231 }
232 
__mm_rotate_left(struct ibv_mem_node * node)233 static void __mm_rotate_left(struct ibv_mem_node *node)
234 {
235 	struct ibv_mem_node *tmp;
236 
237 	tmp = node->right;
238 
239 	node->right = tmp->left;
240 	if (node->right)
241 		node->right->parent = node;
242 
243 	if (node->parent) {
244 		if (node->parent->right == node)
245 			node->parent->right = tmp;
246 		else
247 			node->parent->left = tmp;
248 	} else
249 		mm_root = tmp;
250 
251 	tmp->parent = node->parent;
252 
253 	tmp->left = node;
254 	node->parent = tmp;
255 }
256 
257 #if 0
258 static int verify(struct ibv_mem_node *node)
259 {
260 	int hl, hr;
261 
262 	if (!node)
263 		return 1;
264 
265 	hl = verify(node->left);
266 	hr = verify(node->left);
267 
268 	if (!hl || !hr)
269 		return 0;
270 	if (hl != hr)
271 		return 0;
272 
273 	if (node->color == IBV_RED) {
274 		if (node->left && node->left->color != IBV_BLACK)
275 			return 0;
276 		if (node->right && node->right->color != IBV_BLACK)
277 			return 0;
278 		return hl;
279 	}
280 
281 	return hl + 1;
282 }
283 #endif
284 
__mm_add_rebalance(struct ibv_mem_node * node)285 static void __mm_add_rebalance(struct ibv_mem_node *node)
286 {
287 	struct ibv_mem_node *parent, *gp, *uncle;
288 
289 	while (node->parent && node->parent->color == IBV_RED) {
290 		parent = node->parent;
291 		gp     = node->parent->parent;
292 
293 		if (parent == gp->left) {
294 			uncle = gp->right;
295 
296 			if (uncle && uncle->color == IBV_RED) {
297 				parent->color = IBV_BLACK;
298 				uncle->color  = IBV_BLACK;
299 				gp->color     = IBV_RED;
300 
301 				node = gp;
302 			} else {
303 				if (node == parent->right) {
304 					__mm_rotate_left(parent);
305 					node   = parent;
306 					parent = node->parent;
307 				}
308 
309 				parent->color = IBV_BLACK;
310 				gp->color     = IBV_RED;
311 
312 				__mm_rotate_right(gp);
313 			}
314 		} else {
315 			uncle = gp->left;
316 
317 			if (uncle && uncle->color == IBV_RED) {
318 				parent->color = IBV_BLACK;
319 				uncle->color  = IBV_BLACK;
320 				gp->color     = IBV_RED;
321 
322 				node = gp;
323 			} else {
324 				if (node == parent->left) {
325 					__mm_rotate_right(parent);
326 					node   = parent;
327 					parent = node->parent;
328 				}
329 
330 				parent->color = IBV_BLACK;
331 				gp->color     = IBV_RED;
332 
333 				__mm_rotate_left(gp);
334 			}
335 		}
336 	}
337 
338 	mm_root->color = IBV_BLACK;
339 }
340 
__mm_add(struct ibv_mem_node * new)341 static void __mm_add(struct ibv_mem_node *new)
342 {
343 	struct ibv_mem_node *node, *parent = NULL;
344 
345 	node = mm_root;
346 	while (node) {
347 		parent = node;
348 		if (node->start < new->start)
349 			node = node->right;
350 		else
351 			node = node->left;
352 	}
353 
354 	if (parent->start < new->start)
355 		parent->right = new;
356 	else
357 		parent->left = new;
358 
359 	new->parent = parent;
360 	new->left   = NULL;
361 	new->right  = NULL;
362 
363 	new->color = IBV_RED;
364 	__mm_add_rebalance(new);
365 }
366 
__mm_remove(struct ibv_mem_node * node)367 static void __mm_remove(struct ibv_mem_node *node)
368 {
369 	struct ibv_mem_node *child, *parent, *sib, *tmp;
370 	int nodecol;
371 
372 	if (node->left && node->right) {
373 		tmp = node->left;
374 		while (tmp->right)
375 			tmp = tmp->right;
376 
377 		nodecol    = tmp->color;
378 		child      = tmp->left;
379 		tmp->color = node->color;
380 
381 		if (tmp->parent != node) {
382 			parent        = tmp->parent;
383 			parent->right = tmp->left;
384 			if (tmp->left)
385 				tmp->left->parent = parent;
386 
387 			tmp->left   	   = node->left;
388 			node->left->parent = tmp;
389 		} else
390 			parent = tmp;
391 
392 		tmp->right          = node->right;
393 		node->right->parent = tmp;
394 
395 		tmp->parent = node->parent;
396 		if (node->parent) {
397 			if (node->parent->left == node)
398 				node->parent->left = tmp;
399 			else
400 				node->parent->right = tmp;
401 		} else
402 			mm_root = tmp;
403 	} else {
404 		nodecol = node->color;
405 
406 		child  = node->left ? node->left : node->right;
407 		parent = node->parent;
408 
409 		if (child)
410 			child->parent = parent;
411 		if (parent) {
412 			if (parent->left == node)
413 				parent->left = child;
414 			else
415 				parent->right = child;
416 		} else
417 			mm_root = child;
418 	}
419 
420 	free(node);
421 
422 	if (nodecol == IBV_RED)
423 		return;
424 
425 	while ((!child || child->color == IBV_BLACK) && child != mm_root) {
426 		if (parent->left == child) {
427 			sib = parent->right;
428 
429 			if (sib->color == IBV_RED) {
430 				parent->color = IBV_RED;
431 				sib->color    = IBV_BLACK;
432 				__mm_rotate_left(parent);
433 				sib = parent->right;
434 			}
435 
436 			if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
437 			    (!sib->right || sib->right->color == IBV_BLACK)) {
438 				sib->color = IBV_RED;
439 				child  = parent;
440 				parent = child->parent;
441 			} else {
442 				if (!sib->right || sib->right->color == IBV_BLACK) {
443 					if (sib->left)
444 						sib->left->color = IBV_BLACK;
445 					sib->color = IBV_RED;
446 					__mm_rotate_right(sib);
447 					sib = parent->right;
448 				}
449 
450 				sib->color    = parent->color;
451 				parent->color = IBV_BLACK;
452 				if (sib->right)
453 					sib->right->color = IBV_BLACK;
454 				__mm_rotate_left(parent);
455 				child = mm_root;
456 				break;
457 			}
458 		} else {
459 			sib = parent->left;
460 
461 			if (sib->color == IBV_RED) {
462 				parent->color = IBV_RED;
463 				sib->color    = IBV_BLACK;
464 				__mm_rotate_right(parent);
465 				sib = parent->left;
466 			}
467 
468 			if ((!sib->left  || sib->left->color  == IBV_BLACK) &&
469 			    (!sib->right || sib->right->color == IBV_BLACK)) {
470 				sib->color = IBV_RED;
471 				child  = parent;
472 				parent = child->parent;
473 			} else {
474 				if (!sib->left || sib->left->color == IBV_BLACK) {
475 					if (sib->right)
476 						sib->right->color = IBV_BLACK;
477 					sib->color = IBV_RED;
478 					__mm_rotate_left(sib);
479 					sib = parent->left;
480 				}
481 
482 				sib->color    = parent->color;
483 				parent->color = IBV_BLACK;
484 				if (sib->left)
485 					sib->left->color = IBV_BLACK;
486 				__mm_rotate_right(parent);
487 				child = mm_root;
488 				break;
489 			}
490 		}
491 	}
492 
493 	if (child)
494 		child->color = IBV_BLACK;
495 }
496 
__mm_find_start(uintptr_t start,uintptr_t end)497 static struct ibv_mem_node *__mm_find_start(uintptr_t start, uintptr_t end)
498 {
499 	struct ibv_mem_node *node = mm_root;
500 
501 	while (node) {
502 		if (node->start <= start && node->end >= start)
503 			break;
504 
505 		if (node->start < start)
506 			node = node->right;
507 		else
508 			node = node->left;
509 	}
510 
511 	return node;
512 }
513 
merge_ranges(struct ibv_mem_node * node,struct ibv_mem_node * prev)514 static struct ibv_mem_node *merge_ranges(struct ibv_mem_node *node,
515 					 struct ibv_mem_node *prev)
516 {
517 	prev->end = node->end;
518 	prev->refcnt = node->refcnt;
519 	__mm_remove(node);
520 
521 	return prev;
522 }
523 
split_range(struct ibv_mem_node * node,uintptr_t cut_line)524 static struct ibv_mem_node *split_range(struct ibv_mem_node *node,
525 					uintptr_t cut_line)
526 {
527 	struct ibv_mem_node *new_node = NULL;
528 
529 	new_node = malloc(sizeof *new_node);
530 	if (!new_node)
531 		return NULL;
532 	new_node->start  = cut_line;
533 	new_node->end    = node->end;
534 	new_node->refcnt = node->refcnt;
535 	node->end  = cut_line - 1;
536 	__mm_add(new_node);
537 
538 	return new_node;
539 }
540 
get_start_node(uintptr_t start,uintptr_t end,int inc)541 static struct ibv_mem_node *get_start_node(uintptr_t start, uintptr_t end,
542 					   int inc)
543 {
544 	struct ibv_mem_node *node, *tmp = NULL;
545 
546 	node = __mm_find_start(start, end);
547 	if (node->start < start)
548 		node = split_range(node, start);
549 	else {
550 		tmp = __mm_prev(node);
551 		if (tmp && tmp->refcnt == node->refcnt + inc)
552 			node = merge_ranges(node, tmp);
553 	}
554 	return node;
555 }
556 
557 /*
558  * This function is called if madvise() fails to undo merging/splitting
559  * operations performed on the node.
560  */
undo_node(struct ibv_mem_node * node,uintptr_t start,int inc)561 static struct ibv_mem_node *undo_node(struct ibv_mem_node *node,
562 				      uintptr_t start, int inc)
563 {
564 	struct ibv_mem_node *tmp = NULL;
565 
566 	/*
567 	 * This condition can be true only if we merged this
568 	 * node with the previous one, so we need to split them.
569 	*/
570 	if (start > node->start) {
571 		tmp = split_range(node, start);
572 		if (tmp) {
573 			node->refcnt += inc;
574 			node = tmp;
575 		} else
576 			return NULL;
577 	}
578 
579 	tmp  =  __mm_prev(node);
580 	if (tmp && tmp->refcnt == node->refcnt)
581 		node = merge_ranges(node, tmp);
582 
583 	tmp  =  __mm_next(node);
584 	if (tmp && tmp->refcnt == node->refcnt)
585 		node = merge_ranges(tmp, node);
586 
587 	return node;
588 }
589 
ibv_madvise_range(void * base,size_t size,int advice)590 static int ibv_madvise_range(void *base, size_t size, int advice)
591 {
592 	uintptr_t start, end;
593 	struct ibv_mem_node *node, *tmp;
594 	int inc;
595 	int rolling_back = 0;
596 	int ret = 0;
597 	unsigned long range_page_size;
598 
599 	if (!size)
600 		return 0;
601 
602 	if (huge_page_enabled)
603 		range_page_size = get_page_size(base);
604 	else
605 		range_page_size = page_size;
606 
607 	start = (uintptr_t) base & ~(range_page_size - 1);
608 	end   = ((uintptr_t) (base + size + range_page_size - 1) &
609 		 ~(range_page_size - 1)) - 1;
610 
611 	pthread_mutex_lock(&mm_mutex);
612 again:
613 	inc = advice == MADV_DONTFORK ? 1 : -1;
614 
615 	node = get_start_node(start, end, inc);
616 	if (!node) {
617 		ret = -1;
618 		goto out;
619 	}
620 
621 	while (node && node->start <= end) {
622 		if (node->end > end) {
623 			if (!split_range(node, end + 1)) {
624 				ret = -1;
625 				goto out;
626 			}
627 		}
628 
629 		if ((inc == -1 && node->refcnt == 1) ||
630 		    (inc ==  1 && node->refcnt == 0)) {
631 			/*
632 			 * If this is the first time through the loop,
633 			 * and we merged this node with the previous
634 			 * one, then we only want to do the madvise()
635 			 * on start ... node->end (rather than
636 			 * starting at node->start).
637 			 *
638 			 * Otherwise we end up doing madvise() on
639 			 * bigger region than we're being asked to,
640 			 * and that may lead to a spurious failure.
641 			 */
642 			if (start > node->start)
643 				ret = madvise((void *) start, node->end - start + 1,
644 					      advice);
645 			else
646 				ret = madvise((void *) node->start,
647 					      node->end - node->start + 1,
648 					      advice);
649 			if (ret) {
650 				node = undo_node(node, start, inc);
651 
652 				if (rolling_back || !node)
653 					goto out;
654 
655 				/* madvise failed, roll back previous changes */
656 				rolling_back = 1;
657 				advice = advice == MADV_DONTFORK ?
658 					MADV_DOFORK : MADV_DONTFORK;
659 				tmp = __mm_prev(node);
660 				if (!tmp || start > tmp->end)
661 					goto out;
662 				end = tmp->end;
663 				goto again;
664 			}
665 		}
666 
667 		node->refcnt += inc;
668 		node = __mm_next(node);
669 	}
670 
671 	if (node) {
672 		tmp = __mm_prev(node);
673 		if (tmp && node->refcnt == tmp->refcnt)
674 			node = merge_ranges(node, tmp);
675 	}
676 
677 out:
678 	if (rolling_back)
679 		ret = -1;
680 
681 	pthread_mutex_unlock(&mm_mutex);
682 
683 	return ret;
684 }
685 
ibv_dontfork_range(void * base,size_t size)686 int ibv_dontfork_range(void *base, size_t size)
687 {
688 	if (mm_root)
689 		return ibv_madvise_range(base, size, MADV_DONTFORK);
690 	else {
691 		too_late = 1;
692 		return 0;
693 	}
694 }
695 
ibv_dofork_range(void * base,size_t size)696 int ibv_dofork_range(void *base, size_t size)
697 {
698 	if (mm_root)
699 		return ibv_madvise_range(base, size, MADV_DOFORK);
700 	else {
701 		too_late = 1;
702 		return 0;
703 	}
704 }
705