1 /*
2  * Copyright (c) 2016-2017 Cray Inc. All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <gnix_mr_cache.h>
34 #include <gnix_smrn.h>
35 #include <gnix_mr_notifier.h>
36 #include <gnix.h>
37 
38 typedef unsigned long long int cache_entry_state_t;
39 /* These are used for entry state and should be unique */
40 #define GNIX_CES_INUSE       (1ULL << 8)    /* in use */
41 #define GNIX_CES_STALE       (2ULL << 8)    /* cached for possible reuse */
42 #define GNIX_CES_STATE_MASK  (0xFULL << 8)
43 
44 typedef unsigned long long int cache_entry_flag_t;
45 /* One or more of these can be combined with the above */
46 #define GNIX_CE_RETIRED     (1ULL << 61)   /* in use, but not to be reused */
47 #define GNIX_CE_MERGED      (1ULL << 62)   /* merged entry, i.e., not
48 					    * an original request from
49 					    * fi_mr_reg */
50 #define GNIX_CE_UNMAPPED    (1ULL << 63)   /* at least 1 page of the
51 					    * entry has been unmapped
52 					    * by the OS */
53 
54 /**
55  * @brief structure for containing the fields relevant to the memory cache key
56  *
57  * @var   address  base address of the memory region
58  * @var   address  length of the memory region
59  */
60 typedef struct gnix_mr_cache_key {
61 	uint64_t address;
62 	uint64_t length;
63 } gnix_mr_cache_key_t;
64 
65 /**
66  * @brief gnix memory registration cache entry
67  *
68  * @var   state      state of the memory registration cache entry
69  * @var   mr         gnix memory registration descriptor
70  * @var   mem_hndl   gni memory handle for the memory registration
71  * @var   key        gnix memory registration cache key
72  * @var   domain     gnix domain associated with the memory registration
73  * @var   nic        gnix nic associated with the memory registration
74  * @var   ref_cnt    reference counting for the cache
75  * @var   lru_entry  lru list entry
76  * @var   siblings   list of sibling entries
77  * @var   children   list of subsumed child entries
78  */
79 typedef struct gnix_mr_cache_entry {
80 	struct gnix_smrn_context context;
81 	cache_entry_state_t state;
82 	gnix_mr_cache_key_t key;
83 	ofi_atomic32_t ref_cnt;
84 	struct dlist_entry lru_entry;
85 	struct dlist_entry siblings;
86 	struct dlist_entry children;
87 	uint64_t mr[0];
88 } gnix_mr_cache_entry_t;
89 
90 /* forward declarations */
91 int _gnix_mr_cache_register(
92 		gnix_mr_cache_t             *cache,
93 		uint64_t                    address,
94 		uint64_t                    length,
95 		struct _gnix_fi_reg_context *fi_reg_context,
96 		void                        **handle);
97 
98 int _gnix_mr_cache_deregister(
99 		gnix_mr_cache_t *cache,
100 		void            *handle);
101 
102 static inline int __mr_cache_entry_put(
103 		gnix_mr_cache_t       *cache,
104 		gnix_mr_cache_entry_t *entry);
105 
106 static inline int __mr_cache_entry_get(
107 		gnix_mr_cache_t       *cache,
108 		gnix_mr_cache_entry_t *entry);
109 
110 static inline int __mr_cache_entry_destroy(gnix_mr_cache_t *cache,
111 					   gnix_mr_cache_entry_t *entry);
112 
113 static int __mr_cache_create_registration(
114 		gnix_mr_cache_t             *cache,
115 		uint64_t                    address,
116 		uint64_t                    length,
117 		gnix_mr_cache_entry_t       **entry,
118 		gnix_mr_cache_key_t         *key,
119 		struct _gnix_fi_reg_context *fi_reg_context);
120 
121 
122 /* global declarations */
123 
124 /* default attributes for new caches */
125 gnix_mr_cache_attr_t __default_mr_cache_attr = {
126 		.soft_reg_limit      = 4096,
127 		.hard_reg_limit      = -1,
128 		.hard_stale_limit    = 128,
129 #if HAVE_KDREG
130 		.lazy_deregistration = 1,
131 #else
132 		.lazy_deregistration = 0,
133 #endif
134 };
135 
136 /* Functions for using and manipulating cache entry state */
__entry_get_state(gnix_mr_cache_entry_t * e)137 static inline cache_entry_state_t __entry_get_state(gnix_mr_cache_entry_t *e)
138 {
139 	return e->state & GNIX_CES_STATE_MASK;
140 }
141 
__entry_set_state(gnix_mr_cache_entry_t * e,cache_entry_state_t s)142 static inline void __entry_set_state(gnix_mr_cache_entry_t *e,
143 				     cache_entry_state_t s)
144 {
145 	e->state = (e->state & ~GNIX_CES_STATE_MASK) |
146 		(s & GNIX_CES_STATE_MASK);
147 }
148 
__entry_reset_state(gnix_mr_cache_entry_t * e)149 static inline void __entry_reset_state(gnix_mr_cache_entry_t *e)
150 {
151 	e->state = 0ULL;
152 }
153 
__entry_is_flag(gnix_mr_cache_entry_t * e,cache_entry_flag_t f)154 static inline bool __entry_is_flag(gnix_mr_cache_entry_t *e,
155 				   cache_entry_flag_t f)
156 {
157 	return (e->state & f) != 0;
158 }
159 
__entry_set_flag(gnix_mr_cache_entry_t * e,cache_entry_flag_t f)160 static inline void __entry_set_flag(gnix_mr_cache_entry_t *e,
161 				    cache_entry_flag_t f)
162 {
163 	e->state = e->state | f;
164 }
165 
__entry_is_retired(gnix_mr_cache_entry_t * e)166 static inline bool __entry_is_retired(gnix_mr_cache_entry_t *e)
167 {
168 	return __entry_is_flag(e, GNIX_CE_RETIRED);
169 }
170 
__entry_is_merged(gnix_mr_cache_entry_t * e)171 static inline bool __entry_is_merged(gnix_mr_cache_entry_t *e)
172 {
173 	return __entry_is_flag(e, GNIX_CE_MERGED);
174 }
175 
__entry_is_unmapped(gnix_mr_cache_entry_t * e)176 static inline bool __entry_is_unmapped(gnix_mr_cache_entry_t *e)
177 {
178 	return __entry_is_flag(e, GNIX_CE_UNMAPPED);
179 }
180 
__entry_set_retired(gnix_mr_cache_entry_t * e)181 static inline void __entry_set_retired(gnix_mr_cache_entry_t *e)
182 {
183 	__entry_set_flag(e, GNIX_CE_RETIRED);
184 }
185 
__entry_set_merged(gnix_mr_cache_entry_t * e)186 static inline void __entry_set_merged(gnix_mr_cache_entry_t *e)
187 {
188 	__entry_set_flag(e, GNIX_CE_MERGED);
189 }
190 
__entry_set_unmapped(gnix_mr_cache_entry_t * e)191 static inline void __entry_set_unmapped(gnix_mr_cache_entry_t *e)
192 {
193 	__entry_set_flag(e, GNIX_CE_UNMAPPED);
194 }
195 
196 /**
197  * Key comparison function for finding overlapping gnix memory
198  * registration cache entries
199  *
200  * @param[in] x key to be inserted or found
201  * @param[in] y key to be compared against
202  *
203  * @return    -1 if it should be positioned at the left, 0 if it overlaps,
204  *             1 otherwise
205  */
__find_overlapping_addr(void * x,void * y)206 static int __find_overlapping_addr(
207 		void *x,
208 		void *y)
209 {
210 	gnix_mr_cache_key_t *to_find  = (gnix_mr_cache_key_t *) x;
211 	gnix_mr_cache_key_t *to_compare = (gnix_mr_cache_key_t *) y;
212 	uint64_t to_find_end = to_find->address + to_find->length - 1;
213 	uint64_t to_compare_end = to_compare->address + to_compare->length - 1;
214 
215 	/* format: (x_addr,  x_len) - (y_addr,  y_len) truth_value
216 	 *
217 	 * case 1: (0x1000, 0x1000) - (0x1400, 0x0800) true
218 	 * case 2: (0x1000, 0x1000) - (0x0C00, 0x0800) true
219 	 * case 3: (0x1000, 0x1000) - (0x1C00, 0x0800) true
220 	 * case 4: (0x1000, 0x1000) - (0x0C00, 0x2000) true
221 	 * case 5: (0x1000, 0x1000) - (0x0400, 0x0400) false
222 	 * case 6: (0x1000, 0x1000) - (0x2400, 0x0400) false
223 	 */
224 	if (!(to_find_end < to_compare->address ||
225 			to_compare_end < to_find->address))
226 		return 0;
227 
228 	/* left */
229 	if (to_find->address < to_compare->address)
230 		return -1;
231 
232 	return 1;
233 }
234 
235 /**
236  * Key comparison function for gnix memory registration caches
237  *
238  * @param[in] x key to be inserted or found
239  * @param[in] y key to be compared against
240  *
241  * @return    -1 if it should be positioned at the left, 0 if the same,
242  *             1 otherwise
243  */
__mr_cache_key_comp(void * x,void * y)244 static inline int __mr_cache_key_comp(
245 		void *x,
246 		void *y)
247 {
248 	gnix_mr_cache_key_t *to_insert  = (gnix_mr_cache_key_t *) x;
249 	gnix_mr_cache_key_t *to_compare = (gnix_mr_cache_key_t *) y;
250 
251 	if (to_compare->address == to_insert->address)
252 		return 0;
253 
254 	/* to the left */
255 	if (to_insert->address < to_compare->address)
256 		return -1;
257 
258 	/* to the right */
259 	return 1;
260 }
261 
262 /**
263  * Helper function for matching the exact key entry
264  *
265  * @param entry     memory registration cache key
266  * @param to_match  memory registration cache key
267  * @return 1 if match, otherwise 0
268  */
__match_exact_key(gnix_mr_cache_key_t * entry,gnix_mr_cache_key_t * to_match)269 static inline int __match_exact_key(
270 		gnix_mr_cache_key_t *entry,
271 		gnix_mr_cache_key_t *to_match)
272 {
273 	return entry->address == to_match->address &&
274 			entry->length == to_match->length;
275 }
276 
277 /**
278  * dlist search function for matching the exact memory registration key
279  *
280  * @param entry memory registration cache entry
281  * @param match memory registration cache key
282  * @return 1 if match, otherwise 0
283  */
__mr_exact_key(struct dlist_entry * entry,const void * match)284 static inline int __mr_exact_key(struct dlist_entry *entry,
285 		const void *match)
286 {
287 	gnix_mr_cache_entry_t *x = container_of(entry,
288 							gnix_mr_cache_entry_t,
289 							siblings);
290 
291 	gnix_mr_cache_key_t *y = (gnix_mr_cache_key_t *) match;
292 
293 	return __match_exact_key(&x->key, y);
294 }
295 
296 
297 /**
298  * Helper function to determine if one key subsumes another
299  *
300  * @param x  gnix_mr_cache_key
301  * @param y  gnix_mr_cache_key
302  * @return 1 if x subsumes y, 0 otherwise
303  */
__can_subsume(gnix_mr_cache_key_t * x,gnix_mr_cache_key_t * y)304 static inline int __can_subsume(
305 		gnix_mr_cache_key_t *x,
306 		gnix_mr_cache_key_t *y)
307 {
308 	return (x->address <= y->address) &&
309 			((x->address + x->length) >=
310 					(y->address + y->length));
311 }
312 
__attach_retired_entries_to_registration(gnix_mr_cache_t * cache,struct dlist_entry * retired_entries,gnix_mr_cache_entry_t * parent)313 static inline void __attach_retired_entries_to_registration(
314 		gnix_mr_cache_t *cache,
315 		struct dlist_entry *retired_entries,
316 		gnix_mr_cache_entry_t *parent)
317 {
318 	gnix_mr_cache_entry_t *entry, *tmp;
319 
320 	dlist_for_each_safe(retired_entries, entry, tmp, siblings) {
321 		dlist_remove(&entry->siblings);
322 		dlist_insert_tail(&entry->siblings,
323 				&parent->children);
324 		if (!dlist_empty(&entry->children)) {
325 			/* move the entry's children to the sibling tree
326 			 * and decrement the reference count */
327 			dlist_splice_tail(&parent->children,
328 					&entry->children);
329 			__mr_cache_entry_put(cache, entry);
330 		}
331 	}
332 
333 	if (!dlist_empty(retired_entries)) {
334 		GNIX_FATAL(FI_LOG_MR, "retired_entries not empty\n");
335 	}
336 
337 	__mr_cache_entry_get(cache, parent);
338 }
339 
__remove_sibling_entries_from_tree(gnix_mr_cache_t * cache,struct dlist_entry * list,RbtHandle tree)340 static inline void __remove_sibling_entries_from_tree(
341 		gnix_mr_cache_t *cache,
342 		struct dlist_entry *list,
343 		RbtHandle tree)
344 {
345 	RbtStatus rc;
346 	RbtIterator iter;
347 	gnix_mr_cache_entry_t *entry;
348 
349 	dlist_for_each(list, entry, siblings)
350 	{
351 		GNIX_DEBUG(FI_LOG_MR,
352 			   "removing key from tree, key=%llx:%llx\n",
353 			   entry->key.address, entry->key.length);
354 		iter = rbtFind(tree, &entry->key);
355 		if (OFI_UNLIKELY(!iter)) {
356 			GNIX_FATAL(FI_LOG_MR, "key not found\n");
357 		}
358 
359 		rc = rbtErase(tree, iter);
360 		if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
361 			GNIX_FATAL(FI_LOG_MR,
362 				   "could not remove entry from tree\n");
363 		}
364 	}
365 }
366 
367 /**
368  * Pushes an entry into the LRU cache. No limits are maintained here as
369  *   the hard_stale_limit attr value will directly limit the lru size
370  *
371  * @param[in] cache  a memory registration cache object
372  * @param[in] entry  a memory registration cache entry
373  *
374  * @return           FI_SUCCESS, always
375  */
__mr_cache_lru_enqueue(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)376 static inline int __mr_cache_lru_enqueue(
377 		gnix_mr_cache_t       *cache,
378 		gnix_mr_cache_entry_t *entry)
379 {
380 	dlist_insert_tail(&entry->lru_entry, &cache->lru_head);
381 
382 	return FI_SUCCESS;
383 }
384 
385 /**
386  * Pops an registration cache entry from the lru cache.
387  *
388  * @param[in] cache  a memory registration cache
389  * @param[in] entry  a memory registration cache entry
390  *
391  * @return           FI_SUCCESS, on success
392  * @return           -FI_ENOENT, on empty LRU
393  */
__mr_cache_lru_dequeue(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t ** entry)394 static inline int __mr_cache_lru_dequeue(
395 		gnix_mr_cache_t       *cache,
396 		gnix_mr_cache_entry_t **entry)
397 {
398 	gnix_mr_cache_entry_t *ret;
399 
400 	ret = dlist_first_entry(&cache->lru_head,
401 			gnix_mr_cache_entry_t, lru_entry);
402 	if (OFI_UNLIKELY(!ret)) { /* we check list_empty before calling */
403 		*entry = NULL;
404 		return -FI_ENOENT;
405 	}
406 
407 	/* remove entry from the list */
408 	*entry = ret;
409 	dlist_remove(&ret->lru_entry);
410 
411 	return FI_SUCCESS;
412 }
413 
414 /**
415  * Removes a particular registration cache entry from the lru cache.
416  *
417  * @param[in] cache  a memory registration cache
418  * @param[in] entry  a memory registration cache entry
419  *
420  * @return           FI_SUCCESS, on success
421  * @return           -FI_ENOENT, on empty LRU
422  */
__mr_cache_lru_remove(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)423 static inline int __mr_cache_lru_remove(
424 		gnix_mr_cache_t       *cache,
425 		gnix_mr_cache_entry_t *entry)
426 {
427 	/* Could do some error checking to see if in cache */
428 
429 	dlist_remove(&entry->lru_entry);
430 
431 	return FI_SUCCESS;
432 }
433 
434 /**
435  * Remove entries that have been unmapped as indicated by the notifer
436  *
437  * @param[in] cache  a memory registration cache
438  *
439  * @return           nothing
440  */
441 static bool __notifier_warned = false;
442 static void
__clear_notifier_events(gnix_mr_cache_t * cache)443 __clear_notifier_events(gnix_mr_cache_t *cache)
444 {
445 	int ret;
446 	gnix_mr_cache_entry_t *entry;
447 	struct gnix_smrn_context *context;
448 	RbtIterator iter;
449 
450 	if (!cache->attr.smrn) {
451 		return;
452 	}
453 
454 	if (!cache->attr.lazy_deregistration) {
455 		return;
456 	}
457 
458 	ret = _gnix_smrn_get_event(cache->attr.smrn,
459 				&cache->rq, &context);
460 	while (ret == FI_SUCCESS) {
461 		entry = container_of(context,
462 					struct gnix_mr_cache_entry, context);
463 		switch (__entry_get_state(entry)) {
464 		case GNIX_CES_INUSE:
465 			/* First, warn that this might be a
466 			 * problem.*/
467 			if ((__notifier_warned == false) &&
468 			    !__entry_is_merged(entry)) {
469 				GNIX_WARN(FI_LOG_MR,
470 					  "Registered memory region"
471 					  " includes unmapped pages."
472 					  "  Have you freed memory"
473 					  " without closing the memory"
474 					  " region?\n");
475 				__notifier_warned = true;
476 			}
477 
478 			GNIX_DEBUG(FI_LOG_MR,
479 				   "Marking unmapped entry (%p)"
480 				   " as retired %llx:%llx\n", entry,
481 				   entry->key.address,
482 				   entry->key.length);
483 
484 			__entry_set_unmapped(entry);
485 
486 			if (__entry_is_retired(entry)) {
487 				/* Nothing to do */
488 				break;
489 			}
490 
491 			/* Retire this entry (remove from
492 			 * inuse tree) */
493 
494 			__entry_set_retired(entry);
495 			iter = rbtFind(cache->inuse.rb_tree,
496 				       &entry->key);
497 			if (OFI_LIKELY(iter != NULL)) {
498 				ret = rbtErase(cache->inuse.rb_tree,
499 					       iter);
500 				if (ret != RBT_STATUS_OK) {
501 					GNIX_FATAL(FI_LOG_MR,
502 						   "Unmapped entry"
503 						   " could not be"
504 						   " removed from"
505 						   " in usetree.\n");
506 				}
507 			} else {
508 				/*  The only way we should get
509 				 *  here is if we're in the
510 				 *  middle of retiring this
511 				 *  entry.  Not sure if this
512 				 *  is worth a separate
513 				 *  warning from the one
514 				 *  above. */
515 			}
516 
517 			break;
518 		case GNIX_CES_STALE:
519 			__entry_set_unmapped(entry);
520 			iter = rbtFind(cache->stale.rb_tree,
521 				       &entry->key);
522 			if (!iter) {
523 				break;
524 			}
525 
526 			ret = rbtErase(cache->stale.rb_tree, iter);
527 			if (ret != RBT_STATUS_OK) {
528 				GNIX_FATAL(FI_LOG_MR,
529 					   "Unmapped entry could"
530 					   " not be removed "
531 					   " from stale tree.\n");
532 			}
533 
534 			GNIX_DEBUG(FI_LOG_MR, "Removed unmapped entry"
535 				   " (%p) from stale tree %llx:%llx\n",
536 				   entry, entry->key.address,
537 				   entry->key.length);
538 
539 			if (__mr_cache_lru_remove(cache, entry) == FI_SUCCESS) {
540 				GNIX_DEBUG(FI_LOG_MR, "Removed"
541 					   " unmapped entry (%p)"
542 					   " from lru list %llx:%llx\n",
543 					   entry, entry->key.address,
544 					   entry->key.length);
545 
546 				ofi_atomic_dec32(&cache->stale.elements);
547 
548 			} else {
549 				GNIX_WARN(FI_LOG_MR, "Failed to remove"
550 					  " unmapped entry"
551 					  " from lru list (%p) %p\n",
552 					  entry, iter);
553 			}
554 
555 			__mr_cache_entry_destroy(cache, entry);
556 
557 			break;
558 		default:
559 			GNIX_FATAL(FI_LOG_MR,
560 				   "Unmapped entry (%p) in incorrect"
561 				   " state: %d\n",
562 				   entry, entry->state);
563 		}
564 
565 		ret = _gnix_smrn_get_event(cache->attr.smrn,
566 					&cache->rq, &context);
567 	}
568 	if (ret != -FI_EAGAIN) {
569 		/* Should we do something else here? */
570 		GNIX_WARN(FI_LOG_MR,
571 			  "_gnix_smrn_get_event returned error: %s\n",
572 			  fi_strerror(-ret));
573 	}
574 
575 	return;
576 }
577 
578 /**
579  * Start monitoring a memory region
580  *
581  * @param[in] cache  a memory registration cache
582  * @param[in] entry  a memory registration entry
583  *
584  * @return           return code from _gnix_smrn_monitor
585  */
586 static int
__notifier_monitor(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)587 __notifier_monitor(gnix_mr_cache_t *cache,
588 		   gnix_mr_cache_entry_t *entry)
589 {
590 
591 	if (!cache->attr.lazy_deregistration) {
592 		return FI_SUCCESS;
593 	}
594 
595 	if (cache->attr.smrn == NULL) {
596 		return FI_SUCCESS;
597 	}
598 
599 	GNIX_DEBUG(FI_LOG_MR, "monitoring entry=%p %llx:%llx\n", entry,
600 		   entry->key.address, entry->key.length);
601 
602 	return  _gnix_smrn_monitor(cache->attr.smrn,
603 						  &cache->rq,
604 					      (void *) entry->key.address,
605 					      entry->key.length,
606 					      (uint64_t) &entry->context,
607 						  &entry->context);
608 }
609 
610 /**
611  * Stop monitoring a memory region
612  *
613  * @param[in] cache  a memory registration cache
614  * @param[in] entry  a memory registration entry
615  *
616  * @return           nothing
617  */
618 static void
__notifier_unmonitor(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)619 __notifier_unmonitor(gnix_mr_cache_t *cache,
620 		     gnix_mr_cache_entry_t *entry)
621 {
622 	int rc;
623 
624 	if (!cache->attr.lazy_deregistration) {
625 		return;
626 	}
627 
628 	if (cache->attr.smrn == NULL) {
629 		return;
630 	}
631 
632 	__clear_notifier_events(cache);
633 
634 	if (!__entry_is_unmapped(entry)) {
635 		GNIX_DEBUG(FI_LOG_MR, "unmonitoring entry=%p (state=%d)\n",
636 			   entry, entry->state);
637 		rc = _gnix_smrn_unmonitor(cache->attr.smrn,
638 						  (uint64_t) &entry->context,
639 					      &entry->context);
640 		if (rc != FI_SUCCESS) {
641 			/* This probably is okay (ESRCH), because the
642 			 * memory could have been unmapped in the
643 			 * interim, so clear the notifier events
644 			 * again */
645 			GNIX_DEBUG(FI_LOG_MR,
646 				   "failed to unmonitor memory (entry=%p),"
647 				   " so clear notifier events again\n",
648 				   entry, fi_strerror(-rc));
649 
650 			__clear_notifier_events(cache);
651 		}
652 	}
653 }
654 
655 /**
656  * Destroys the memory registration cache entry and deregisters the memory
657  *   region with uGNI
658  *
659  * @param[in] entry  a memory registration cache entry
660  *
661  * @return           return code from callbacks
662  */
__mr_cache_entry_destroy(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)663 static inline int __mr_cache_entry_destroy(gnix_mr_cache_t *cache,
664 		gnix_mr_cache_entry_t *entry)
665 {
666 	int rc;
667 
668 	rc = cache->attr.dereg_callback(entry->mr,
669 			cache->attr.destruct_context);
670 	if (!rc) {
671 		/* Should we bother with this check?  If we don't, the
672 		 * only difference it that __clear_notifier_events
673 		 * will be called one additional time. */
674 		if (!__entry_is_unmapped(entry)) {
675 			__notifier_unmonitor(cache, entry);
676 		}
677 		__entry_reset_state(entry);
678 
679 		rc = cache->attr.destruct_callback(cache->attr.dereg_context);
680 		if (!rc)
681 			free(entry);
682 	} else {
683 		GNIX_INFO(FI_LOG_MR, "failed to deregister memory"
684 			  " region with callback, "
685 			  "cache_entry=%p ret=%i\n", entry, rc);
686 	}
687 
688 	return rc;
689 }
690 
__insert_entry_into_stale(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)691 static inline int __insert_entry_into_stale(
692 		gnix_mr_cache_t *cache,
693 		gnix_mr_cache_entry_t *entry)
694 {
695 	RbtStatus rc;
696 	int ret = 0;
697 
698 	if (__entry_is_unmapped(entry)) {
699 		GNIX_DEBUG(FI_LOG_MR, "entry (%p) unmapped, not inserting"
700 			   " into stale %llx:%llx", entry,
701 			   entry->key.address, entry->key.length);
702 		/* Should we return some other value? */
703 		return ret;
704 	}
705 
706 	rc = rbtInsert(cache->stale.rb_tree,
707 			&entry->key,
708 			entry);
709 	if (rc != RBT_STATUS_OK) {
710 		GNIX_ERR(FI_LOG_MR,
711 				"could not insert into stale rb tree,"
712 				" rc=%d key.address=%llx key.length=%llx entry=%p",
713 				rc,
714 				entry->key.address,
715 				entry->key.length,
716 				entry);
717 
718 		ret = __mr_cache_entry_destroy(cache, entry);
719 	} else {
720 		GNIX_DEBUG(FI_LOG_MR,
721 			   "inserted key=%llx:%llx into stale\n",
722 			   entry->key.address, entry->key.length);
723 
724 		__mr_cache_lru_enqueue(cache, entry);
725 		ofi_atomic_inc32(&cache->stale.elements);
726 		switch (__entry_get_state(entry)) {
727 		case  GNIX_CES_INUSE:
728 			__entry_set_state(entry, GNIX_CES_STALE);
729 			break;
730 		default:
731 			GNIX_FATAL(FI_LOG_MR,
732 				   "stale entry (%p) %llx:%llx in bad"
733 				   " state (%d)\n", entry,
734 				   entry->key.address, entry->key.length,
735 				   entry->state);
736 		}
737 	}
738 
739 	return ret;
740 }
741 
__resolve_stale_entry_collision(gnix_mr_cache_t * cache,RbtIterator found,gnix_mr_cache_entry_t * entry)742 static inline void __resolve_stale_entry_collision(
743 		gnix_mr_cache_t *cache,
744 		RbtIterator found,
745 		gnix_mr_cache_entry_t *entry)
746 {
747 	RbtStatus __attribute__((unused)) rc;
748 	gnix_mr_cache_entry_t *c_entry, *tmp;
749 	gnix_mr_cache_key_t *c_key;
750 	int ret;
751 	DLIST_HEAD(to_destroy);
752 	RbtIterator iter = found;
753 	int add_new_entry = 1, cmp;
754 
755 	GNIX_TRACE(FI_LOG_MR, "\n");
756 
757 	GNIX_DEBUG(FI_LOG_MR,
758 		   "resolving collisions with entry (%p) %llx:%llx\n",
759 		   entry, entry->key.address, entry->key.length);
760 
761 	while (iter) {
762 		rbtKeyValue(cache->stale.rb_tree, iter, (void **) &c_key,
763 				(void **) &c_entry);
764 
765 		cmp = __find_overlapping_addr(&entry->key, c_key);
766 		if (cmp != 0)
767 			break;
768 
769 		if (__can_subsume(&entry->key, c_key) ||
770 				(entry->key.length > c_key->length)) {
771 			GNIX_DEBUG(FI_LOG_MR,
772 				   "adding stale entry (%p) to destroy list,"
773 				   "  key=%llx:%llx\n", c_entry,
774 				   c_key->address, c_key->length);
775 			dlist_insert_tail(&c_entry->siblings, &to_destroy);
776 		} else {
777 			add_new_entry = 0;
778 		}
779 
780 		iter = rbtNext(cache->stale.rb_tree, iter);
781 	}
782 
783 	/* TODO I can probably do this in a single sweep, avoiding a second
784 	 * pass and incurring n lg n removal time
785 	 */
786 	dlist_for_each_safe(&to_destroy, c_entry, tmp, siblings)
787 	{
788 		GNIX_DEBUG(FI_LOG_MR, "removing key from tree, entry %p"
789 			   " key=%llx:%llx\n", c_entry,
790 			   c_entry->key.address, c_entry->key.length);
791 		iter = rbtFind(cache->stale.rb_tree, &c_entry->key);
792 		if (OFI_UNLIKELY(!iter)) {
793 			GNIX_FATAL(FI_LOG_MR, "key not found\n");
794 		}
795 
796 		rc = rbtErase(cache->stale.rb_tree,
797 						iter);
798 		if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
799 			GNIX_FATAL(FI_LOG_MR,
800 				   "could not remove entry from tree\n");
801 		}
802 
803 		if (__mr_cache_lru_remove(cache, c_entry) != FI_SUCCESS) {
804 			GNIX_WARN(FI_LOG_MR, "Failed to remove entry"
805 				  " from lru list (%p)\n",
806 				  c_entry);
807 		}
808 		ofi_atomic_dec32(&cache->stale.elements);
809 		dlist_remove(&c_entry->siblings);
810 		__mr_cache_entry_destroy(cache, c_entry);
811 	}
812 	if (OFI_UNLIKELY(!dlist_empty(&to_destroy))) {
813 		GNIX_FATAL(FI_LOG_MR, "to_destroy not empty\n");
814 	}
815 
816 	if (add_new_entry) {
817 		ret = __insert_entry_into_stale(cache, entry);
818 		if (ret) {
819 			GNIX_FATAL(FI_LOG_MR,
820 				   "Failed to insert subsumed MR "
821 				   " entry (%p) into stale list\n",
822 				   entry);
823 		}
824 	} else {
825 		/* stale entry is larger than this one
826 		 * so lets just toss this entry out
827 		 */
828 		GNIX_DEBUG(FI_LOG_MR,
829 			   "larger entry already exists, "
830 			   "to_destroy=%llx:%llx\n",
831 			   entry->key.address, entry->key.length);
832 
833 		ret = __mr_cache_entry_destroy(cache, entry);
834 		if (ret) {
835 			GNIX_ERR(FI_LOG_MR,
836 					"failed to destroy a "
837 					"registration, entry=%p grc=%d\n",
838 					c_entry, ret);
839 		}
840 	}
841 }
842 
843 /**
844  * Increments the reference count on a memory registration cache entry
845  *
846  * @param[in] cache  gnix memory registration cache
847  * @param[in] entry  a memory registration cache entry
848  *
849  * @return           reference count for the registration
850  */
__mr_cache_entry_get(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)851 static inline int __mr_cache_entry_get(
852 		gnix_mr_cache_t       *cache,
853 		gnix_mr_cache_entry_t *entry)
854 {
855 	GNIX_TRACE(FI_LOG_MR, "\n");
856 
857 	GNIX_DEBUG(FI_LOG_MR,
858 		   "Up ref cnt on entry %p\n", entry);
859 	return ofi_atomic_inc32(&entry->ref_cnt);
860 }
861 
862 /**
863  * Decrements the reference count on a memory registration cache entry
864  *
865  * @param[in] cache  gnix memory registration cache
866  * @param[in] entry  a memory registration cache entry
867  *
868  * @return           return code from dereg callback
869  */
__mr_cache_entry_put(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)870 static inline int __mr_cache_entry_put(
871 		gnix_mr_cache_t       *cache,
872 		gnix_mr_cache_entry_t *entry)
873 {
874 	RbtIterator iter;
875 	int rc;
876 	gni_return_t grc = GNI_RC_SUCCESS;
877 	RbtIterator found;
878 	gnix_mr_cache_entry_t *parent = NULL;
879 	struct dlist_entry *next;
880 
881 	GNIX_TRACE(FI_LOG_MR, "\n");
882 
883 	if (cache->attr.lazy_deregistration) {
884 		__clear_notifier_events(cache);
885 	}
886 
887 	GNIX_DEBUG(FI_LOG_MR,
888 		   "Decrease ref cnt on entry %p\n", entry);
889 	if (ofi_atomic_dec32(&entry->ref_cnt) == 0) {
890 		next = entry->siblings.next;
891 		dlist_remove(&entry->children);
892 		dlist_remove(&entry->siblings);
893 
894 		/* if this is the last child to deallocate,
895 		 * release the reference to the parent
896 		 */
897 		if (next != &entry->siblings && dlist_empty(next)) {
898 			parent = container_of(next, gnix_mr_cache_entry_t,
899 					children);
900 			grc = __mr_cache_entry_put(cache, parent);
901 			if (OFI_UNLIKELY(grc != GNI_RC_SUCCESS)) {
902 				GNIX_ERR(FI_LOG_MR,
903 						"failed to release reference to parent, "
904 						"parent=%p refs=%d\n",
905 						parent,
906 						ofi_atomic_get32(&parent->ref_cnt));
907 			}
908 		}
909 
910 		ofi_atomic_dec32(&cache->inuse.elements);
911 
912 		if (!__entry_is_retired(entry)) {
913 			iter = rbtFind(cache->inuse.rb_tree, &entry->key);
914 			if (OFI_UNLIKELY(!iter)) {
915 				GNIX_ERR(FI_LOG_MR,
916 						"failed to find entry in the inuse cache\n");
917 			} else {
918 				rc = rbtErase(cache->inuse.rb_tree, iter);
919 				if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
920 					GNIX_ERR(FI_LOG_MR,
921 						 "failed to erase lru entry"
922 						 " from stale tree\n");
923 				}
924 			}
925 		}
926 
927 		/* if we are doing lazy dereg and the entry
928 		 * isn't retired, put it in the stale cache
929 		 */
930 		if (cache->attr.lazy_deregistration &&
931 		    !(__entry_is_retired(entry))) {
932 			GNIX_DEBUG(FI_LOG_MR,
933 				   "moving key %llx:%llx to stale\n",
934 				   entry->key.address, entry->key.length);
935 
936 			found = rbtFindLeftmost(cache->stale.rb_tree,
937 					&entry->key, __find_overlapping_addr);
938 			if (found) {
939 				/* one or more stale entries would overlap with this
940 				 * new entry. We need to resolve these collisions by dropping
941 				 * registrations
942 				 */
943 				__resolve_stale_entry_collision(cache, found, entry);
944 			} else {
945 				/* if not found, ... */
946 				grc = __insert_entry_into_stale(cache, entry);
947 			}
948 		} else {
949 			/* if retired or not using lazy registration */
950 			GNIX_DEBUG(FI_LOG_MR,
951 				   "destroying entry, key=%llx:%llx\n",
952 				   entry->key.address, entry->key.length);
953 
954 			grc = __mr_cache_entry_destroy(cache, entry);
955 		}
956 
957 		if (OFI_UNLIKELY(grc != GNI_RC_SUCCESS)) {
958 			GNIX_INFO(FI_LOG_MR, "dereg callback returned '%s'\n",
959 				  gni_err_str[grc]);
960 		}
961 	}
962 
963 
964 	return grc;
965 }
966 
967 /**
968  * Checks the sanity of cache attributes
969  *
970  * @param[in]   attr  attributes structure to be checked
971  * @return      FI_SUCCESS if the attributes are valid
972  *              -FI_EINVAL if the attributes are invalid
973  */
__check_mr_cache_attr_sanity(gnix_mr_cache_attr_t * attr)974 static inline int __check_mr_cache_attr_sanity(gnix_mr_cache_attr_t *attr)
975 {
976 	/* 0 < attr->hard_reg_limit < attr->soft_reg_limit */
977 	if (attr->hard_reg_limit > 0 &&
978 			attr->hard_reg_limit < attr->soft_reg_limit)
979 		return -FI_EINVAL;
980 
981 	/* callbacks must be provided */
982 	if (!attr->reg_callback || !attr->dereg_callback ||
983 			!attr->destruct_callback)
984 		return -FI_EINVAL;
985 
986 	/* valid otherwise */
987 	return FI_SUCCESS;
988 }
989 
_gnix_mr_cache_init(gnix_mr_cache_t ** cache,gnix_mr_cache_attr_t * attr)990 int _gnix_mr_cache_init(
991 		gnix_mr_cache_t         **cache,
992 		gnix_mr_cache_attr_t    *attr)
993 {
994 	gnix_mr_cache_attr_t *cache_attr = &__default_mr_cache_attr;
995 	gnix_mr_cache_t *cache_p;
996 	int rc;
997 
998 	GNIX_TRACE(FI_LOG_MR, "\n");
999 
1000 	/* if the provider asks us to use their attributes, are they sane? */
1001 	if (attr) {
1002 		if (__check_mr_cache_attr_sanity(attr) != FI_SUCCESS)
1003 			return -FI_EINVAL;
1004 
1005 		cache_attr = attr;
1006 	}
1007 
1008 	cache_p = (gnix_mr_cache_t *) calloc(1, sizeof(*cache_p));
1009 	if (!cache_p)
1010 		return -FI_ENOMEM;
1011 
1012 	/* save the attribute values */
1013 	memcpy(&cache_p->attr, cache_attr, sizeof(*cache_attr));
1014 
1015 	/* list is used because entries can be removed from the stale list if
1016 	 *   a user might call register on a stale entry's memory region
1017 	 */
1018 	dlist_init(&cache_p->lru_head);
1019 
1020 	/* set up inuse tree */
1021 	cache_p->inuse.rb_tree = rbtNew(__mr_cache_key_comp);
1022 	if (!cache_p->inuse.rb_tree) {
1023 		rc = -FI_ENOMEM;
1024 		goto err_inuse;
1025 	}
1026 
1027 	/* if using lazy deregistration, set up stale tree */
1028 	if (cache_p->attr.lazy_deregistration) {
1029 		cache_p->stale.rb_tree = rbtNew(__mr_cache_key_comp);
1030 		if (!cache_p->stale.rb_tree) {
1031 			rc = -FI_ENOMEM;
1032 			goto err_stale;
1033 		}
1034 	}
1035 
1036 	/* initialize the element counts. If we are reinitializing a dead cache,
1037 	 *   destroy will have already set the element counts
1038 	 */
1039 	if (cache_p->state == GNIX_MRC_STATE_UNINITIALIZED) {
1040 		ofi_atomic_initialize32(&cache_p->inuse.elements, 0);
1041 		ofi_atomic_initialize32(&cache_p->stale.elements, 0);
1042 	}
1043 
1044 	cache_p->hits = 0;
1045 	cache_p->misses = 0;
1046 
1047 	cache_p->state = GNIX_MRC_STATE_READY;
1048 
1049 	dlist_init(&cache_p->rq.list);
1050 	dlist_init(&cache_p->rq.entry);
1051 	fastlock_init(&cache_p->rq.lock);
1052 
1053 	*cache = cache_p;
1054 
1055 	return FI_SUCCESS;
1056 
1057 err_stale:
1058 	rbtDelete(cache_p->inuse.rb_tree);
1059 	cache_p->inuse.rb_tree = NULL;
1060 err_inuse:
1061 	free(cache_p);
1062 
1063 	return rc;
1064 }
1065 
_gnix_mr_cache_destroy(gnix_mr_cache_t * cache)1066 int _gnix_mr_cache_destroy(gnix_mr_cache_t *cache)
1067 {
1068 	if (cache->state != GNIX_MRC_STATE_READY)
1069 		return -FI_EINVAL;
1070 
1071 	GNIX_TRACE(FI_LOG_MR, "\n");
1072 
1073 	/*
1074 	 * Remove all of the stale entries from the cache
1075 	 */
1076 	_gnix_mr_cache_flush(cache);
1077 
1078 	/*
1079 	 * if there are still elements in the cache after the flush,
1080 	 *   then someone forgot to deregister memory.
1081 	 *   We probably shouldn't destroy the cache at this point.
1082 	 */
1083 	if (ofi_atomic_get32(&cache->inuse.elements) != 0) {
1084 		return -FI_EAGAIN;
1085 	}
1086 
1087 	/* destroy the tree */
1088 	rbtDelete(cache->inuse.rb_tree);
1089 	cache->inuse.rb_tree = NULL;
1090 
1091 	/* stale will been flushed already, so just destroy the tree */
1092 	if (cache->attr.lazy_deregistration) {
1093 		rbtDelete(cache->stale.rb_tree);
1094 		cache->stale.rb_tree = NULL;
1095 	}
1096 
1097 	cache->state = GNIX_MRC_STATE_DEAD;
1098 	free(cache);
1099 
1100 	return FI_SUCCESS;
1101 }
1102 
__mr_cache_flush(gnix_mr_cache_t * cache,int flush_count)1103 int __mr_cache_flush(gnix_mr_cache_t *cache, int flush_count) {
1104 	int rc;
1105 	RbtIterator iter;
1106 	gnix_mr_cache_entry_t *entry;
1107 	int destroyed = 0;
1108 
1109 	GNIX_TRACE(FI_LOG_MR, "\n");
1110 
1111 	GNIX_DEBUG(FI_LOG_MR, "starting flush on memory registration cache\n");
1112 
1113 	/* flushes are unnecessary for caches without lazy deregistration */
1114 	if (!cache->attr.lazy_deregistration)
1115 		return FI_SUCCESS;
1116 
1117 	while (!dlist_empty(&cache->lru_head)) {
1118 
1119 		if (flush_count >= 0 && flush_count == destroyed)
1120 			break;
1121 
1122 		rc = __mr_cache_lru_dequeue(cache, &entry);
1123 		if (OFI_UNLIKELY(rc != FI_SUCCESS)) {
1124 			GNIX_ERR(FI_LOG_MR,
1125 					"list may be corrupt, no entries from lru pop\n");
1126 			break;
1127 		}
1128 
1129 		GNIX_DEBUG(FI_LOG_MR, "attempting to flush key %llx:%llx\n",
1130 			   entry->key.address, entry->key.length);
1131 		iter = rbtFind(cache->stale.rb_tree, &entry->key);
1132 		if (OFI_UNLIKELY(!iter)) {
1133 			GNIX_ERR(FI_LOG_MR,
1134 				 "lru entries MUST be present in the cache,"
1135 				 " could not find entry (%p) in stale tree"
1136 				 " %llx:%llx\n",
1137 				 entry, entry->key.address, entry->key.length);
1138 			break;
1139 		}
1140 
1141 		rc = rbtErase(cache->stale.rb_tree, iter);
1142 		if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1143 			GNIX_ERR(FI_LOG_MR,
1144 					"failed to erase lru entry from stale tree\n");
1145 			break;
1146 		}
1147 
1148 		__mr_cache_entry_destroy(cache, entry);
1149 		entry = NULL;
1150 		++destroyed;
1151 	}
1152 
1153 	GNIX_DEBUG(FI_LOG_MR, "flushed %i of %i entries from memory "
1154 		   "registration cache\n", destroyed,
1155 		   ofi_atomic_get32(&cache->stale.elements));
1156 
1157 	if (destroyed > 0) {
1158 		ofi_atomic_sub32(&cache->stale.elements, destroyed);
1159 	}
1160 
1161 	return FI_SUCCESS;
1162 }
1163 
_gnix_mr_cache_flush(gnix_mr_cache_t * cache)1164 int _gnix_mr_cache_flush(gnix_mr_cache_t *cache)
1165 {
1166 
1167 	if (OFI_UNLIKELY(cache->state != GNIX_MRC_STATE_READY))
1168 		return -FI_EINVAL;
1169 
1170 	__mr_cache_flush(cache, cache->attr.hard_reg_limit);
1171 
1172 	return FI_SUCCESS;
1173 }
1174 
__mr_cache_search_inuse(gnix_mr_cache_t * cache,uint64_t address,uint64_t length,gnix_mr_cache_entry_t ** entry,gnix_mr_cache_key_t * key,struct _gnix_fi_reg_context * fi_reg_context)1175 static int __mr_cache_search_inuse(
1176 		gnix_mr_cache_t             *cache,
1177 		uint64_t                    address,
1178 		uint64_t                    length,
1179 		gnix_mr_cache_entry_t       **entry,
1180 		gnix_mr_cache_key_t         *key,
1181 		struct _gnix_fi_reg_context *fi_reg_context)
1182 {
1183 	int ret = FI_SUCCESS, cmp;
1184 	RbtIterator iter;
1185 	gnix_mr_cache_key_t *found_key, new_key;
1186 	gnix_mr_cache_entry_t *found_entry;
1187 	uint64_t new_end, found_end;
1188 	DLIST_HEAD(retired_entries);
1189 
1190 	if (cache->attr.lazy_deregistration) {
1191 		__clear_notifier_events(cache);
1192 	}
1193 
1194 	/* first we need to find an entry that overlaps with this one.
1195 	 * care should be taken to find the left most entry that overlaps
1196 	 * with this entry since the entry we are searching for might overlap
1197 	 * many entries and the tree may be left or right balanced
1198 	 * at the head
1199 	 */
1200 	iter = rbtFindLeftmost(cache->inuse.rb_tree, (void *) key,
1201 			__find_overlapping_addr);
1202 	if (!iter) {
1203 		GNIX_DEBUG(FI_LOG_MR,
1204 			   "could not find key in inuse, key=%llx:%llx\n",
1205 			   key->address, key->length);
1206 		return -FI_ENOENT;
1207 	}
1208 
1209 	rbtKeyValue(cache->inuse.rb_tree, iter, (void **) &found_key,
1210 			(void **) &found_entry);
1211 
1212 	GNIX_DEBUG(FI_LOG_MR,
1213 		   "found a key that matches the search criteria, "
1214 		   "found=%llx:%llx key=%llx:%llx\n",
1215 		   found_key->address, found_key->length,
1216 		   key->address, key->length);
1217 
1218 	/* if the entry that we've found completely subsumes
1219 	 * the requested entry, just return a reference to
1220 	 * that existing registration
1221 	 */
1222 	if (__can_subsume(found_key, key)) {
1223 		GNIX_DEBUG(FI_LOG_MR,
1224 			   "found an entry that subsumes the request, "
1225 			   "existing=%llx:%llx key=%llx:%llx entry %p\n",
1226 			   found_key->address, found_key->length,
1227 			   key->address, key->length, found_entry);
1228 		*entry = found_entry;
1229 		__mr_cache_entry_get(cache, found_entry);
1230 
1231 		cache->hits++;
1232 		return FI_SUCCESS;
1233 	}
1234 
1235 	/* otherwise, iterate from the existing entry until we can no longer
1236 	 * find an entry that overlaps with the new registration and remove
1237 	 * and retire each of those entries.
1238 	 */
1239 	new_key.address = MIN(found_key->address, key->address);
1240 	new_end = key->address + key->length;
1241 	while (iter) {
1242 		rbtKeyValue(cache->inuse.rb_tree, iter, (void **) &found_key,
1243 				(void **) &found_entry);
1244 
1245 
1246 		cmp = __find_overlapping_addr(found_key, key);
1247 		GNIX_DEBUG(FI_LOG_MR,
1248 			   "candidate: key=%llx:%llx result=%d\n",
1249 			   found_key->address,
1250 			   found_key->length, cmp);
1251 		if (cmp != 0)
1252 			break;
1253 
1254 		/* compute new ending address */
1255 		found_end = found_key->address + found_key->length;
1256 
1257 		/* mark the entry as retired */
1258 		GNIX_DEBUG(FI_LOG_MR, "retiring entry, key=%llx:%llx entry %p\n",
1259 			   found_key->address, found_key->length, found_entry);
1260 		__entry_set_retired(found_entry);
1261 		dlist_insert_tail(&found_entry->siblings, &retired_entries);
1262 
1263 		iter = rbtNext(cache->inuse.rb_tree, iter);
1264 	}
1265 	/* Since our new key might fully overlap every other entry in the tree,
1266 	 * we need to take the maximum of the last entry and the new entry
1267 	 */
1268 	new_key.length = MAX(found_end, new_end) - new_key.address;
1269 
1270 
1271 	/* remove retired entries from tree */
1272 	GNIX_DEBUG(FI_LOG_MR, "removing retired entries from inuse tree\n");
1273 	__remove_sibling_entries_from_tree(cache,
1274 			&retired_entries, cache->inuse.rb_tree);
1275 
1276 	/* create new registration */
1277 	GNIX_DEBUG(FI_LOG_MR,
1278 		   "creating a new merged registration, key=%llx:%llx\n",
1279 		   new_key.address, new_key.length);
1280 	ret = __mr_cache_create_registration(cache,
1281 			new_key.address, new_key.length,
1282 			entry, &new_key, fi_reg_context);
1283 	if (ret) {
1284 		/* If we get here, one of two things have happened.
1285 		 * Either some part of the new merged registration was
1286 		 * unmapped (i.e., freed by user) or the merged
1287 		 * registration failed for some other reason (probably
1288 		 * GNI_RC_ERROR_RESOURCE).  The first case is a user
1289 		 * error (which they should have been warned about by
1290 		 * the notifier), and the second case is always
1291 		 * possible.  Neither case is a problem.  The entries
1292 		 * above have been retired, and here we return the
1293 		 * error */
1294 		GNIX_DEBUG(FI_LOG_MR,
1295 			   "failed to create merged registration, key=",
1296 			   new_key.address, new_key.length);
1297 		return ret;
1298 	}
1299 	GNIX_DEBUG(FI_LOG_MR,
1300 		   "created a new merged registration, key=%llx:%llx entry %p\n",
1301 		   new_key.address, new_key.length, *entry);
1302 
1303 	__entry_set_merged(*entry);
1304 
1305 	/* move retired entries to the head of the new entry's child list */
1306 	if (!dlist_empty(&retired_entries)) {
1307 		__attach_retired_entries_to_registration(cache,
1308 				&retired_entries, *entry);
1309 	}
1310 
1311 	cache->misses++;
1312 
1313 	return ret;
1314 }
1315 
__mr_cache_search_stale(gnix_mr_cache_t * cache,uint64_t address,uint64_t length,gnix_mr_cache_entry_t ** entry,gnix_mr_cache_key_t * key,struct _gnix_fi_reg_context * fi_reg_context)1316 static int __mr_cache_search_stale(
1317 		gnix_mr_cache_t             *cache,
1318 		uint64_t                    address,
1319 		uint64_t                    length,
1320 		gnix_mr_cache_entry_t       **entry,
1321 		gnix_mr_cache_key_t         *key,
1322 		struct _gnix_fi_reg_context *fi_reg_context)
1323 {
1324 	int ret;
1325 	RbtStatus rc;
1326 	RbtIterator iter;
1327 	gnix_mr_cache_key_t *mr_key;
1328 	gnix_mr_cache_entry_t *mr_entry, *tmp;
1329 
1330 	if (cache->attr.lazy_deregistration) {
1331 		__clear_notifier_events(cache);
1332 	}
1333 
1334 	GNIX_DEBUG(FI_LOG_MR, "searching for stale entry, key=%llx:%llx\n",
1335 		   key->address, key->length);
1336 
1337 	iter = rbtFindLeftmost(cache->stale.rb_tree, (void *) key,
1338 			__find_overlapping_addr);
1339 	if (!iter)
1340 		return -FI_ENOENT;
1341 
1342 	rbtKeyValue(cache->stale.rb_tree, iter, (void **) &mr_key,
1343 			(void **) &mr_entry);
1344 
1345 	GNIX_DEBUG(FI_LOG_MR,
1346 		   "found a matching entry, found=%llx:%llx key=%llx:%llx\n",
1347 		   mr_key->address, mr_key->length,
1348 		   key->address, key->length);
1349 
1350 
1351 	/* if the entry that we've found completely subsumes
1352 	 * the requested entry, just return a reference to
1353 	 * that existing registration
1354 	 */
1355 	if (__can_subsume(mr_key, key)) {
1356 		ret = __mr_cache_search_inuse(cache, address, length,
1357 				&tmp, mr_key, fi_reg_context);
1358 		if (ret == FI_SUCCESS) {
1359 			/* if we found an entry in the inuse tree
1360 			 * in this manner, it means that there was
1361 			 * an entry either overlapping or contiguous
1362 			 * with the stale entry in the inuse tree, and
1363 			 * a new entry has been made and saved to tmp.
1364 			 * The old entry (mr_entry) should be destroyed
1365 			 * now as it is no longer needed.
1366 			 */
1367 			GNIX_DEBUG(FI_LOG_MR,
1368 				   "removing entry from stale key=%llx:%llx\n",
1369 				   mr_key->address, mr_key->length);
1370 
1371 			rc = rbtErase(cache->stale.rb_tree, iter);
1372 			if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1373 				GNIX_ERR(FI_LOG_MR,
1374 						"failed to erase entry from stale tree\n");
1375 			} else {
1376 				if (__mr_cache_lru_remove(cache, mr_entry)
1377 				    == FI_SUCCESS) {
1378 					ofi_atomic_dec32(&cache->stale.elements);
1379 				} else {
1380 					GNIX_WARN(FI_LOG_MR, "Failed to remove"
1381 						  " entry (%p) from lru list\n",
1382 						  mr_entry);
1383 				}
1384 				__mr_cache_entry_destroy(cache, mr_entry);
1385 			}
1386 
1387 			*entry = tmp;
1388 		} else {
1389 			GNIX_DEBUG(FI_LOG_MR,
1390 				   "removing entry (%p) from stale and"
1391 				   " migrating to inuse, key=%llx:%llx\n",
1392 				   mr_entry, mr_key->address, mr_key->length);
1393 			rc = rbtErase(cache->stale.rb_tree, iter);
1394 			if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1395 				GNIX_FATAL(FI_LOG_MR,
1396 					   "failed to erase entry (%p) from "
1397 					   " stale tree\n", mr_entry);
1398 			}
1399 
1400 			if (__mr_cache_lru_remove(cache, mr_entry)
1401 			    != FI_SUCCESS) {
1402 				GNIX_WARN(FI_LOG_MR, "Failed to remove"
1403 					  " entry (%p) from lru list\n",
1404 					  mr_entry);
1405 			}
1406 
1407 			ofi_atomic_dec32(&cache->stale.elements);
1408 
1409 			/* if we made it to this point, there weren't
1410 			 * any entries in the inuse tree that would
1411 			 * have overlapped with this entry
1412 			 */
1413 			rc = rbtInsert(cache->inuse.rb_tree,
1414 					&mr_entry->key, mr_entry);
1415 			if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1416 				GNIX_FATAL(FI_LOG_MR,
1417 					   "failed to insert entry into"
1418 					   "inuse tree\n");
1419 			}
1420 
1421 			ofi_atomic_set32(&mr_entry->ref_cnt, 1);
1422 			ofi_atomic_inc32(&cache->inuse.elements);
1423 
1424 			*entry = mr_entry;
1425 		}
1426 
1427 		return FI_SUCCESS;
1428 	}
1429 
1430 	GNIX_DEBUG(FI_LOG_MR,
1431 		   "could not use matching entry, "
1432 		   "found=%llx:%llx\n",
1433 		   mr_key->address, mr_key->length);
1434 
1435 	return -FI_ENOENT;
1436 }
1437 
__mr_cache_create_registration(gnix_mr_cache_t * cache,uint64_t address,uint64_t length,gnix_mr_cache_entry_t ** entry,gnix_mr_cache_key_t * key,struct _gnix_fi_reg_context * fi_reg_context)1438 static int __mr_cache_create_registration(
1439 		gnix_mr_cache_t             *cache,
1440 		uint64_t                    address,
1441 		uint64_t                    length,
1442 		gnix_mr_cache_entry_t       **entry,
1443 		gnix_mr_cache_key_t         *key,
1444 		struct _gnix_fi_reg_context *fi_reg_context)
1445 {
1446 	int rc;
1447 	gnix_mr_cache_entry_t *current_entry;
1448 	void *handle;
1449 
1450 	/* if we made it here, we didn't find the entry at all */
1451 	current_entry = calloc(1, sizeof(*current_entry) + cache->attr.elem_size);
1452 	if (!current_entry)
1453 		return -FI_ENOMEM;
1454 
1455 	handle = (void *) current_entry->mr;
1456 
1457 	dlist_init(&current_entry->lru_entry);
1458 	dlist_init(&current_entry->children);
1459 	dlist_init(&current_entry->siblings);
1460 
1461 	handle = cache->attr.reg_callback(handle, (void *) address, length,
1462 			fi_reg_context, cache->attr.reg_context);
1463 	if (OFI_UNLIKELY(!handle)) {
1464 		GNIX_INFO(FI_LOG_MR,
1465 			  "failed to register memory with callback\n");
1466 		goto err;
1467 	}
1468 
1469 	__entry_reset_state(current_entry);
1470 
1471 	/* set up the entry's key */
1472 	current_entry->key.address = address;
1473 	current_entry->key.length = length;
1474 
1475 	rc = __notifier_monitor(cache, current_entry);
1476 	if (OFI_UNLIKELY(rc != FI_SUCCESS)) {
1477 		GNIX_INFO(FI_LOG_MR,
1478 			  "failed to monitor memory with notifier\n");
1479 		goto err_dereg;
1480 	}
1481 
1482 	rc = rbtInsert(cache->inuse.rb_tree, &current_entry->key,
1483 			current_entry);
1484 	if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1485 		GNIX_ERR(FI_LOG_MR, "failed to insert registration "
1486 				"into cache, ret=%i\n", rc);
1487 
1488 		goto err_dereg;
1489 	}
1490 
1491 	GNIX_DEBUG(FI_LOG_MR, "inserted key %llx:%llx into inuse %p\n",
1492 		   current_entry->key.address, current_entry->key.length, current_entry);
1493 
1494 
1495 	ofi_atomic_inc32(&cache->inuse.elements);
1496 	ofi_atomic_initialize32(&current_entry->ref_cnt, 1);
1497 
1498 	*entry = current_entry;
1499 
1500 	return FI_SUCCESS;
1501 
1502 err_dereg:
1503 	rc = cache->attr.dereg_callback(current_entry->mr,
1504 					cache->attr.dereg_context);
1505 	if (OFI_UNLIKELY(rc)) {
1506 		GNIX_INFO(FI_LOG_MR,
1507 			  "failed to deregister memory with "
1508 			  "callback, ret=%d\n", rc);
1509 	}
1510 err:
1511 	free(current_entry);
1512 	return -FI_ENOMEM;
1513 }
1514 
1515 /**
1516  * Function to register memory with the cache
1517  *
1518  * @param[in] cache           gnix memory registration cache pointer
1519  * @param[in] mr              gnix memory region descriptor pointer
1520  * @param[in] address         base address of the memory region to be
1521  *                            registered
1522  * @param[in] length          length of the memory region to be registered
1523  * @param[in] fi_reg_context  fi_reg_mr API call parameters
1524  * @param[in,out] mem_hndl    gni memory handle pointer to written to and
1525  *                            returned
1526  */
_gnix_mr_cache_register(gnix_mr_cache_t * cache,uint64_t address,uint64_t length,struct _gnix_fi_reg_context * fi_reg_context,void ** handle)1527 int _gnix_mr_cache_register(
1528 		gnix_mr_cache_t             *cache,
1529 		uint64_t                    address,
1530 		uint64_t                    length,
1531 		struct _gnix_fi_reg_context *fi_reg_context,
1532 		void                        **handle)
1533 {
1534 	int ret;
1535 	gnix_mr_cache_key_t key = {
1536 			.address = address,
1537 			.length = length,
1538 	};
1539 	gnix_mr_cache_entry_t *entry;
1540 
1541 	GNIX_TRACE(FI_LOG_MR, "\n");
1542 
1543 	/* fastpath inuse */
1544 	ret = __mr_cache_search_inuse(cache, address, length,
1545 			&entry, &key, fi_reg_context);
1546 	if (ret == FI_SUCCESS)
1547 		goto success;
1548 
1549 	/* if we shouldn't introduce any new elements, return -FI_ENOSPC */
1550 	if (OFI_UNLIKELY(cache->attr.hard_reg_limit > 0 &&
1551 			 (ofi_atomic_get32(&cache->inuse.elements) >=
1552 			  cache->attr.hard_reg_limit))) {
1553 		ret = -FI_ENOSPC;
1554 		goto err;
1555 	}
1556 
1557 	if (cache->attr.lazy_deregistration) {
1558 		__clear_notifier_events(cache);
1559 
1560 		/* if lazy deregistration is in use, we can check the
1561 		 *   stale tree
1562 		 */
1563 		ret = __mr_cache_search_stale(cache, address, length,
1564 				&entry, &key, fi_reg_context);
1565 		if (ret == FI_SUCCESS) {
1566 			cache->hits++;
1567 			goto success;
1568 		}
1569 	}
1570 
1571 	/* If the cache is full, then flush one of the stale entries to make
1572 	 *   room for the new entry. This works because we check above to see if
1573 	 *   the number of inuse entries exceeds the hard reg limit
1574 	 */
1575 	if ((ofi_atomic_get32(&cache->inuse.elements) +
1576 			ofi_atomic_get32(&cache->stale.elements)) == cache->attr.hard_reg_limit)
1577 		__mr_cache_flush(cache, 1);
1578 
1579 	ret = __mr_cache_create_registration(cache, address, length,
1580 			&entry, &key, fi_reg_context);
1581 	if (ret) {
1582 		goto err;
1583 	}
1584 
1585 	cache->misses++;
1586 
1587 success:
1588 	__entry_set_state(entry, GNIX_CES_INUSE);
1589 	*handle = (void *) entry->mr;
1590 
1591 	return FI_SUCCESS;
1592 
1593 err:
1594 	return ret;
1595 }
1596 
1597 /**
1598  * Function to deregister memory in the cache
1599  *
1600  * @param[in]  mr     gnix memory registration descriptor pointer
1601  *
1602  * @return     FI_SUCCESS on success
1603  *             -FI_ENOENT if there isn't an active memory registration
1604  *               associated with the mr
1605  *             return codes associated with dereg callback
1606  */
_gnix_mr_cache_deregister(gnix_mr_cache_t * cache,void * handle)1607 int _gnix_mr_cache_deregister(
1608 		gnix_mr_cache_t *cache,
1609 		void            *handle)
1610 {
1611 	gnix_mr_cache_entry_t *entry;
1612 	gni_return_t grc;
1613 
1614 	GNIX_TRACE(FI_LOG_MR, "\n");
1615 
1616 	/* check to see if we can find the entry so that we can drop the
1617 	 *   held reference
1618 	 */
1619 
1620 	entry = container_of(handle, gnix_mr_cache_entry_t, mr);
1621 	if (__entry_get_state(entry) != GNIX_CES_INUSE) {
1622 		GNIX_WARN(FI_LOG_MR, "entry (%p) in incorrect state (%d)\n",
1623 			  entry, entry->state);
1624 		return -FI_EINVAL;
1625 	}
1626 
1627 	GNIX_DEBUG(FI_LOG_MR, "entry found, entry=%p refs=%d\n",
1628 		   entry, ofi_atomic_get32(&entry->ref_cnt));
1629 
1630 	grc = __mr_cache_entry_put(cache, entry);
1631 
1632 	/* Since we check this on each deregistration, the amount of elements
1633 	 * over the limit should always be 1
1634 	 */
1635 	if (ofi_atomic_get32(&cache->stale.elements) > cache->attr.hard_stale_limit)
1636 		__mr_cache_flush(cache, 1);
1637 
1638 	return gnixu_to_fi_errno(grc);
1639 }
1640 
1641