1 /*
2 * Copyright (c) 2016-2017 Cray Inc. All rights reserved.
3 *
4 * This software is available to you under a choice of one of two
5 * licenses. You may choose to be licensed under the terms of the GNU
6 * General Public License (GPL) Version 2, available from the file
7 * COPYING in the main directory of this source tree, or the
8 * BSD license below:
9 *
10 * Redistribution and use in source and binary forms, with or
11 * without modification, are permitted provided that the following
12 * conditions are met:
13 *
14 * - Redistributions of source code must retain the above
15 * copyright notice, this list of conditions and the following
16 * disclaimer.
17 *
18 * - Redistributions in binary form must reproduce the above
19 * copyright notice, this list of conditions and the following
20 * disclaimer in the documentation and/or other materials
21 * provided with the distribution.
22 *
23 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30 * SOFTWARE.
31 */
32
33 #include <gnix_mr_cache.h>
34 #include <gnix_smrn.h>
35 #include <gnix_mr_notifier.h>
36 #include <gnix.h>
37
38 typedef unsigned long long int cache_entry_state_t;
39 /* These are used for entry state and should be unique */
40 #define GNIX_CES_INUSE (1ULL << 8) /* in use */
41 #define GNIX_CES_STALE (2ULL << 8) /* cached for possible reuse */
42 #define GNIX_CES_STATE_MASK (0xFULL << 8)
43
44 typedef unsigned long long int cache_entry_flag_t;
45 /* One or more of these can be combined with the above */
46 #define GNIX_CE_RETIRED (1ULL << 61) /* in use, but not to be reused */
47 #define GNIX_CE_MERGED (1ULL << 62) /* merged entry, i.e., not
48 * an original request from
49 * fi_mr_reg */
50 #define GNIX_CE_UNMAPPED (1ULL << 63) /* at least 1 page of the
51 * entry has been unmapped
52 * by the OS */
53
54 /**
55 * @brief structure for containing the fields relevant to the memory cache key
56 *
57 * @var address base address of the memory region
58 * @var address length of the memory region
59 */
60 typedef struct gnix_mr_cache_key {
61 uint64_t address;
62 uint64_t length;
63 } gnix_mr_cache_key_t;
64
65 /**
66 * @brief gnix memory registration cache entry
67 *
68 * @var state state of the memory registration cache entry
69 * @var mr gnix memory registration descriptor
70 * @var mem_hndl gni memory handle for the memory registration
71 * @var key gnix memory registration cache key
72 * @var domain gnix domain associated with the memory registration
73 * @var nic gnix nic associated with the memory registration
74 * @var ref_cnt reference counting for the cache
75 * @var lru_entry lru list entry
76 * @var siblings list of sibling entries
77 * @var children list of subsumed child entries
78 */
79 typedef struct gnix_mr_cache_entry {
80 struct gnix_smrn_context context;
81 cache_entry_state_t state;
82 gnix_mr_cache_key_t key;
83 ofi_atomic32_t ref_cnt;
84 struct dlist_entry lru_entry;
85 struct dlist_entry siblings;
86 struct dlist_entry children;
87 uint64_t mr[0];
88 } gnix_mr_cache_entry_t;
89
90 /* forward declarations */
91 int _gnix_mr_cache_register(
92 gnix_mr_cache_t *cache,
93 uint64_t address,
94 uint64_t length,
95 struct _gnix_fi_reg_context *fi_reg_context,
96 void **handle);
97
98 int _gnix_mr_cache_deregister(
99 gnix_mr_cache_t *cache,
100 void *handle);
101
102 static inline int __mr_cache_entry_put(
103 gnix_mr_cache_t *cache,
104 gnix_mr_cache_entry_t *entry);
105
106 static inline int __mr_cache_entry_get(
107 gnix_mr_cache_t *cache,
108 gnix_mr_cache_entry_t *entry);
109
110 static inline int __mr_cache_entry_destroy(gnix_mr_cache_t *cache,
111 gnix_mr_cache_entry_t *entry);
112
113 static int __mr_cache_create_registration(
114 gnix_mr_cache_t *cache,
115 uint64_t address,
116 uint64_t length,
117 gnix_mr_cache_entry_t **entry,
118 gnix_mr_cache_key_t *key,
119 struct _gnix_fi_reg_context *fi_reg_context);
120
121
122 /* global declarations */
123
124 /* default attributes for new caches */
125 gnix_mr_cache_attr_t __default_mr_cache_attr = {
126 .soft_reg_limit = 4096,
127 .hard_reg_limit = -1,
128 .hard_stale_limit = 128,
129 #if HAVE_KDREG
130 .lazy_deregistration = 1,
131 #else
132 .lazy_deregistration = 0,
133 #endif
134 };
135
136 /* Functions for using and manipulating cache entry state */
__entry_get_state(gnix_mr_cache_entry_t * e)137 static inline cache_entry_state_t __entry_get_state(gnix_mr_cache_entry_t *e)
138 {
139 return e->state & GNIX_CES_STATE_MASK;
140 }
141
__entry_set_state(gnix_mr_cache_entry_t * e,cache_entry_state_t s)142 static inline void __entry_set_state(gnix_mr_cache_entry_t *e,
143 cache_entry_state_t s)
144 {
145 e->state = (e->state & ~GNIX_CES_STATE_MASK) |
146 (s & GNIX_CES_STATE_MASK);
147 }
148
__entry_reset_state(gnix_mr_cache_entry_t * e)149 static inline void __entry_reset_state(gnix_mr_cache_entry_t *e)
150 {
151 e->state = 0ULL;
152 }
153
__entry_is_flag(gnix_mr_cache_entry_t * e,cache_entry_flag_t f)154 static inline bool __entry_is_flag(gnix_mr_cache_entry_t *e,
155 cache_entry_flag_t f)
156 {
157 return (e->state & f) != 0;
158 }
159
__entry_set_flag(gnix_mr_cache_entry_t * e,cache_entry_flag_t f)160 static inline void __entry_set_flag(gnix_mr_cache_entry_t *e,
161 cache_entry_flag_t f)
162 {
163 e->state = e->state | f;
164 }
165
__entry_is_retired(gnix_mr_cache_entry_t * e)166 static inline bool __entry_is_retired(gnix_mr_cache_entry_t *e)
167 {
168 return __entry_is_flag(e, GNIX_CE_RETIRED);
169 }
170
__entry_is_merged(gnix_mr_cache_entry_t * e)171 static inline bool __entry_is_merged(gnix_mr_cache_entry_t *e)
172 {
173 return __entry_is_flag(e, GNIX_CE_MERGED);
174 }
175
__entry_is_unmapped(gnix_mr_cache_entry_t * e)176 static inline bool __entry_is_unmapped(gnix_mr_cache_entry_t *e)
177 {
178 return __entry_is_flag(e, GNIX_CE_UNMAPPED);
179 }
180
__entry_set_retired(gnix_mr_cache_entry_t * e)181 static inline void __entry_set_retired(gnix_mr_cache_entry_t *e)
182 {
183 __entry_set_flag(e, GNIX_CE_RETIRED);
184 }
185
__entry_set_merged(gnix_mr_cache_entry_t * e)186 static inline void __entry_set_merged(gnix_mr_cache_entry_t *e)
187 {
188 __entry_set_flag(e, GNIX_CE_MERGED);
189 }
190
__entry_set_unmapped(gnix_mr_cache_entry_t * e)191 static inline void __entry_set_unmapped(gnix_mr_cache_entry_t *e)
192 {
193 __entry_set_flag(e, GNIX_CE_UNMAPPED);
194 }
195
196 /**
197 * Key comparison function for finding overlapping gnix memory
198 * registration cache entries
199 *
200 * @param[in] x key to be inserted or found
201 * @param[in] y key to be compared against
202 *
203 * @return -1 if it should be positioned at the left, 0 if it overlaps,
204 * 1 otherwise
205 */
__find_overlapping_addr(void * x,void * y)206 static int __find_overlapping_addr(
207 void *x,
208 void *y)
209 {
210 gnix_mr_cache_key_t *to_find = (gnix_mr_cache_key_t *) x;
211 gnix_mr_cache_key_t *to_compare = (gnix_mr_cache_key_t *) y;
212 uint64_t to_find_end = to_find->address + to_find->length - 1;
213 uint64_t to_compare_end = to_compare->address + to_compare->length - 1;
214
215 /* format: (x_addr, x_len) - (y_addr, y_len) truth_value
216 *
217 * case 1: (0x1000, 0x1000) - (0x1400, 0x0800) true
218 * case 2: (0x1000, 0x1000) - (0x0C00, 0x0800) true
219 * case 3: (0x1000, 0x1000) - (0x1C00, 0x0800) true
220 * case 4: (0x1000, 0x1000) - (0x0C00, 0x2000) true
221 * case 5: (0x1000, 0x1000) - (0x0400, 0x0400) false
222 * case 6: (0x1000, 0x1000) - (0x2400, 0x0400) false
223 */
224 if (!(to_find_end < to_compare->address ||
225 to_compare_end < to_find->address))
226 return 0;
227
228 /* left */
229 if (to_find->address < to_compare->address)
230 return -1;
231
232 return 1;
233 }
234
235 /**
236 * Key comparison function for gnix memory registration caches
237 *
238 * @param[in] x key to be inserted or found
239 * @param[in] y key to be compared against
240 *
241 * @return -1 if it should be positioned at the left, 0 if the same,
242 * 1 otherwise
243 */
__mr_cache_key_comp(void * x,void * y)244 static inline int __mr_cache_key_comp(
245 void *x,
246 void *y)
247 {
248 gnix_mr_cache_key_t *to_insert = (gnix_mr_cache_key_t *) x;
249 gnix_mr_cache_key_t *to_compare = (gnix_mr_cache_key_t *) y;
250
251 if (to_compare->address == to_insert->address)
252 return 0;
253
254 /* to the left */
255 if (to_insert->address < to_compare->address)
256 return -1;
257
258 /* to the right */
259 return 1;
260 }
261
262 /**
263 * Helper function for matching the exact key entry
264 *
265 * @param entry memory registration cache key
266 * @param to_match memory registration cache key
267 * @return 1 if match, otherwise 0
268 */
__match_exact_key(gnix_mr_cache_key_t * entry,gnix_mr_cache_key_t * to_match)269 static inline int __match_exact_key(
270 gnix_mr_cache_key_t *entry,
271 gnix_mr_cache_key_t *to_match)
272 {
273 return entry->address == to_match->address &&
274 entry->length == to_match->length;
275 }
276
277 /**
278 * dlist search function for matching the exact memory registration key
279 *
280 * @param entry memory registration cache entry
281 * @param match memory registration cache key
282 * @return 1 if match, otherwise 0
283 */
__mr_exact_key(struct dlist_entry * entry,const void * match)284 static inline int __mr_exact_key(struct dlist_entry *entry,
285 const void *match)
286 {
287 gnix_mr_cache_entry_t *x = container_of(entry,
288 gnix_mr_cache_entry_t,
289 siblings);
290
291 gnix_mr_cache_key_t *y = (gnix_mr_cache_key_t *) match;
292
293 return __match_exact_key(&x->key, y);
294 }
295
296
297 /**
298 * Helper function to determine if one key subsumes another
299 *
300 * @param x gnix_mr_cache_key
301 * @param y gnix_mr_cache_key
302 * @return 1 if x subsumes y, 0 otherwise
303 */
__can_subsume(gnix_mr_cache_key_t * x,gnix_mr_cache_key_t * y)304 static inline int __can_subsume(
305 gnix_mr_cache_key_t *x,
306 gnix_mr_cache_key_t *y)
307 {
308 return (x->address <= y->address) &&
309 ((x->address + x->length) >=
310 (y->address + y->length));
311 }
312
__attach_retired_entries_to_registration(gnix_mr_cache_t * cache,struct dlist_entry * retired_entries,gnix_mr_cache_entry_t * parent)313 static inline void __attach_retired_entries_to_registration(
314 gnix_mr_cache_t *cache,
315 struct dlist_entry *retired_entries,
316 gnix_mr_cache_entry_t *parent)
317 {
318 gnix_mr_cache_entry_t *entry, *tmp;
319
320 dlist_for_each_safe(retired_entries, entry, tmp, siblings) {
321 dlist_remove(&entry->siblings);
322 dlist_insert_tail(&entry->siblings,
323 &parent->children);
324 if (!dlist_empty(&entry->children)) {
325 /* move the entry's children to the sibling tree
326 * and decrement the reference count */
327 dlist_splice_tail(&parent->children,
328 &entry->children);
329 __mr_cache_entry_put(cache, entry);
330 }
331 }
332
333 if (!dlist_empty(retired_entries)) {
334 GNIX_FATAL(FI_LOG_MR, "retired_entries not empty\n");
335 }
336
337 __mr_cache_entry_get(cache, parent);
338 }
339
__remove_sibling_entries_from_tree(gnix_mr_cache_t * cache,struct dlist_entry * list,RbtHandle tree)340 static inline void __remove_sibling_entries_from_tree(
341 gnix_mr_cache_t *cache,
342 struct dlist_entry *list,
343 RbtHandle tree)
344 {
345 RbtStatus rc;
346 RbtIterator iter;
347 gnix_mr_cache_entry_t *entry;
348
349 dlist_for_each(list, entry, siblings)
350 {
351 GNIX_DEBUG(FI_LOG_MR,
352 "removing key from tree, key=%llx:%llx\n",
353 entry->key.address, entry->key.length);
354 iter = rbtFind(tree, &entry->key);
355 if (OFI_UNLIKELY(!iter)) {
356 GNIX_FATAL(FI_LOG_MR, "key not found\n");
357 }
358
359 rc = rbtErase(tree, iter);
360 if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
361 GNIX_FATAL(FI_LOG_MR,
362 "could not remove entry from tree\n");
363 }
364 }
365 }
366
367 /**
368 * Pushes an entry into the LRU cache. No limits are maintained here as
369 * the hard_stale_limit attr value will directly limit the lru size
370 *
371 * @param[in] cache a memory registration cache object
372 * @param[in] entry a memory registration cache entry
373 *
374 * @return FI_SUCCESS, always
375 */
__mr_cache_lru_enqueue(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)376 static inline int __mr_cache_lru_enqueue(
377 gnix_mr_cache_t *cache,
378 gnix_mr_cache_entry_t *entry)
379 {
380 dlist_insert_tail(&entry->lru_entry, &cache->lru_head);
381
382 return FI_SUCCESS;
383 }
384
385 /**
386 * Pops an registration cache entry from the lru cache.
387 *
388 * @param[in] cache a memory registration cache
389 * @param[in] entry a memory registration cache entry
390 *
391 * @return FI_SUCCESS, on success
392 * @return -FI_ENOENT, on empty LRU
393 */
__mr_cache_lru_dequeue(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t ** entry)394 static inline int __mr_cache_lru_dequeue(
395 gnix_mr_cache_t *cache,
396 gnix_mr_cache_entry_t **entry)
397 {
398 gnix_mr_cache_entry_t *ret;
399
400 ret = dlist_first_entry(&cache->lru_head,
401 gnix_mr_cache_entry_t, lru_entry);
402 if (OFI_UNLIKELY(!ret)) { /* we check list_empty before calling */
403 *entry = NULL;
404 return -FI_ENOENT;
405 }
406
407 /* remove entry from the list */
408 *entry = ret;
409 dlist_remove(&ret->lru_entry);
410
411 return FI_SUCCESS;
412 }
413
414 /**
415 * Removes a particular registration cache entry from the lru cache.
416 *
417 * @param[in] cache a memory registration cache
418 * @param[in] entry a memory registration cache entry
419 *
420 * @return FI_SUCCESS, on success
421 * @return -FI_ENOENT, on empty LRU
422 */
__mr_cache_lru_remove(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)423 static inline int __mr_cache_lru_remove(
424 gnix_mr_cache_t *cache,
425 gnix_mr_cache_entry_t *entry)
426 {
427 /* Could do some error checking to see if in cache */
428
429 dlist_remove(&entry->lru_entry);
430
431 return FI_SUCCESS;
432 }
433
434 /**
435 * Remove entries that have been unmapped as indicated by the notifer
436 *
437 * @param[in] cache a memory registration cache
438 *
439 * @return nothing
440 */
441 static bool __notifier_warned = false;
442 static void
__clear_notifier_events(gnix_mr_cache_t * cache)443 __clear_notifier_events(gnix_mr_cache_t *cache)
444 {
445 int ret;
446 gnix_mr_cache_entry_t *entry;
447 struct gnix_smrn_context *context;
448 RbtIterator iter;
449
450 if (!cache->attr.smrn) {
451 return;
452 }
453
454 if (!cache->attr.lazy_deregistration) {
455 return;
456 }
457
458 ret = _gnix_smrn_get_event(cache->attr.smrn,
459 &cache->rq, &context);
460 while (ret == FI_SUCCESS) {
461 entry = container_of(context,
462 struct gnix_mr_cache_entry, context);
463 switch (__entry_get_state(entry)) {
464 case GNIX_CES_INUSE:
465 /* First, warn that this might be a
466 * problem.*/
467 if ((__notifier_warned == false) &&
468 !__entry_is_merged(entry)) {
469 GNIX_WARN(FI_LOG_MR,
470 "Registered memory region"
471 " includes unmapped pages."
472 " Have you freed memory"
473 " without closing the memory"
474 " region?\n");
475 __notifier_warned = true;
476 }
477
478 GNIX_DEBUG(FI_LOG_MR,
479 "Marking unmapped entry (%p)"
480 " as retired %llx:%llx\n", entry,
481 entry->key.address,
482 entry->key.length);
483
484 __entry_set_unmapped(entry);
485
486 if (__entry_is_retired(entry)) {
487 /* Nothing to do */
488 break;
489 }
490
491 /* Retire this entry (remove from
492 * inuse tree) */
493
494 __entry_set_retired(entry);
495 iter = rbtFind(cache->inuse.rb_tree,
496 &entry->key);
497 if (OFI_LIKELY(iter != NULL)) {
498 ret = rbtErase(cache->inuse.rb_tree,
499 iter);
500 if (ret != RBT_STATUS_OK) {
501 GNIX_FATAL(FI_LOG_MR,
502 "Unmapped entry"
503 " could not be"
504 " removed from"
505 " in usetree.\n");
506 }
507 } else {
508 /* The only way we should get
509 * here is if we're in the
510 * middle of retiring this
511 * entry. Not sure if this
512 * is worth a separate
513 * warning from the one
514 * above. */
515 }
516
517 break;
518 case GNIX_CES_STALE:
519 __entry_set_unmapped(entry);
520 iter = rbtFind(cache->stale.rb_tree,
521 &entry->key);
522 if (!iter) {
523 break;
524 }
525
526 ret = rbtErase(cache->stale.rb_tree, iter);
527 if (ret != RBT_STATUS_OK) {
528 GNIX_FATAL(FI_LOG_MR,
529 "Unmapped entry could"
530 " not be removed "
531 " from stale tree.\n");
532 }
533
534 GNIX_DEBUG(FI_LOG_MR, "Removed unmapped entry"
535 " (%p) from stale tree %llx:%llx\n",
536 entry, entry->key.address,
537 entry->key.length);
538
539 if (__mr_cache_lru_remove(cache, entry) == FI_SUCCESS) {
540 GNIX_DEBUG(FI_LOG_MR, "Removed"
541 " unmapped entry (%p)"
542 " from lru list %llx:%llx\n",
543 entry, entry->key.address,
544 entry->key.length);
545
546 ofi_atomic_dec32(&cache->stale.elements);
547
548 } else {
549 GNIX_WARN(FI_LOG_MR, "Failed to remove"
550 " unmapped entry"
551 " from lru list (%p) %p\n",
552 entry, iter);
553 }
554
555 __mr_cache_entry_destroy(cache, entry);
556
557 break;
558 default:
559 GNIX_FATAL(FI_LOG_MR,
560 "Unmapped entry (%p) in incorrect"
561 " state: %d\n",
562 entry, entry->state);
563 }
564
565 ret = _gnix_smrn_get_event(cache->attr.smrn,
566 &cache->rq, &context);
567 }
568 if (ret != -FI_EAGAIN) {
569 /* Should we do something else here? */
570 GNIX_WARN(FI_LOG_MR,
571 "_gnix_smrn_get_event returned error: %s\n",
572 fi_strerror(-ret));
573 }
574
575 return;
576 }
577
578 /**
579 * Start monitoring a memory region
580 *
581 * @param[in] cache a memory registration cache
582 * @param[in] entry a memory registration entry
583 *
584 * @return return code from _gnix_smrn_monitor
585 */
586 static int
__notifier_monitor(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)587 __notifier_monitor(gnix_mr_cache_t *cache,
588 gnix_mr_cache_entry_t *entry)
589 {
590
591 if (!cache->attr.lazy_deregistration) {
592 return FI_SUCCESS;
593 }
594
595 if (cache->attr.smrn == NULL) {
596 return FI_SUCCESS;
597 }
598
599 GNIX_DEBUG(FI_LOG_MR, "monitoring entry=%p %llx:%llx\n", entry,
600 entry->key.address, entry->key.length);
601
602 return _gnix_smrn_monitor(cache->attr.smrn,
603 &cache->rq,
604 (void *) entry->key.address,
605 entry->key.length,
606 (uint64_t) &entry->context,
607 &entry->context);
608 }
609
610 /**
611 * Stop monitoring a memory region
612 *
613 * @param[in] cache a memory registration cache
614 * @param[in] entry a memory registration entry
615 *
616 * @return nothing
617 */
618 static void
__notifier_unmonitor(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)619 __notifier_unmonitor(gnix_mr_cache_t *cache,
620 gnix_mr_cache_entry_t *entry)
621 {
622 int rc;
623
624 if (!cache->attr.lazy_deregistration) {
625 return;
626 }
627
628 if (cache->attr.smrn == NULL) {
629 return;
630 }
631
632 __clear_notifier_events(cache);
633
634 if (!__entry_is_unmapped(entry)) {
635 GNIX_DEBUG(FI_LOG_MR, "unmonitoring entry=%p (state=%d)\n",
636 entry, entry->state);
637 rc = _gnix_smrn_unmonitor(cache->attr.smrn,
638 (uint64_t) &entry->context,
639 &entry->context);
640 if (rc != FI_SUCCESS) {
641 /* This probably is okay (ESRCH), because the
642 * memory could have been unmapped in the
643 * interim, so clear the notifier events
644 * again */
645 GNIX_DEBUG(FI_LOG_MR,
646 "failed to unmonitor memory (entry=%p),"
647 " so clear notifier events again\n",
648 entry, fi_strerror(-rc));
649
650 __clear_notifier_events(cache);
651 }
652 }
653 }
654
655 /**
656 * Destroys the memory registration cache entry and deregisters the memory
657 * region with uGNI
658 *
659 * @param[in] entry a memory registration cache entry
660 *
661 * @return return code from callbacks
662 */
__mr_cache_entry_destroy(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)663 static inline int __mr_cache_entry_destroy(gnix_mr_cache_t *cache,
664 gnix_mr_cache_entry_t *entry)
665 {
666 int rc;
667
668 rc = cache->attr.dereg_callback(entry->mr,
669 cache->attr.destruct_context);
670 if (!rc) {
671 /* Should we bother with this check? If we don't, the
672 * only difference it that __clear_notifier_events
673 * will be called one additional time. */
674 if (!__entry_is_unmapped(entry)) {
675 __notifier_unmonitor(cache, entry);
676 }
677 __entry_reset_state(entry);
678
679 rc = cache->attr.destruct_callback(cache->attr.dereg_context);
680 if (!rc)
681 free(entry);
682 } else {
683 GNIX_INFO(FI_LOG_MR, "failed to deregister memory"
684 " region with callback, "
685 "cache_entry=%p ret=%i\n", entry, rc);
686 }
687
688 return rc;
689 }
690
__insert_entry_into_stale(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)691 static inline int __insert_entry_into_stale(
692 gnix_mr_cache_t *cache,
693 gnix_mr_cache_entry_t *entry)
694 {
695 RbtStatus rc;
696 int ret = 0;
697
698 if (__entry_is_unmapped(entry)) {
699 GNIX_DEBUG(FI_LOG_MR, "entry (%p) unmapped, not inserting"
700 " into stale %llx:%llx", entry,
701 entry->key.address, entry->key.length);
702 /* Should we return some other value? */
703 return ret;
704 }
705
706 rc = rbtInsert(cache->stale.rb_tree,
707 &entry->key,
708 entry);
709 if (rc != RBT_STATUS_OK) {
710 GNIX_ERR(FI_LOG_MR,
711 "could not insert into stale rb tree,"
712 " rc=%d key.address=%llx key.length=%llx entry=%p",
713 rc,
714 entry->key.address,
715 entry->key.length,
716 entry);
717
718 ret = __mr_cache_entry_destroy(cache, entry);
719 } else {
720 GNIX_DEBUG(FI_LOG_MR,
721 "inserted key=%llx:%llx into stale\n",
722 entry->key.address, entry->key.length);
723
724 __mr_cache_lru_enqueue(cache, entry);
725 ofi_atomic_inc32(&cache->stale.elements);
726 switch (__entry_get_state(entry)) {
727 case GNIX_CES_INUSE:
728 __entry_set_state(entry, GNIX_CES_STALE);
729 break;
730 default:
731 GNIX_FATAL(FI_LOG_MR,
732 "stale entry (%p) %llx:%llx in bad"
733 " state (%d)\n", entry,
734 entry->key.address, entry->key.length,
735 entry->state);
736 }
737 }
738
739 return ret;
740 }
741
__resolve_stale_entry_collision(gnix_mr_cache_t * cache,RbtIterator found,gnix_mr_cache_entry_t * entry)742 static inline void __resolve_stale_entry_collision(
743 gnix_mr_cache_t *cache,
744 RbtIterator found,
745 gnix_mr_cache_entry_t *entry)
746 {
747 RbtStatus __attribute__((unused)) rc;
748 gnix_mr_cache_entry_t *c_entry, *tmp;
749 gnix_mr_cache_key_t *c_key;
750 int ret;
751 DLIST_HEAD(to_destroy);
752 RbtIterator iter = found;
753 int add_new_entry = 1, cmp;
754
755 GNIX_TRACE(FI_LOG_MR, "\n");
756
757 GNIX_DEBUG(FI_LOG_MR,
758 "resolving collisions with entry (%p) %llx:%llx\n",
759 entry, entry->key.address, entry->key.length);
760
761 while (iter) {
762 rbtKeyValue(cache->stale.rb_tree, iter, (void **) &c_key,
763 (void **) &c_entry);
764
765 cmp = __find_overlapping_addr(&entry->key, c_key);
766 if (cmp != 0)
767 break;
768
769 if (__can_subsume(&entry->key, c_key) ||
770 (entry->key.length > c_key->length)) {
771 GNIX_DEBUG(FI_LOG_MR,
772 "adding stale entry (%p) to destroy list,"
773 " key=%llx:%llx\n", c_entry,
774 c_key->address, c_key->length);
775 dlist_insert_tail(&c_entry->siblings, &to_destroy);
776 } else {
777 add_new_entry = 0;
778 }
779
780 iter = rbtNext(cache->stale.rb_tree, iter);
781 }
782
783 /* TODO I can probably do this in a single sweep, avoiding a second
784 * pass and incurring n lg n removal time
785 */
786 dlist_for_each_safe(&to_destroy, c_entry, tmp, siblings)
787 {
788 GNIX_DEBUG(FI_LOG_MR, "removing key from tree, entry %p"
789 " key=%llx:%llx\n", c_entry,
790 c_entry->key.address, c_entry->key.length);
791 iter = rbtFind(cache->stale.rb_tree, &c_entry->key);
792 if (OFI_UNLIKELY(!iter)) {
793 GNIX_FATAL(FI_LOG_MR, "key not found\n");
794 }
795
796 rc = rbtErase(cache->stale.rb_tree,
797 iter);
798 if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
799 GNIX_FATAL(FI_LOG_MR,
800 "could not remove entry from tree\n");
801 }
802
803 if (__mr_cache_lru_remove(cache, c_entry) != FI_SUCCESS) {
804 GNIX_WARN(FI_LOG_MR, "Failed to remove entry"
805 " from lru list (%p)\n",
806 c_entry);
807 }
808 ofi_atomic_dec32(&cache->stale.elements);
809 dlist_remove(&c_entry->siblings);
810 __mr_cache_entry_destroy(cache, c_entry);
811 }
812 if (OFI_UNLIKELY(!dlist_empty(&to_destroy))) {
813 GNIX_FATAL(FI_LOG_MR, "to_destroy not empty\n");
814 }
815
816 if (add_new_entry) {
817 ret = __insert_entry_into_stale(cache, entry);
818 if (ret) {
819 GNIX_FATAL(FI_LOG_MR,
820 "Failed to insert subsumed MR "
821 " entry (%p) into stale list\n",
822 entry);
823 }
824 } else {
825 /* stale entry is larger than this one
826 * so lets just toss this entry out
827 */
828 GNIX_DEBUG(FI_LOG_MR,
829 "larger entry already exists, "
830 "to_destroy=%llx:%llx\n",
831 entry->key.address, entry->key.length);
832
833 ret = __mr_cache_entry_destroy(cache, entry);
834 if (ret) {
835 GNIX_ERR(FI_LOG_MR,
836 "failed to destroy a "
837 "registration, entry=%p grc=%d\n",
838 c_entry, ret);
839 }
840 }
841 }
842
843 /**
844 * Increments the reference count on a memory registration cache entry
845 *
846 * @param[in] cache gnix memory registration cache
847 * @param[in] entry a memory registration cache entry
848 *
849 * @return reference count for the registration
850 */
__mr_cache_entry_get(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)851 static inline int __mr_cache_entry_get(
852 gnix_mr_cache_t *cache,
853 gnix_mr_cache_entry_t *entry)
854 {
855 GNIX_TRACE(FI_LOG_MR, "\n");
856
857 GNIX_DEBUG(FI_LOG_MR,
858 "Up ref cnt on entry %p\n", entry);
859 return ofi_atomic_inc32(&entry->ref_cnt);
860 }
861
862 /**
863 * Decrements the reference count on a memory registration cache entry
864 *
865 * @param[in] cache gnix memory registration cache
866 * @param[in] entry a memory registration cache entry
867 *
868 * @return return code from dereg callback
869 */
__mr_cache_entry_put(gnix_mr_cache_t * cache,gnix_mr_cache_entry_t * entry)870 static inline int __mr_cache_entry_put(
871 gnix_mr_cache_t *cache,
872 gnix_mr_cache_entry_t *entry)
873 {
874 RbtIterator iter;
875 int rc;
876 gni_return_t grc = GNI_RC_SUCCESS;
877 RbtIterator found;
878 gnix_mr_cache_entry_t *parent = NULL;
879 struct dlist_entry *next;
880
881 GNIX_TRACE(FI_LOG_MR, "\n");
882
883 if (cache->attr.lazy_deregistration) {
884 __clear_notifier_events(cache);
885 }
886
887 GNIX_DEBUG(FI_LOG_MR,
888 "Decrease ref cnt on entry %p\n", entry);
889 if (ofi_atomic_dec32(&entry->ref_cnt) == 0) {
890 next = entry->siblings.next;
891 dlist_remove(&entry->children);
892 dlist_remove(&entry->siblings);
893
894 /* if this is the last child to deallocate,
895 * release the reference to the parent
896 */
897 if (next != &entry->siblings && dlist_empty(next)) {
898 parent = container_of(next, gnix_mr_cache_entry_t,
899 children);
900 grc = __mr_cache_entry_put(cache, parent);
901 if (OFI_UNLIKELY(grc != GNI_RC_SUCCESS)) {
902 GNIX_ERR(FI_LOG_MR,
903 "failed to release reference to parent, "
904 "parent=%p refs=%d\n",
905 parent,
906 ofi_atomic_get32(&parent->ref_cnt));
907 }
908 }
909
910 ofi_atomic_dec32(&cache->inuse.elements);
911
912 if (!__entry_is_retired(entry)) {
913 iter = rbtFind(cache->inuse.rb_tree, &entry->key);
914 if (OFI_UNLIKELY(!iter)) {
915 GNIX_ERR(FI_LOG_MR,
916 "failed to find entry in the inuse cache\n");
917 } else {
918 rc = rbtErase(cache->inuse.rb_tree, iter);
919 if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
920 GNIX_ERR(FI_LOG_MR,
921 "failed to erase lru entry"
922 " from stale tree\n");
923 }
924 }
925 }
926
927 /* if we are doing lazy dereg and the entry
928 * isn't retired, put it in the stale cache
929 */
930 if (cache->attr.lazy_deregistration &&
931 !(__entry_is_retired(entry))) {
932 GNIX_DEBUG(FI_LOG_MR,
933 "moving key %llx:%llx to stale\n",
934 entry->key.address, entry->key.length);
935
936 found = rbtFindLeftmost(cache->stale.rb_tree,
937 &entry->key, __find_overlapping_addr);
938 if (found) {
939 /* one or more stale entries would overlap with this
940 * new entry. We need to resolve these collisions by dropping
941 * registrations
942 */
943 __resolve_stale_entry_collision(cache, found, entry);
944 } else {
945 /* if not found, ... */
946 grc = __insert_entry_into_stale(cache, entry);
947 }
948 } else {
949 /* if retired or not using lazy registration */
950 GNIX_DEBUG(FI_LOG_MR,
951 "destroying entry, key=%llx:%llx\n",
952 entry->key.address, entry->key.length);
953
954 grc = __mr_cache_entry_destroy(cache, entry);
955 }
956
957 if (OFI_UNLIKELY(grc != GNI_RC_SUCCESS)) {
958 GNIX_INFO(FI_LOG_MR, "dereg callback returned '%s'\n",
959 gni_err_str[grc]);
960 }
961 }
962
963
964 return grc;
965 }
966
967 /**
968 * Checks the sanity of cache attributes
969 *
970 * @param[in] attr attributes structure to be checked
971 * @return FI_SUCCESS if the attributes are valid
972 * -FI_EINVAL if the attributes are invalid
973 */
__check_mr_cache_attr_sanity(gnix_mr_cache_attr_t * attr)974 static inline int __check_mr_cache_attr_sanity(gnix_mr_cache_attr_t *attr)
975 {
976 /* 0 < attr->hard_reg_limit < attr->soft_reg_limit */
977 if (attr->hard_reg_limit > 0 &&
978 attr->hard_reg_limit < attr->soft_reg_limit)
979 return -FI_EINVAL;
980
981 /* callbacks must be provided */
982 if (!attr->reg_callback || !attr->dereg_callback ||
983 !attr->destruct_callback)
984 return -FI_EINVAL;
985
986 /* valid otherwise */
987 return FI_SUCCESS;
988 }
989
_gnix_mr_cache_init(gnix_mr_cache_t ** cache,gnix_mr_cache_attr_t * attr)990 int _gnix_mr_cache_init(
991 gnix_mr_cache_t **cache,
992 gnix_mr_cache_attr_t *attr)
993 {
994 gnix_mr_cache_attr_t *cache_attr = &__default_mr_cache_attr;
995 gnix_mr_cache_t *cache_p;
996 int rc;
997
998 GNIX_TRACE(FI_LOG_MR, "\n");
999
1000 /* if the provider asks us to use their attributes, are they sane? */
1001 if (attr) {
1002 if (__check_mr_cache_attr_sanity(attr) != FI_SUCCESS)
1003 return -FI_EINVAL;
1004
1005 cache_attr = attr;
1006 }
1007
1008 cache_p = (gnix_mr_cache_t *) calloc(1, sizeof(*cache_p));
1009 if (!cache_p)
1010 return -FI_ENOMEM;
1011
1012 /* save the attribute values */
1013 memcpy(&cache_p->attr, cache_attr, sizeof(*cache_attr));
1014
1015 /* list is used because entries can be removed from the stale list if
1016 * a user might call register on a stale entry's memory region
1017 */
1018 dlist_init(&cache_p->lru_head);
1019
1020 /* set up inuse tree */
1021 cache_p->inuse.rb_tree = rbtNew(__mr_cache_key_comp);
1022 if (!cache_p->inuse.rb_tree) {
1023 rc = -FI_ENOMEM;
1024 goto err_inuse;
1025 }
1026
1027 /* if using lazy deregistration, set up stale tree */
1028 if (cache_p->attr.lazy_deregistration) {
1029 cache_p->stale.rb_tree = rbtNew(__mr_cache_key_comp);
1030 if (!cache_p->stale.rb_tree) {
1031 rc = -FI_ENOMEM;
1032 goto err_stale;
1033 }
1034 }
1035
1036 /* initialize the element counts. If we are reinitializing a dead cache,
1037 * destroy will have already set the element counts
1038 */
1039 if (cache_p->state == GNIX_MRC_STATE_UNINITIALIZED) {
1040 ofi_atomic_initialize32(&cache_p->inuse.elements, 0);
1041 ofi_atomic_initialize32(&cache_p->stale.elements, 0);
1042 }
1043
1044 cache_p->hits = 0;
1045 cache_p->misses = 0;
1046
1047 cache_p->state = GNIX_MRC_STATE_READY;
1048
1049 dlist_init(&cache_p->rq.list);
1050 dlist_init(&cache_p->rq.entry);
1051 fastlock_init(&cache_p->rq.lock);
1052
1053 *cache = cache_p;
1054
1055 return FI_SUCCESS;
1056
1057 err_stale:
1058 rbtDelete(cache_p->inuse.rb_tree);
1059 cache_p->inuse.rb_tree = NULL;
1060 err_inuse:
1061 free(cache_p);
1062
1063 return rc;
1064 }
1065
_gnix_mr_cache_destroy(gnix_mr_cache_t * cache)1066 int _gnix_mr_cache_destroy(gnix_mr_cache_t *cache)
1067 {
1068 if (cache->state != GNIX_MRC_STATE_READY)
1069 return -FI_EINVAL;
1070
1071 GNIX_TRACE(FI_LOG_MR, "\n");
1072
1073 /*
1074 * Remove all of the stale entries from the cache
1075 */
1076 _gnix_mr_cache_flush(cache);
1077
1078 /*
1079 * if there are still elements in the cache after the flush,
1080 * then someone forgot to deregister memory.
1081 * We probably shouldn't destroy the cache at this point.
1082 */
1083 if (ofi_atomic_get32(&cache->inuse.elements) != 0) {
1084 return -FI_EAGAIN;
1085 }
1086
1087 /* destroy the tree */
1088 rbtDelete(cache->inuse.rb_tree);
1089 cache->inuse.rb_tree = NULL;
1090
1091 /* stale will been flushed already, so just destroy the tree */
1092 if (cache->attr.lazy_deregistration) {
1093 rbtDelete(cache->stale.rb_tree);
1094 cache->stale.rb_tree = NULL;
1095 }
1096
1097 cache->state = GNIX_MRC_STATE_DEAD;
1098 free(cache);
1099
1100 return FI_SUCCESS;
1101 }
1102
__mr_cache_flush(gnix_mr_cache_t * cache,int flush_count)1103 int __mr_cache_flush(gnix_mr_cache_t *cache, int flush_count) {
1104 int rc;
1105 RbtIterator iter;
1106 gnix_mr_cache_entry_t *entry;
1107 int destroyed = 0;
1108
1109 GNIX_TRACE(FI_LOG_MR, "\n");
1110
1111 GNIX_DEBUG(FI_LOG_MR, "starting flush on memory registration cache\n");
1112
1113 /* flushes are unnecessary for caches without lazy deregistration */
1114 if (!cache->attr.lazy_deregistration)
1115 return FI_SUCCESS;
1116
1117 while (!dlist_empty(&cache->lru_head)) {
1118
1119 if (flush_count >= 0 && flush_count == destroyed)
1120 break;
1121
1122 rc = __mr_cache_lru_dequeue(cache, &entry);
1123 if (OFI_UNLIKELY(rc != FI_SUCCESS)) {
1124 GNIX_ERR(FI_LOG_MR,
1125 "list may be corrupt, no entries from lru pop\n");
1126 break;
1127 }
1128
1129 GNIX_DEBUG(FI_LOG_MR, "attempting to flush key %llx:%llx\n",
1130 entry->key.address, entry->key.length);
1131 iter = rbtFind(cache->stale.rb_tree, &entry->key);
1132 if (OFI_UNLIKELY(!iter)) {
1133 GNIX_ERR(FI_LOG_MR,
1134 "lru entries MUST be present in the cache,"
1135 " could not find entry (%p) in stale tree"
1136 " %llx:%llx\n",
1137 entry, entry->key.address, entry->key.length);
1138 break;
1139 }
1140
1141 rc = rbtErase(cache->stale.rb_tree, iter);
1142 if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1143 GNIX_ERR(FI_LOG_MR,
1144 "failed to erase lru entry from stale tree\n");
1145 break;
1146 }
1147
1148 __mr_cache_entry_destroy(cache, entry);
1149 entry = NULL;
1150 ++destroyed;
1151 }
1152
1153 GNIX_DEBUG(FI_LOG_MR, "flushed %i of %i entries from memory "
1154 "registration cache\n", destroyed,
1155 ofi_atomic_get32(&cache->stale.elements));
1156
1157 if (destroyed > 0) {
1158 ofi_atomic_sub32(&cache->stale.elements, destroyed);
1159 }
1160
1161 return FI_SUCCESS;
1162 }
1163
_gnix_mr_cache_flush(gnix_mr_cache_t * cache)1164 int _gnix_mr_cache_flush(gnix_mr_cache_t *cache)
1165 {
1166
1167 if (OFI_UNLIKELY(cache->state != GNIX_MRC_STATE_READY))
1168 return -FI_EINVAL;
1169
1170 __mr_cache_flush(cache, cache->attr.hard_reg_limit);
1171
1172 return FI_SUCCESS;
1173 }
1174
__mr_cache_search_inuse(gnix_mr_cache_t * cache,uint64_t address,uint64_t length,gnix_mr_cache_entry_t ** entry,gnix_mr_cache_key_t * key,struct _gnix_fi_reg_context * fi_reg_context)1175 static int __mr_cache_search_inuse(
1176 gnix_mr_cache_t *cache,
1177 uint64_t address,
1178 uint64_t length,
1179 gnix_mr_cache_entry_t **entry,
1180 gnix_mr_cache_key_t *key,
1181 struct _gnix_fi_reg_context *fi_reg_context)
1182 {
1183 int ret = FI_SUCCESS, cmp;
1184 RbtIterator iter;
1185 gnix_mr_cache_key_t *found_key, new_key;
1186 gnix_mr_cache_entry_t *found_entry;
1187 uint64_t new_end, found_end;
1188 DLIST_HEAD(retired_entries);
1189
1190 if (cache->attr.lazy_deregistration) {
1191 __clear_notifier_events(cache);
1192 }
1193
1194 /* first we need to find an entry that overlaps with this one.
1195 * care should be taken to find the left most entry that overlaps
1196 * with this entry since the entry we are searching for might overlap
1197 * many entries and the tree may be left or right balanced
1198 * at the head
1199 */
1200 iter = rbtFindLeftmost(cache->inuse.rb_tree, (void *) key,
1201 __find_overlapping_addr);
1202 if (!iter) {
1203 GNIX_DEBUG(FI_LOG_MR,
1204 "could not find key in inuse, key=%llx:%llx\n",
1205 key->address, key->length);
1206 return -FI_ENOENT;
1207 }
1208
1209 rbtKeyValue(cache->inuse.rb_tree, iter, (void **) &found_key,
1210 (void **) &found_entry);
1211
1212 GNIX_DEBUG(FI_LOG_MR,
1213 "found a key that matches the search criteria, "
1214 "found=%llx:%llx key=%llx:%llx\n",
1215 found_key->address, found_key->length,
1216 key->address, key->length);
1217
1218 /* if the entry that we've found completely subsumes
1219 * the requested entry, just return a reference to
1220 * that existing registration
1221 */
1222 if (__can_subsume(found_key, key)) {
1223 GNIX_DEBUG(FI_LOG_MR,
1224 "found an entry that subsumes the request, "
1225 "existing=%llx:%llx key=%llx:%llx entry %p\n",
1226 found_key->address, found_key->length,
1227 key->address, key->length, found_entry);
1228 *entry = found_entry;
1229 __mr_cache_entry_get(cache, found_entry);
1230
1231 cache->hits++;
1232 return FI_SUCCESS;
1233 }
1234
1235 /* otherwise, iterate from the existing entry until we can no longer
1236 * find an entry that overlaps with the new registration and remove
1237 * and retire each of those entries.
1238 */
1239 new_key.address = MIN(found_key->address, key->address);
1240 new_end = key->address + key->length;
1241 while (iter) {
1242 rbtKeyValue(cache->inuse.rb_tree, iter, (void **) &found_key,
1243 (void **) &found_entry);
1244
1245
1246 cmp = __find_overlapping_addr(found_key, key);
1247 GNIX_DEBUG(FI_LOG_MR,
1248 "candidate: key=%llx:%llx result=%d\n",
1249 found_key->address,
1250 found_key->length, cmp);
1251 if (cmp != 0)
1252 break;
1253
1254 /* compute new ending address */
1255 found_end = found_key->address + found_key->length;
1256
1257 /* mark the entry as retired */
1258 GNIX_DEBUG(FI_LOG_MR, "retiring entry, key=%llx:%llx entry %p\n",
1259 found_key->address, found_key->length, found_entry);
1260 __entry_set_retired(found_entry);
1261 dlist_insert_tail(&found_entry->siblings, &retired_entries);
1262
1263 iter = rbtNext(cache->inuse.rb_tree, iter);
1264 }
1265 /* Since our new key might fully overlap every other entry in the tree,
1266 * we need to take the maximum of the last entry and the new entry
1267 */
1268 new_key.length = MAX(found_end, new_end) - new_key.address;
1269
1270
1271 /* remove retired entries from tree */
1272 GNIX_DEBUG(FI_LOG_MR, "removing retired entries from inuse tree\n");
1273 __remove_sibling_entries_from_tree(cache,
1274 &retired_entries, cache->inuse.rb_tree);
1275
1276 /* create new registration */
1277 GNIX_DEBUG(FI_LOG_MR,
1278 "creating a new merged registration, key=%llx:%llx\n",
1279 new_key.address, new_key.length);
1280 ret = __mr_cache_create_registration(cache,
1281 new_key.address, new_key.length,
1282 entry, &new_key, fi_reg_context);
1283 if (ret) {
1284 /* If we get here, one of two things have happened.
1285 * Either some part of the new merged registration was
1286 * unmapped (i.e., freed by user) or the merged
1287 * registration failed for some other reason (probably
1288 * GNI_RC_ERROR_RESOURCE). The first case is a user
1289 * error (which they should have been warned about by
1290 * the notifier), and the second case is always
1291 * possible. Neither case is a problem. The entries
1292 * above have been retired, and here we return the
1293 * error */
1294 GNIX_DEBUG(FI_LOG_MR,
1295 "failed to create merged registration, key=",
1296 new_key.address, new_key.length);
1297 return ret;
1298 }
1299 GNIX_DEBUG(FI_LOG_MR,
1300 "created a new merged registration, key=%llx:%llx entry %p\n",
1301 new_key.address, new_key.length, *entry);
1302
1303 __entry_set_merged(*entry);
1304
1305 /* move retired entries to the head of the new entry's child list */
1306 if (!dlist_empty(&retired_entries)) {
1307 __attach_retired_entries_to_registration(cache,
1308 &retired_entries, *entry);
1309 }
1310
1311 cache->misses++;
1312
1313 return ret;
1314 }
1315
__mr_cache_search_stale(gnix_mr_cache_t * cache,uint64_t address,uint64_t length,gnix_mr_cache_entry_t ** entry,gnix_mr_cache_key_t * key,struct _gnix_fi_reg_context * fi_reg_context)1316 static int __mr_cache_search_stale(
1317 gnix_mr_cache_t *cache,
1318 uint64_t address,
1319 uint64_t length,
1320 gnix_mr_cache_entry_t **entry,
1321 gnix_mr_cache_key_t *key,
1322 struct _gnix_fi_reg_context *fi_reg_context)
1323 {
1324 int ret;
1325 RbtStatus rc;
1326 RbtIterator iter;
1327 gnix_mr_cache_key_t *mr_key;
1328 gnix_mr_cache_entry_t *mr_entry, *tmp;
1329
1330 if (cache->attr.lazy_deregistration) {
1331 __clear_notifier_events(cache);
1332 }
1333
1334 GNIX_DEBUG(FI_LOG_MR, "searching for stale entry, key=%llx:%llx\n",
1335 key->address, key->length);
1336
1337 iter = rbtFindLeftmost(cache->stale.rb_tree, (void *) key,
1338 __find_overlapping_addr);
1339 if (!iter)
1340 return -FI_ENOENT;
1341
1342 rbtKeyValue(cache->stale.rb_tree, iter, (void **) &mr_key,
1343 (void **) &mr_entry);
1344
1345 GNIX_DEBUG(FI_LOG_MR,
1346 "found a matching entry, found=%llx:%llx key=%llx:%llx\n",
1347 mr_key->address, mr_key->length,
1348 key->address, key->length);
1349
1350
1351 /* if the entry that we've found completely subsumes
1352 * the requested entry, just return a reference to
1353 * that existing registration
1354 */
1355 if (__can_subsume(mr_key, key)) {
1356 ret = __mr_cache_search_inuse(cache, address, length,
1357 &tmp, mr_key, fi_reg_context);
1358 if (ret == FI_SUCCESS) {
1359 /* if we found an entry in the inuse tree
1360 * in this manner, it means that there was
1361 * an entry either overlapping or contiguous
1362 * with the stale entry in the inuse tree, and
1363 * a new entry has been made and saved to tmp.
1364 * The old entry (mr_entry) should be destroyed
1365 * now as it is no longer needed.
1366 */
1367 GNIX_DEBUG(FI_LOG_MR,
1368 "removing entry from stale key=%llx:%llx\n",
1369 mr_key->address, mr_key->length);
1370
1371 rc = rbtErase(cache->stale.rb_tree, iter);
1372 if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1373 GNIX_ERR(FI_LOG_MR,
1374 "failed to erase entry from stale tree\n");
1375 } else {
1376 if (__mr_cache_lru_remove(cache, mr_entry)
1377 == FI_SUCCESS) {
1378 ofi_atomic_dec32(&cache->stale.elements);
1379 } else {
1380 GNIX_WARN(FI_LOG_MR, "Failed to remove"
1381 " entry (%p) from lru list\n",
1382 mr_entry);
1383 }
1384 __mr_cache_entry_destroy(cache, mr_entry);
1385 }
1386
1387 *entry = tmp;
1388 } else {
1389 GNIX_DEBUG(FI_LOG_MR,
1390 "removing entry (%p) from stale and"
1391 " migrating to inuse, key=%llx:%llx\n",
1392 mr_entry, mr_key->address, mr_key->length);
1393 rc = rbtErase(cache->stale.rb_tree, iter);
1394 if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1395 GNIX_FATAL(FI_LOG_MR,
1396 "failed to erase entry (%p) from "
1397 " stale tree\n", mr_entry);
1398 }
1399
1400 if (__mr_cache_lru_remove(cache, mr_entry)
1401 != FI_SUCCESS) {
1402 GNIX_WARN(FI_LOG_MR, "Failed to remove"
1403 " entry (%p) from lru list\n",
1404 mr_entry);
1405 }
1406
1407 ofi_atomic_dec32(&cache->stale.elements);
1408
1409 /* if we made it to this point, there weren't
1410 * any entries in the inuse tree that would
1411 * have overlapped with this entry
1412 */
1413 rc = rbtInsert(cache->inuse.rb_tree,
1414 &mr_entry->key, mr_entry);
1415 if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1416 GNIX_FATAL(FI_LOG_MR,
1417 "failed to insert entry into"
1418 "inuse tree\n");
1419 }
1420
1421 ofi_atomic_set32(&mr_entry->ref_cnt, 1);
1422 ofi_atomic_inc32(&cache->inuse.elements);
1423
1424 *entry = mr_entry;
1425 }
1426
1427 return FI_SUCCESS;
1428 }
1429
1430 GNIX_DEBUG(FI_LOG_MR,
1431 "could not use matching entry, "
1432 "found=%llx:%llx\n",
1433 mr_key->address, mr_key->length);
1434
1435 return -FI_ENOENT;
1436 }
1437
__mr_cache_create_registration(gnix_mr_cache_t * cache,uint64_t address,uint64_t length,gnix_mr_cache_entry_t ** entry,gnix_mr_cache_key_t * key,struct _gnix_fi_reg_context * fi_reg_context)1438 static int __mr_cache_create_registration(
1439 gnix_mr_cache_t *cache,
1440 uint64_t address,
1441 uint64_t length,
1442 gnix_mr_cache_entry_t **entry,
1443 gnix_mr_cache_key_t *key,
1444 struct _gnix_fi_reg_context *fi_reg_context)
1445 {
1446 int rc;
1447 gnix_mr_cache_entry_t *current_entry;
1448 void *handle;
1449
1450 /* if we made it here, we didn't find the entry at all */
1451 current_entry = calloc(1, sizeof(*current_entry) + cache->attr.elem_size);
1452 if (!current_entry)
1453 return -FI_ENOMEM;
1454
1455 handle = (void *) current_entry->mr;
1456
1457 dlist_init(¤t_entry->lru_entry);
1458 dlist_init(¤t_entry->children);
1459 dlist_init(¤t_entry->siblings);
1460
1461 handle = cache->attr.reg_callback(handle, (void *) address, length,
1462 fi_reg_context, cache->attr.reg_context);
1463 if (OFI_UNLIKELY(!handle)) {
1464 GNIX_INFO(FI_LOG_MR,
1465 "failed to register memory with callback\n");
1466 goto err;
1467 }
1468
1469 __entry_reset_state(current_entry);
1470
1471 /* set up the entry's key */
1472 current_entry->key.address = address;
1473 current_entry->key.length = length;
1474
1475 rc = __notifier_monitor(cache, current_entry);
1476 if (OFI_UNLIKELY(rc != FI_SUCCESS)) {
1477 GNIX_INFO(FI_LOG_MR,
1478 "failed to monitor memory with notifier\n");
1479 goto err_dereg;
1480 }
1481
1482 rc = rbtInsert(cache->inuse.rb_tree, ¤t_entry->key,
1483 current_entry);
1484 if (OFI_UNLIKELY(rc != RBT_STATUS_OK)) {
1485 GNIX_ERR(FI_LOG_MR, "failed to insert registration "
1486 "into cache, ret=%i\n", rc);
1487
1488 goto err_dereg;
1489 }
1490
1491 GNIX_DEBUG(FI_LOG_MR, "inserted key %llx:%llx into inuse %p\n",
1492 current_entry->key.address, current_entry->key.length, current_entry);
1493
1494
1495 ofi_atomic_inc32(&cache->inuse.elements);
1496 ofi_atomic_initialize32(¤t_entry->ref_cnt, 1);
1497
1498 *entry = current_entry;
1499
1500 return FI_SUCCESS;
1501
1502 err_dereg:
1503 rc = cache->attr.dereg_callback(current_entry->mr,
1504 cache->attr.dereg_context);
1505 if (OFI_UNLIKELY(rc)) {
1506 GNIX_INFO(FI_LOG_MR,
1507 "failed to deregister memory with "
1508 "callback, ret=%d\n", rc);
1509 }
1510 err:
1511 free(current_entry);
1512 return -FI_ENOMEM;
1513 }
1514
1515 /**
1516 * Function to register memory with the cache
1517 *
1518 * @param[in] cache gnix memory registration cache pointer
1519 * @param[in] mr gnix memory region descriptor pointer
1520 * @param[in] address base address of the memory region to be
1521 * registered
1522 * @param[in] length length of the memory region to be registered
1523 * @param[in] fi_reg_context fi_reg_mr API call parameters
1524 * @param[in,out] mem_hndl gni memory handle pointer to written to and
1525 * returned
1526 */
_gnix_mr_cache_register(gnix_mr_cache_t * cache,uint64_t address,uint64_t length,struct _gnix_fi_reg_context * fi_reg_context,void ** handle)1527 int _gnix_mr_cache_register(
1528 gnix_mr_cache_t *cache,
1529 uint64_t address,
1530 uint64_t length,
1531 struct _gnix_fi_reg_context *fi_reg_context,
1532 void **handle)
1533 {
1534 int ret;
1535 gnix_mr_cache_key_t key = {
1536 .address = address,
1537 .length = length,
1538 };
1539 gnix_mr_cache_entry_t *entry;
1540
1541 GNIX_TRACE(FI_LOG_MR, "\n");
1542
1543 /* fastpath inuse */
1544 ret = __mr_cache_search_inuse(cache, address, length,
1545 &entry, &key, fi_reg_context);
1546 if (ret == FI_SUCCESS)
1547 goto success;
1548
1549 /* if we shouldn't introduce any new elements, return -FI_ENOSPC */
1550 if (OFI_UNLIKELY(cache->attr.hard_reg_limit > 0 &&
1551 (ofi_atomic_get32(&cache->inuse.elements) >=
1552 cache->attr.hard_reg_limit))) {
1553 ret = -FI_ENOSPC;
1554 goto err;
1555 }
1556
1557 if (cache->attr.lazy_deregistration) {
1558 __clear_notifier_events(cache);
1559
1560 /* if lazy deregistration is in use, we can check the
1561 * stale tree
1562 */
1563 ret = __mr_cache_search_stale(cache, address, length,
1564 &entry, &key, fi_reg_context);
1565 if (ret == FI_SUCCESS) {
1566 cache->hits++;
1567 goto success;
1568 }
1569 }
1570
1571 /* If the cache is full, then flush one of the stale entries to make
1572 * room for the new entry. This works because we check above to see if
1573 * the number of inuse entries exceeds the hard reg limit
1574 */
1575 if ((ofi_atomic_get32(&cache->inuse.elements) +
1576 ofi_atomic_get32(&cache->stale.elements)) == cache->attr.hard_reg_limit)
1577 __mr_cache_flush(cache, 1);
1578
1579 ret = __mr_cache_create_registration(cache, address, length,
1580 &entry, &key, fi_reg_context);
1581 if (ret) {
1582 goto err;
1583 }
1584
1585 cache->misses++;
1586
1587 success:
1588 __entry_set_state(entry, GNIX_CES_INUSE);
1589 *handle = (void *) entry->mr;
1590
1591 return FI_SUCCESS;
1592
1593 err:
1594 return ret;
1595 }
1596
1597 /**
1598 * Function to deregister memory in the cache
1599 *
1600 * @param[in] mr gnix memory registration descriptor pointer
1601 *
1602 * @return FI_SUCCESS on success
1603 * -FI_ENOENT if there isn't an active memory registration
1604 * associated with the mr
1605 * return codes associated with dereg callback
1606 */
_gnix_mr_cache_deregister(gnix_mr_cache_t * cache,void * handle)1607 int _gnix_mr_cache_deregister(
1608 gnix_mr_cache_t *cache,
1609 void *handle)
1610 {
1611 gnix_mr_cache_entry_t *entry;
1612 gni_return_t grc;
1613
1614 GNIX_TRACE(FI_LOG_MR, "\n");
1615
1616 /* check to see if we can find the entry so that we can drop the
1617 * held reference
1618 */
1619
1620 entry = container_of(handle, gnix_mr_cache_entry_t, mr);
1621 if (__entry_get_state(entry) != GNIX_CES_INUSE) {
1622 GNIX_WARN(FI_LOG_MR, "entry (%p) in incorrect state (%d)\n",
1623 entry, entry->state);
1624 return -FI_EINVAL;
1625 }
1626
1627 GNIX_DEBUG(FI_LOG_MR, "entry found, entry=%p refs=%d\n",
1628 entry, ofi_atomic_get32(&entry->ref_cnt));
1629
1630 grc = __mr_cache_entry_put(cache, entry);
1631
1632 /* Since we check this on each deregistration, the amount of elements
1633 * over the limit should always be 1
1634 */
1635 if (ofi_atomic_get32(&cache->stale.elements) > cache->attr.hard_stale_limit)
1636 __mr_cache_flush(cache, 1);
1637
1638 return gnixu_to_fi_errno(grc);
1639 }
1640
1641