1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4  *                         University Research and Technology
5  *                         Corporation.  All rights reserved.
6  * Copyright (c) 2004-2013 The University of Tennessee and The University
7  *                         of Tennessee Research Foundation.  All rights
8  *                         reserved.
9  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10  *                         University of Stuttgart.  All rights reserved.
11  * Copyright (c) 2004-2005 The Regents of the University of California.
12  *                         All rights reserved.
13  * Copyright (c) 2006-2009 Cisco Systems, Inc.  All rights reserved.
14  * Copyright (c) 2006      Voltaire. All rights reserved.
15  * Copyright (c) 2007      Mellanox Technologies. All rights reserved.
16  * Copyright (c) 2010      IBM Corporation.  All rights reserved.
17  * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
18  *                         reserved.
19  *
20  * $COPYRIGHT$
21  *
22  * Additional copyrights may follow
23  *
24  * $HEADER$
25  */
26 
27 #define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
28 #include "opal_config.h"
29 #include "opal/align.h"
30 #include "rcache_udreg.h"
31 #include <errno.h>
32 #include <string.h>
33 #ifdef HAVE_MALLOC_H
34 #include <malloc.h>
35 #endif
36 #include "opal/mca/rcache/base/base.h"
37 #include "opal/runtime/opal_params.h"
38 #include "opal/include/opal_stdint.h"
39 #include "opal/util/sys_limits.h"
40 
41 #include <fcntl.h>
42 
43 #include <udreg_pub.h>
44 
45 #include <sys/mman.h>
46 
47 
48 static int mca_rcache_udreg_register (mca_rcache_base_module_t* rcache, void *addr,
49                                       size_t size, uint32_t flags, int32_t access_flags,
50                                       mca_rcache_base_registration_t **reg);
51 static int mca_rcache_udreg_deregister (mca_rcache_base_module_t *rcache,
52                                         mca_rcache_base_registration_t *reg);
53 static int mca_rcache_udreg_find (mca_rcache_base_module_t* rcache, void* addr,
54                                   size_t size, mca_rcache_base_registration_t **reg);
55 static void mca_rcache_udreg_finalize (mca_rcache_base_module_t *rcache);
56 static bool mca_rcache_udreg_evict (mca_rcache_base_module_t *rcache);
57 
58 static void *mca_rcache_udreg_reg_func (void *addr, uint64_t len, void *reg_context);
59 static uint32_t mca_rcache_udreg_dereg_func (void *device_data, void *dreg_context);
60 
61 
62 /*
63  *  Initializes the rcache module.
64  */
mca_rcache_udreg_module_init(mca_rcache_udreg_module_t * rcache)65 int mca_rcache_udreg_module_init (mca_rcache_udreg_module_t *rcache)
66 {
67     struct udreg_cache_attr cache_attr;
68     int urc;
69 
70     rcache->super.rcache_component = &mca_rcache_udreg_component.super;
71     rcache->super.rcache_register = mca_rcache_udreg_register;
72     rcache->super.rcache_find = mca_rcache_udreg_find;
73     rcache->super.rcache_deregister = mca_rcache_udreg_deregister;
74     /* This module relies on udreg for notification of memory release */
75     rcache->super.rcache_invalidate_range = NULL;
76     rcache->super.rcache_finalize = mca_rcache_udreg_finalize;
77 
78     cache_attr.modes = 0;
79 
80     /* Create udreg cache */
81     if (rcache->resources.use_kernel_cache) {
82         cache_attr.modes |= UDREG_CC_MODE_USE_KERNEL_CACHE;
83     }
84 
85     if (rcache->resources.use_evict_w_unreg) {
86         cache_attr.modes |= UDREG_CC_MODE_USE_EVICT_W_UNREG;
87     }
88 
89     if (mca_rcache_udreg_component.leave_pinned) {
90         cache_attr.modes |= UDREG_CC_MODE_USE_LAZY_DEREG;
91     }
92 
93     OBJ_CONSTRUCT(&rcache->lock, opal_mutex_t);
94 
95     strncpy (cache_attr.cache_name, rcache->resources.base.cache_name, UDREG_MAX_CACHENAME_LEN);
96     cache_attr.max_entries         = rcache->resources.max_entries;
97     cache_attr.debug_mode          = 0;
98     cache_attr.debug_rank          = 0;
99     cache_attr.reg_context         = rcache;
100     cache_attr.dreg_context        = rcache;
101     cache_attr.destructor_context  = rcache;
102     cache_attr.device_reg_func     = mca_rcache_udreg_reg_func;
103     cache_attr.device_dereg_func   = mca_rcache_udreg_dereg_func;
104     cache_attr.destructor_callback = NULL;
105 
106     opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
107                          "rcache/udreg: creating udreg cache with name %s", cache_attr.cache_name);
108 
109     /* attempt to create the udreg cache. this will fail if one already exists */
110     (void) UDREG_CacheCreate (&cache_attr);
111 
112     urc = UDREG_CacheAccess (rcache->resources.base.cache_name, (udreg_cache_handle_t *) &rcache->udreg_handle);
113     if (UDREG_RC_SUCCESS != urc) {
114         opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_rcache_base_framework.framework_output,
115                              "rcache/udreg: call to UDREG_CacheAccess failed with rc: %d", urc);
116         return OPAL_ERROR;
117     }
118 
119     OBJ_CONSTRUCT(&rcache->reg_list, opal_free_list_t);
120     opal_free_list_init (&rcache->reg_list, rcache->resources.base.sizeof_reg,
121                          opal_cache_line_size, OBJ_CLASS(mca_rcache_base_registration_t),
122                          0, opal_cache_line_size, 0, -1, 32, NULL, 0,
123                          NULL, NULL, NULL);
124 
125     return OPAL_SUCCESS;
126 }
127 
128 /* udreg callback functions */
mca_rcache_udreg_reg_func(void * addr,uint64_t size,void * reg_context)129 static void *mca_rcache_udreg_reg_func (void *addr, uint64_t size, void *reg_context)
130 {
131     mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) reg_context;
132     unsigned int page_size = opal_getpagesize ();
133     mca_rcache_base_registration_t *udreg_reg;
134     opal_free_list_item_t *item;
135     int rc;
136 
137     item = opal_free_list_get (&rcache_udreg->reg_list);
138     if (NULL == item) {
139         return NULL;
140     }
141 
142     udreg_reg = (mca_rcache_base_registration_t *) item;
143 
144     udreg_reg->rcache = reg_context;
145     udreg_reg->base  = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
146     udreg_reg->bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;
147     udreg_reg->ref_count = 0;
148 
149     addr = (void *) udreg_reg->base;
150     size = (uint64_t) (udreg_reg->bound - udreg_reg->base + 1);
151 
152     /* pull the flags and access flags out of the rcache module */
153     udreg_reg->access_flags = rcache_udreg->requested_access_flags;
154     udreg_reg->flags = rcache_udreg->requested_flags;
155 
156     opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
157                          "rcache/udreg: calling underlying register function for address range {%p, %p}",
158                          addr, (void *)((intptr_t) addr + size));
159     rc = rcache_udreg->resources.base.register_mem (rcache_udreg->resources.base.reg_data, udreg_reg->base, size,
160                                                     udreg_reg);
161     if (OPAL_SUCCESS != rc) {
162         opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_rcache_base_framework.framework_output,
163                              "rcache/udreg: could not register memory. rc: %d", rc);
164         opal_free_list_return (&rcache_udreg->reg_list, item);
165         /* NTH: this is the only way to get UDReg_Register to recognize a failure */
166         udreg_reg = UDREG_DEVICE_REG_FAILED;
167     }
168 
169     return udreg_reg;
170 }
171 
mca_rcache_udreg_dereg_func(void * device_data,void * dreg_context)172 static uint32_t mca_rcache_udreg_dereg_func (void *device_data, void *dreg_context)
173 {
174     mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) dreg_context;
175     mca_rcache_base_registration_t *udreg_reg = (mca_rcache_base_registration_t *) device_data;
176     int rc;
177 
178     assert (udreg_reg->ref_count == 0);
179 
180     rc = rcache_udreg->resources.base.deregister_mem (rcache_udreg->resources.base.reg_data, udreg_reg);
181     if (OPAL_LIKELY(OPAL_SUCCESS == rc)) {
182         opal_free_list_return (&rcache_udreg->reg_list,
183                                (opal_free_list_item_t *) udreg_reg);
184     }
185     /* might be worth printing out a warning if an error occurs here */
186 
187     return 0;
188 }
189 
/*
 * Ask udreg to evict from this module's cache.
 *
 * Returns true when UDREG_Evict reports UDREG_RC_SUCCESS and false for any
 * other return code.
 */
static bool mca_rcache_udreg_evict (mca_rcache_base_module_t *rcache)
{
    mca_rcache_udreg_module_t *module = (mca_rcache_udreg_module_t *) rcache;

    if (UDREG_RC_SUCCESS != UDREG_Evict (module->udreg_handle)) {
        return false;
    }

    return true;
}
198 
199 /*
200  * register memory
201  */
mca_rcache_udreg_register(mca_rcache_base_module_t * rcache,void * addr,size_t size,uint32_t flags,int32_t access_flags,mca_rcache_base_registration_t ** reg)202 static int mca_rcache_udreg_register(mca_rcache_base_module_t *rcache, void *addr,
203                                     size_t size, uint32_t flags, int32_t access_flags,
204                                     mca_rcache_base_registration_t **reg)
205 {
206     mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) rcache;
207     mca_rcache_base_registration_t *udreg_reg, *old_reg;
208     bool bypass_cache = !!(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS);
209     const unsigned int page_size = opal_getpagesize ();
210     unsigned char *base, *bound;
211     udreg_entry_t *udreg_entry = NULL;
212 
213     *reg = NULL;
214 
215     OPAL_THREAD_LOCK(&rcache_udreg->lock);
216 
217     /* we hold the lock so no other thread can modify these flags until the registration is complete */
218     rcache_udreg->requested_access_flags = access_flags;
219     rcache_udreg->requested_flags = flags;
220 
221     base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
222     bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;
223 
224     addr = base;
225     size = (size_t) (uintptr_t) (bound - base) + 1;
226 
227     if (false == bypass_cache) {
228         /* Get a udreg entry for this region */
229         do {
230             opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
231                                  "rcache/udreg: XXX registering region {%p, %p} with udreg", addr, (void *)((intptr_t) addr + size));
232             while (UDREG_RC_SUCCESS != UDREG_Register (rcache_udreg->udreg_handle, addr, size, &udreg_entry)) {
233                 /* try to remove one unused reg and retry */
234                 opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
235                                      "calling evict!");
236                 if (!mca_rcache_udreg_evict (rcache)) {
237                     opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
238                                          "rcache/udreg: could not register memory with udreg");
239                     OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
240                     return OPAL_ERR_OUT_OF_RESOURCE;
241                 }
242             }
243 
244             udreg_reg = (mca_rcache_base_registration_t *) udreg_entry->device_data;
245             if (NULL != udreg_reg && (udreg_reg->access_flags & access_flags) == access_flags) {
246                 /* sufficient access */
247                 break;
248             }
249 
250             old_reg = udreg_reg;
251 
252             if (old_reg) {
253                 /* to not confuse udreg make sure the new registration covers the same address
254                  * range as the old one. */
255                 addr = old_reg->base;
256                 size = (size_t)((intptr_t) old_reg->bound - (intptr_t) old_reg->base);
257 
258                 /* make the new access flags more permissive */
259                 access_flags |= old_reg->access_flags;
260 
261                 if (!old_reg->ref_count) {
262                     /* deregister the region before attempting to re-register */
263                     mca_rcache_udreg_dereg_func (old_reg, rcache);
264                     udreg_entry->device_data = NULL;
265                     old_reg = NULL;
266                 } else {
267                     /* ensure that mca_rcache_udreg_deregister does not call into udreg since
268                      * we are forcefully evicting the registration here */
269                     old_reg->flags |= MCA_RCACHE_FLAGS_CACHE_BYPASS | MCA_RCACHE_FLAGS_INVALID;
270                 }
271             }
272 
273             rcache_udreg->requested_access_flags = access_flags;
274 
275             /* get a new registration */
276             while (UDREG_DEVICE_REG_FAILED == (udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache))) {
277                 if (!mca_rcache_udreg_evict (rcache)) {
278                     opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
279                                          "rcache/udreg: could not register memory with udreg");
280                     OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
281                     return OPAL_ERR_OUT_OF_RESOURCE;
282                 }
283             }
284 
285             /* update the device data with the new registration */
286             udreg_entry->device_data = udreg_reg;
287         } while (0);
288     } else {
289         /* if cache bypass is requested don't use the udreg cache */
290         while (UDREG_DEVICE_REG_FAILED == (udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache))) {
291             /* try to remove one unused reg and retry */
292             if (!mca_rcache_udreg_evict (rcache)) {
293                 opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
294                                      "rcache/udreg: could not register memory");
295                 OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
296                 return OPAL_ERR_OUT_OF_RESOURCE;
297             }
298         }
299     }
300 
301     OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
302 
303     *reg = udreg_reg;
304     (void) OPAL_THREAD_ADD_FETCH32(&udreg_reg->ref_count, 1);
305     udreg_reg->rcache_context = udreg_entry;
306 
307     return OPAL_SUCCESS;
308 }
309 
/*
 * Registration lookup entry point.
 *
 * Always clears *reg and returns OPAL_ERR_NOT_FOUND; rcache, addr and size
 * are not consulted.
 */
static int mca_rcache_udreg_find (mca_rcache_base_module_t *rcache, void *addr,
                                  size_t size, mca_rcache_base_registration_t **reg)
{
    /* the lookup arguments are unused */
    (void) rcache;
    (void) addr;
    (void) size;

    *reg = NULL;

    return OPAL_ERR_NOT_FOUND;
}
316 
/*
 * Drop one reference on a registration.
 *
 * A registration that went through the udreg cache (no
 * MCA_RCACHE_FLAGS_CACHE_BYPASS flag) has the reference count of its udreg
 * entry decremented under the module lock. A registration carrying the bypass
 * flag is deregistered directly once its own reference count reaches zero.
 *
 * Always returns OPAL_SUCCESS.
 */
static int mca_rcache_udreg_deregister(mca_rcache_base_module_t *rcache,
                                       mca_rcache_base_registration_t *reg)
{
    mca_rcache_udreg_module_t *module = (mca_rcache_udreg_module_t *) rcache;
    int32_t remaining = OPAL_THREAD_ADD_FETCH32 (&reg->ref_count, -1);

    assert(remaining >= 0);

    if (reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS) {
        /* not tracked by udreg: release it ourselves with the last reference */
        if (0 == remaining) {
            mca_rcache_udreg_dereg_func (reg, rcache);
        }

        return OPAL_SUCCESS;
    }

    /* cached registration: udreg keeps its own reference count on the entry */
    OPAL_THREAD_LOCK(&module->lock);
    UDREG_DecrRefcount (module->udreg_handle, reg->rcache_context);
    OPAL_THREAD_UNLOCK(&module->lock);

    return OPAL_SUCCESS;
}
335 
/*
 * Tear down the module.
 *
 * Optionally prints the udreg hit/miss/evicted counters (when the component's
 * print_stats setting is true), then releases the udreg cache handle and
 * destructs the registration free list and the module lock.
 */
static void mca_rcache_udreg_finalize (mca_rcache_base_module_t *rcache)
{
    mca_rcache_udreg_module_t *module = (mca_rcache_udreg_module_t *) rcache;

    /* Statistic */
    if (mca_rcache_udreg_component.print_stats == true) {
        uint64_t n_hit = 0;
        uint64_t n_miss = 0;
        uint64_t n_evicted = 0;

        /* a failed query leaves the corresponding counter at 0 */
        (void) UDREG_GetStat (module->udreg_handle, UDREG_STAT_CACHE_HIT, &n_hit);
        (void) UDREG_GetStat (module->udreg_handle, UDREG_STAT_CACHE_MISS, &n_miss);
        (void) UDREG_GetStat (module->udreg_handle, UDREG_STAT_CACHE_EVICTED, &n_evicted);

        opal_output(0, "%s udreg: stats (hit/miss/evicted): %" PRIu64 "/%" PRIu64 "/%" PRIu64 "\n",
                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), n_hit, n_miss, n_evicted);
    }

    UDREG_CacheRelease (module->udreg_handle);
    OBJ_DESTRUCT(&module->reg_list);
    OBJ_DESTRUCT(&module->lock);
}
361