1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2013 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2006-2009 Cisco Systems, Inc. All rights reserved.
14 * Copyright (c) 2006 Voltaire. All rights reserved.
15 * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
16 * Copyright (c) 2010 IBM Corporation. All rights reserved.
17 * Copyright (c) 2011-2016 Los Alamos National Security, LLC. All rights
18 * reserved.
19 *
20 * $COPYRIGHT$
21 *
22 * Additional copyrights may follow
23 *
24 * $HEADER$
25 */
26
27 #define OPAL_DISABLE_ENABLE_MEM_DEBUG 1
28 #include "opal_config.h"
29 #include "opal/align.h"
30 #include "rcache_udreg.h"
31 #include <errno.h>
32 #include <string.h>
33 #ifdef HAVE_MALLOC_H
34 #include <malloc.h>
35 #endif
36 #include "opal/mca/rcache/base/base.h"
37 #include "opal/runtime/opal_params.h"
38 #include "opal/include/opal_stdint.h"
39 #include "opal/util/sys_limits.h"
40
41 #include <fcntl.h>
42
43 #include <udreg_pub.h>
44
45 #include <sys/mman.h>
46
47
48 static int mca_rcache_udreg_register (mca_rcache_base_module_t* rcache, void *addr,
49 size_t size, uint32_t flags, int32_t access_flags,
50 mca_rcache_base_registration_t **reg);
51 static int mca_rcache_udreg_deregister (mca_rcache_base_module_t *rcache,
52 mca_rcache_base_registration_t *reg);
53 static int mca_rcache_udreg_find (mca_rcache_base_module_t* rcache, void* addr,
54 size_t size, mca_rcache_base_registration_t **reg);
55 static void mca_rcache_udreg_finalize (mca_rcache_base_module_t *rcache);
56 static bool mca_rcache_udreg_evict (mca_rcache_base_module_t *rcache);
57
58 static void *mca_rcache_udreg_reg_func (void *addr, uint64_t len, void *reg_context);
59 static uint32_t mca_rcache_udreg_dereg_func (void *device_data, void *dreg_context);
60
61
62 /*
63 * Initializes the rcache module.
64 */
mca_rcache_udreg_module_init(mca_rcache_udreg_module_t * rcache)65 int mca_rcache_udreg_module_init (mca_rcache_udreg_module_t *rcache)
66 {
67 struct udreg_cache_attr cache_attr;
68 int urc;
69
70 rcache->super.rcache_component = &mca_rcache_udreg_component.super;
71 rcache->super.rcache_register = mca_rcache_udreg_register;
72 rcache->super.rcache_find = mca_rcache_udreg_find;
73 rcache->super.rcache_deregister = mca_rcache_udreg_deregister;
74 /* This module relies on udreg for notification of memory release */
75 rcache->super.rcache_invalidate_range = NULL;
76 rcache->super.rcache_finalize = mca_rcache_udreg_finalize;
77
78 cache_attr.modes = 0;
79
80 /* Create udreg cache */
81 if (rcache->resources.use_kernel_cache) {
82 cache_attr.modes |= UDREG_CC_MODE_USE_KERNEL_CACHE;
83 }
84
85 if (rcache->resources.use_evict_w_unreg) {
86 cache_attr.modes |= UDREG_CC_MODE_USE_EVICT_W_UNREG;
87 }
88
89 if (mca_rcache_udreg_component.leave_pinned) {
90 cache_attr.modes |= UDREG_CC_MODE_USE_LAZY_DEREG;
91 }
92
93 OBJ_CONSTRUCT(&rcache->lock, opal_mutex_t);
94
95 strncpy (cache_attr.cache_name, rcache->resources.base.cache_name, UDREG_MAX_CACHENAME_LEN);
96 cache_attr.max_entries = rcache->resources.max_entries;
97 cache_attr.debug_mode = 0;
98 cache_attr.debug_rank = 0;
99 cache_attr.reg_context = rcache;
100 cache_attr.dreg_context = rcache;
101 cache_attr.destructor_context = rcache;
102 cache_attr.device_reg_func = mca_rcache_udreg_reg_func;
103 cache_attr.device_dereg_func = mca_rcache_udreg_dereg_func;
104 cache_attr.destructor_callback = NULL;
105
106 opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
107 "rcache/udreg: creating udreg cache with name %s", cache_attr.cache_name);
108
109 /* attempt to create the udreg cache. this will fail if one already exists */
110 (void) UDREG_CacheCreate (&cache_attr);
111
112 urc = UDREG_CacheAccess (rcache->resources.base.cache_name, (udreg_cache_handle_t *) &rcache->udreg_handle);
113 if (UDREG_RC_SUCCESS != urc) {
114 opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_rcache_base_framework.framework_output,
115 "rcache/udreg: call to UDREG_CacheAccess failed with rc: %d", urc);
116 return OPAL_ERROR;
117 }
118
119 OBJ_CONSTRUCT(&rcache->reg_list, opal_free_list_t);
120 opal_free_list_init (&rcache->reg_list, rcache->resources.base.sizeof_reg,
121 opal_cache_line_size, OBJ_CLASS(mca_rcache_base_registration_t),
122 0, opal_cache_line_size, 0, -1, 32, NULL, 0,
123 NULL, NULL, NULL);
124
125 return OPAL_SUCCESS;
126 }
127
128 /* udreg callback functions */
mca_rcache_udreg_reg_func(void * addr,uint64_t size,void * reg_context)129 static void *mca_rcache_udreg_reg_func (void *addr, uint64_t size, void *reg_context)
130 {
131 mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) reg_context;
132 unsigned int page_size = opal_getpagesize ();
133 mca_rcache_base_registration_t *udreg_reg;
134 opal_free_list_item_t *item;
135 int rc;
136
137 item = opal_free_list_get (&rcache_udreg->reg_list);
138 if (NULL == item) {
139 return NULL;
140 }
141
142 udreg_reg = (mca_rcache_base_registration_t *) item;
143
144 udreg_reg->rcache = reg_context;
145 udreg_reg->base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
146 udreg_reg->bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;
147 udreg_reg->ref_count = 0;
148
149 addr = (void *) udreg_reg->base;
150 size = (uint64_t) (udreg_reg->bound - udreg_reg->base + 1);
151
152 /* pull the flags and access flags out of the rcache module */
153 udreg_reg->access_flags = rcache_udreg->requested_access_flags;
154 udreg_reg->flags = rcache_udreg->requested_flags;
155
156 opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
157 "rcache/udreg: calling underlying register function for address range {%p, %p}",
158 addr, (void *)((intptr_t) addr + size));
159 rc = rcache_udreg->resources.base.register_mem (rcache_udreg->resources.base.reg_data, udreg_reg->base, size,
160 udreg_reg);
161 if (OPAL_SUCCESS != rc) {
162 opal_output_verbose (MCA_BASE_VERBOSE_WARN, opal_rcache_base_framework.framework_output,
163 "rcache/udreg: could not register memory. rc: %d", rc);
164 opal_free_list_return (&rcache_udreg->reg_list, item);
165 /* NTH: this is the only way to get UDReg_Register to recognize a failure */
166 udreg_reg = UDREG_DEVICE_REG_FAILED;
167 }
168
169 return udreg_reg;
170 }
171
mca_rcache_udreg_dereg_func(void * device_data,void * dreg_context)172 static uint32_t mca_rcache_udreg_dereg_func (void *device_data, void *dreg_context)
173 {
174 mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) dreg_context;
175 mca_rcache_base_registration_t *udreg_reg = (mca_rcache_base_registration_t *) device_data;
176 int rc;
177
178 assert (udreg_reg->ref_count == 0);
179
180 rc = rcache_udreg->resources.base.deregister_mem (rcache_udreg->resources.base.reg_data, udreg_reg);
181 if (OPAL_LIKELY(OPAL_SUCCESS == rc)) {
182 opal_free_list_return (&rcache_udreg->reg_list,
183 (opal_free_list_item_t *) udreg_reg);
184 }
185 /* might be worth printing out a warning if an error occurs here */
186
187 return 0;
188 }
189
/*
 * Ask udreg to evict from the module's cache.
 *
 * @param rcache  the udreg rcache module
 *
 * @returns true if UDREG_Evict reported UDREG_RC_SUCCESS, false otherwise
 */
static bool mca_rcache_udreg_evict (mca_rcache_base_module_t *rcache)
{
    mca_rcache_udreg_module_t *udreg_module = (mca_rcache_udreg_module_t *) rcache;

    if (UDREG_RC_SUCCESS != UDREG_Evict (udreg_module->udreg_handle)) {
        return false;
    }

    return true;
}
198
199 /*
200 * register memory
201 */
mca_rcache_udreg_register(mca_rcache_base_module_t * rcache,void * addr,size_t size,uint32_t flags,int32_t access_flags,mca_rcache_base_registration_t ** reg)202 static int mca_rcache_udreg_register(mca_rcache_base_module_t *rcache, void *addr,
203 size_t size, uint32_t flags, int32_t access_flags,
204 mca_rcache_base_registration_t **reg)
205 {
206 mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) rcache;
207 mca_rcache_base_registration_t *udreg_reg, *old_reg;
208 bool bypass_cache = !!(flags & MCA_RCACHE_FLAGS_CACHE_BYPASS);
209 const unsigned int page_size = opal_getpagesize ();
210 unsigned char *base, *bound;
211 udreg_entry_t *udreg_entry = NULL;
212
213 *reg = NULL;
214
215 OPAL_THREAD_LOCK(&rcache_udreg->lock);
216
217 /* we hold the lock so no other thread can modify these flags until the registration is complete */
218 rcache_udreg->requested_access_flags = access_flags;
219 rcache_udreg->requested_flags = flags;
220
221 base = OPAL_DOWN_ALIGN_PTR(addr, page_size, unsigned char *);
222 bound = OPAL_ALIGN_PTR((intptr_t) addr + size, page_size, unsigned char *) - 1;
223
224 addr = base;
225 size = (size_t) (uintptr_t) (bound - base) + 1;
226
227 if (false == bypass_cache) {
228 /* Get a udreg entry for this region */
229 do {
230 opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
231 "rcache/udreg: XXX registering region {%p, %p} with udreg", addr, (void *)((intptr_t) addr + size));
232 while (UDREG_RC_SUCCESS != UDREG_Register (rcache_udreg->udreg_handle, addr, size, &udreg_entry)) {
233 /* try to remove one unused reg and retry */
234 opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
235 "calling evict!");
236 if (!mca_rcache_udreg_evict (rcache)) {
237 opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
238 "rcache/udreg: could not register memory with udreg");
239 OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
240 return OPAL_ERR_OUT_OF_RESOURCE;
241 }
242 }
243
244 udreg_reg = (mca_rcache_base_registration_t *) udreg_entry->device_data;
245 if (NULL != udreg_reg && (udreg_reg->access_flags & access_flags) == access_flags) {
246 /* sufficient access */
247 break;
248 }
249
250 old_reg = udreg_reg;
251
252 if (old_reg) {
253 /* to not confuse udreg make sure the new registration covers the same address
254 * range as the old one. */
255 addr = old_reg->base;
256 size = (size_t)((intptr_t) old_reg->bound - (intptr_t) old_reg->base);
257
258 /* make the new access flags more permissive */
259 access_flags |= old_reg->access_flags;
260
261 if (!old_reg->ref_count) {
262 /* deregister the region before attempting to re-register */
263 mca_rcache_udreg_dereg_func (old_reg, rcache);
264 udreg_entry->device_data = NULL;
265 old_reg = NULL;
266 } else {
267 /* ensure that mca_rcache_udreg_deregister does not call into udreg since
268 * we are forcefully evicting the registration here */
269 old_reg->flags |= MCA_RCACHE_FLAGS_CACHE_BYPASS | MCA_RCACHE_FLAGS_INVALID;
270 }
271 }
272
273 rcache_udreg->requested_access_flags = access_flags;
274
275 /* get a new registration */
276 while (UDREG_DEVICE_REG_FAILED == (udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache))) {
277 if (!mca_rcache_udreg_evict (rcache)) {
278 opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
279 "rcache/udreg: could not register memory with udreg");
280 OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
281 return OPAL_ERR_OUT_OF_RESOURCE;
282 }
283 }
284
285 /* update the device data with the new registration */
286 udreg_entry->device_data = udreg_reg;
287 } while (0);
288 } else {
289 /* if cache bypass is requested don't use the udreg cache */
290 while (UDREG_DEVICE_REG_FAILED == (udreg_reg = mca_rcache_udreg_reg_func (addr, size, rcache))) {
291 /* try to remove one unused reg and retry */
292 if (!mca_rcache_udreg_evict (rcache)) {
293 opal_output_verbose (MCA_BASE_VERBOSE_INFO, opal_rcache_base_framework.framework_output,
294 "rcache/udreg: could not register memory");
295 OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
296 return OPAL_ERR_OUT_OF_RESOURCE;
297 }
298 }
299 }
300
301 OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
302
303 *reg = udreg_reg;
304 (void) OPAL_THREAD_ADD_FETCH32(&udreg_reg->ref_count, 1);
305 udreg_reg->rcache_context = udreg_entry;
306
307 return OPAL_SUCCESS;
308 }
309
/*
 * Find entry point of the module. No lookup is performed: *reg is cleared and
 * OPAL_ERR_NOT_FOUND is returned for every request.
 *
 * @param rcache  unused
 * @param addr    unused
 * @param size    unused
 * @param reg     (out) always set to NULL
 *
 * @returns OPAL_ERR_NOT_FOUND
 */
static int mca_rcache_udreg_find (mca_rcache_base_module_t *rcache, void *addr,
                                  size_t size, mca_rcache_base_registration_t **reg)
{
    /* none of the lookup arguments are used */
    (void) rcache;
    (void) addr;
    (void) size;

    *reg = NULL;

    return OPAL_ERR_NOT_FOUND;
}
316
/*
 * Drop one reference to a registration.
 *
 * Registrations that went through the udreg cache hand their reference back
 * to udreg with UDREG_DecrRefcount. Registrations flagged with
 * MCA_RCACHE_FLAGS_CACHE_BYPASS are deregistered directly once their
 * ref_count reaches 0.
 *
 * @param rcache  the udreg rcache module
 * @param reg     registration previously returned by mca_rcache_udreg_register
 *
 * @returns OPAL_SUCCESS
 */
static int mca_rcache_udreg_deregister (mca_rcache_base_module_t *rcache,
                                        mca_rcache_base_registration_t *reg)
{
    mca_rcache_udreg_module_t *rcache_udreg = (mca_rcache_udreg_module_t *) rcache;
    int32_t ref_count = OPAL_THREAD_ADD_FETCH32 (&reg->ref_count, -1);

    assert(ref_count >= 0);

    if (!(reg->flags & MCA_RCACHE_FLAGS_CACHE_BYPASS)) {
        /* reg->rcache_context holds the udreg entry stored at registration time */
        OPAL_THREAD_LOCK(&rcache_udreg->lock);
        UDREG_DecrRefcount (rcache_udreg->udreg_handle, reg->rcache_context);
        OPAL_THREAD_UNLOCK(&rcache_udreg->lock);
    } else if (!ref_count) {
        /* the last user of a bypass registration releases it directly */
        mca_rcache_udreg_dereg_func (reg, rcache);
    }

    return OPAL_SUCCESS;
}
335
/*
 * Tear down the module: optionally print the udreg cache statistics, release
 * the udreg cache handle and destruct the registration free list and the lock.
 *
 * @param rcache  the udreg rcache module to finalize
 */
static void mca_rcache_udreg_finalize (mca_rcache_base_module_t *rcache)
{
    mca_rcache_udreg_module_t *udreg_module = (mca_rcache_udreg_module_t *) rcache;

    /* Statistic */
    if (true == mca_rcache_udreg_component.print_stats) {
        /* counters stay 0 if the corresponding UDREG_GetStat call does not fill them in */
        uint64_t n_hit = 0;
        uint64_t n_miss = 0;
        uint64_t n_evicted = 0;

        (void) UDREG_GetStat (udreg_module->udreg_handle, UDREG_STAT_CACHE_HIT, &n_hit);
        (void) UDREG_GetStat (udreg_module->udreg_handle, UDREG_STAT_CACHE_MISS, &n_miss);
        (void) UDREG_GetStat (udreg_module->udreg_handle, UDREG_STAT_CACHE_EVICTED, &n_evicted);

        opal_output(0, "%s udreg: stats (hit/miss/evicted): %" PRIu64 "/%" PRIu64 "/%" PRIu64 "\n",
                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), n_hit, n_miss, n_evicted);
    }

    UDREG_CacheRelease (udreg_module->udreg_handle);

    OBJ_DESTRUCT(&udreg_module->reg_list);
    OBJ_DESTRUCT(&udreg_module->lock);
}
361