1 /*
2 * Copyright (c) 2017 Mellanox Technologies, Inc.
3 * All rights reserved.
4 * $COPYRIGHT$
5 *
6 * Additional copyrights may follow
7 *
8 * $HEADER$
9 */
10
11 #include "oshmem_config.h"
12
13 #include "opal/constants.h"
14 #include "opal/util/output.h"
15 #include "opal/util/path.h"
16 #include "opal/util/show_help.h"
17 #include "orte/util/show_help.h"
18
19 #include "oshmem/proc/proc.h"
20 #include "oshmem/mca/sshmem/sshmem.h"
21 #include "oshmem/include/shmemx.h"
22 #include "oshmem/mca/sshmem/base/base.h"
23 #include "oshmem/util/oshmem_util.h"
24 #include "oshmem/mca/spml/ucx/spml_ucx.h"
25
26 #include "sshmem_ucx.h"
27
28 //#include <ucs/sys/math.h>
29
30 #if HAVE_UCX_DEVICE_MEM
31 #include <ucp/core/ucp_resource.h>
32 #include <uct/ib/base/ib_alloc.h>
33 #endif
34
/* Size in bytes of one allocation element: a single 64-bit word. */
#define ALLOC_ELEM_SIZE  (sizeof(uint64_t))

/* Two-argument minimum / maximum. Each argument may be evaluated twice, so
 * do not pass expressions with side effects. */
#define min(a, b)  (((a) < (b)) ? (a) : (b))
#define max(a, b)  (((a) > (b)) ? (a) : (b))
38
39 /* ////////////////////////////////////////////////////////////////////////// */
/* local functions */
42 static int
43 module_init(void);
44
45 static int
46 segment_create(map_segment_t *ds_buf,
47 const char *file_name,
48 size_t size, long hint);
49
50 static void *
51 segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey);
52
53 static int
54 segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey);
55
56 static int
57 segment_unlink(map_segment_t *ds_buf);
58
59 static int
60 module_finalize(void);
61
62 static int sshmem_ucx_memheap_realloc(map_segment_t *s, size_t size,
63 void* old_ptr, void** new_ptr);
64
65 static int sshmem_ucx_memheap_free(map_segment_t *s, void* ptr);
66
67 /*
68 * ucx shmem module
69 */
70 mca_sshmem_ucx_module_t mca_sshmem_ucx_module = {
71 /* super */
72 {
73 module_init,
74 segment_create,
75 segment_attach,
76 segment_detach,
77 segment_unlink,
78 module_finalize
79 }
80 };
81
82 static int
module_init(void)83 module_init(void)
84 {
85 /* nothing to do */
86 return OSHMEM_SUCCESS;
87 }
88
89 /* ////////////////////////////////////////////////////////////////////////// */
90 static int
module_finalize(void)91 module_finalize(void)
92 {
93 /* nothing to do */
94 return OSHMEM_SUCCESS;
95 }
96
97 /* ////////////////////////////////////////////////////////////////////////// */
98
99 static segment_allocator_t sshmem_ucx_allocator = {
100 .sa_realloc = sshmem_ucx_memheap_realloc,
101 .sa_free = sshmem_ucx_memheap_free
102 };
103
104 static int
segment_create_internal(map_segment_t * ds_buf,void * address,size_t size,unsigned flags,long hint,void * dev_mem)105 segment_create_internal(map_segment_t *ds_buf, void *address, size_t size,
106 unsigned flags, long hint, void *dev_mem)
107 {
108 mca_sshmem_ucx_segment_context_t *ctx;
109 int rc = OSHMEM_SUCCESS;
110 mca_spml_ucx_t *spml = (mca_spml_ucx_t*)mca_spml.self;
111 ucp_mem_map_params_t mem_map_params;
112 ucp_mem_h mem_h;
113 ucs_status_t status;
114
115 assert(ds_buf);
116
117 /* init the contents of map_segment_t */
118 shmem_ds_reset(ds_buf);
119
120 mem_map_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS |
121 UCP_MEM_MAP_PARAM_FIELD_LENGTH |
122 UCP_MEM_MAP_PARAM_FIELD_FLAGS;
123
124 mem_map_params.address = address;
125 mem_map_params.length = size;
126 mem_map_params.flags = flags;
127
128 status = ucp_mem_map(spml->ucp_context, &mem_map_params, &mem_h);
129 if (UCS_OK != status) {
130 SSHMEM_ERROR("ucp_mem_map() failed: %s\n", ucs_status_string(status));
131 rc = OSHMEM_ERROR;
132 goto out;
133 }
134
135 if (!(flags & UCP_MEM_MAP_FIXED)) {
136 /* Memory was allocated at an arbitrary address; obtain it */
137 ucp_mem_attr_t mem_attr;
138 mem_attr.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS;
139 status = ucp_mem_query(mem_h, &mem_attr);
140 if (status != UCS_OK) {
141 SSHMEM_ERROR("ucp_mem_query() failed: %s\n", ucs_status_string(status));
142 ucp_mem_unmap(spml->ucp_context, mem_h);
143 rc = OSHMEM_ERROR;
144 goto out;
145 }
146
147 ds_buf->super.va_base = mem_attr.address;
148 } else {
149 ds_buf->super.va_base = mem_map_params.address;
150 }
151
152 ctx = calloc(1, sizeof(*ctx));
153 if (!ctx) {
154 ucp_mem_unmap(spml->ucp_context, mem_h);
155 rc = OSHMEM_ERR_OUT_OF_RESOURCE;
156 goto out;
157 }
158
159 ds_buf->seg_size = size;
160 ds_buf->super.va_end = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size);
161 ds_buf->context = ctx;
162 ds_buf->type = MAP_SEGMENT_ALLOC_UCX;
163 ds_buf->alloc_hints = hint;
164 ctx->ucp_memh = mem_h;
165 ctx->dev_mem = dev_mem;
166 if (hint) {
167 ds_buf->allocator = &sshmem_ucx_allocator;
168 }
169
170 out:
171 OPAL_OUTPUT_VERBOSE(
172 (70, oshmem_sshmem_base_framework.framework_output,
173 "%s: %s: create %s "
174 "(id: %d, addr: %p size: %lu)\n",
175 mca_sshmem_ucx_component.super.base_version.mca_type_name,
176 mca_sshmem_ucx_component.super.base_version.mca_component_name,
177 (rc ? "failure" : "successful"),
178 ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size)
179 );
180 return rc;
181 }
182
#if HAVE_UCX_DEVICE_MEM
/*
 * Try to allocate `size` bytes of device memory on the "mlx5" memory domain
 * of the SPML's UCP context.
 *
 * spml       UCX SPML instance whose ucp_context is searched
 * size       requested length in bytes
 * address_p  receives the address of the allocation; written only on success
 *
 * Returns the device-memory handle, or NULL when no matching memory domain
 * exists or the allocation fails (the caller is expected to fall back to a
 * regular allocation in that case).
 */
static uct_ib_device_mem_h alloc_device_mem(mca_spml_ucx_t *spml, size_t size,
                                            void **address_p)
{
    uct_ib_device_mem_h dev_mem = NULL;
    ucs_status_t status;
    uct_md_h uct_md;
    void *address;
    size_t length;

    uct_md = ucp_context_find_tl_md(spml->ucp_context, "mlx5");
    if (uct_md == NULL) {
        SSHMEM_VERBOSE(1, "ucp_context_find_tl_md() returned NULL\n");
        return NULL;
    }

    /* A matching memory domain was found: allocate device memory on it */
    length  = size;
    address = NULL;
    status  = uct_ib_md_alloc_device_mem(uct_md, &length, &address,
                                         UCT_MD_MEM_ACCESS_ALL, "sshmem_seg",
                                         &dev_mem);
    if (status != UCS_OK) {
        /* Could not allocate device memory - the caller falls back to a
         * regular allocation (some PEs in the job may succeed while others
         * fail) */
        SSHMEM_VERBOSE(1, "uct_ib_md_alloc_device_mem() failed: %s\n",
                       ucs_status_string(status));
        return NULL;
    }

    SSHMEM_VERBOSE(3, "uct_ib_md_alloc_device_mem() returned address %p\n",
                   address);
    *address_p = address;
    return dev_mem;
}
#endif
218
219 static int
segment_create(map_segment_t * ds_buf,const char * file_name,size_t size,long hint)220 segment_create(map_segment_t *ds_buf,
221 const char *file_name,
222 size_t size, long hint)
223 {
224 mca_spml_ucx_t *spml = (mca_spml_ucx_t*)mca_spml.self;
225 unsigned flags;
226 int ret;
227
228 #if HAVE_UCX_DEVICE_MEM
229 int ret = OSHMEM_ERROR;
230 if (hint & SHMEM_HINT_DEVICE_NIC_MEM) {
231 if (size > UINT_MAX) {
232 return OSHMEM_ERR_BAD_PARAM;
233 }
234
235 void *dev_mem_address;
236 uct_ib_device_mem_h dev_mem = alloc_device_mem(spml, size,
237 &dev_mem_address);
238 if (dev_mem != NULL) {
239 ret = segment_create_internal(ds_buf, dev_mem_address, size, 0,
240 hint, dev_mem);
241 if (ret == OSHMEM_SUCCESS) {
242 return OSHMEM_SUCCESS;
243 } else if (dev_mem != NULL) {
244 uct_ib_md_release_device_mem(dev_mem);
245 /* fallback to regular allocation */
246 }
247 }
248 }
249 #endif
250
251 flags = UCP_MEM_MAP_ALLOCATE | (spml->heap_reg_nb ? UCP_MEM_MAP_NONBLOCK : 0);
252 if (hint) {
253 return segment_create_internal(ds_buf, NULL, size, flags, hint, NULL);
254 } else {
255 return segment_create_internal(ds_buf, mca_sshmem_base_start_address,
256 size, flags | UCP_MEM_MAP_FIXED, hint,
257 NULL);
258 }
259 }
260
261 static void *
segment_attach(map_segment_t * ds_buf,sshmem_mkey_t * mkey)262 segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
263 {
264 assert(ds_buf);
265 assert(mkey->va_base == 0);
266
267 OPAL_OUTPUT((oshmem_sshmem_base_framework.framework_output,
268 "can not attach to ucx segment"));
269 oshmem_shmem_abort(-1);
270 return NULL;
271 }
272
273 static int
segment_detach(map_segment_t * ds_buf,sshmem_mkey_t * mkey)274 segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
275 {
276 OPAL_OUTPUT_VERBOSE(
277 (70, oshmem_sshmem_base_framework.framework_output,
278 "%s: %s: detaching "
279 "(id: %d, addr: %p size: %lu)\n",
280 mca_sshmem_ucx_component.super.base_version.mca_type_name,
281 mca_sshmem_ucx_component.super.base_version.mca_component_name,
282 ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size)
283 );
284
285 /* reset the contents of the map_segment_t associated with this
286 * shared memory segment.
287 */
288 shmem_ds_reset(ds_buf);
289
290 return OSHMEM_SUCCESS;
291 }
292
293 static int
segment_unlink(map_segment_t * ds_buf)294 segment_unlink(map_segment_t *ds_buf)
295 {
296 mca_spml_ucx_t *spml = (mca_spml_ucx_t *)mca_spml.self;
297 mca_sshmem_ucx_segment_context_t *ctx = ds_buf->context;
298
299 if (ctx->shadow_allocator) {
300 sshmem_ucx_shadow_destroy(ctx->shadow_allocator);
301 }
302
303 ucp_mem_unmap(spml->ucp_context, ctx->ucp_memh);
304
305 #if HAVE_UCX_DEVICE_MEM
306 if (ctx->dev_mem) {
307 uct_ib_md_release_device_mem(ctx->dev_mem);
308 }
309 #endif
310
311 ds_buf->context = NULL;
312 free(ctx);
313
314 OPAL_OUTPUT_VERBOSE(
315 (70, oshmem_sshmem_base_framework.framework_output,
316 "%s: %s: unlinking "
317 "(id: %d, addr: %p size: %lu)\n",
318 mca_sshmem_ucx_component.super.base_version.mca_type_name,
319 mca_sshmem_ucx_component.super.base_version.mca_component_name,
320 ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size)
321 );
322
323 ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
324 MAP_SEGMENT_INVALIDATE(ds_buf);
325
326 return OSHMEM_SUCCESS;
327 }
328
/*
 * Translate an element index into an address inside segment `s`:
 * va_base + index * ALLOC_ELEM_SIZE.
 */
static void *sshmem_ucx_memheap_index2ptr(map_segment_t *s, unsigned index)
{
    char *seg_base = (char*)s->super.va_base;

    return seg_base + (index * ALLOC_ELEM_SIZE);
}
333
/*
 * Translate an address inside segment `s` into an element index:
 * (ptr - va_base) / ALLOC_ELEM_SIZE.
 */
static unsigned sshmem_ucx_memheap_ptr2index(map_segment_t *s, void *ptr)
{
    char *seg_base = (char*)s->super.va_base;
    char *target   = (char*)ptr;

    return (target - seg_base) / ALLOC_ELEM_SIZE;
}
338
/*
 * Copy `size` bytes from src to dst in 64-bit words, then issue a write
 * memory barrier. The length is rounded up to a whole number of words, so up
 * to sizeof(uint64_t) - 1 bytes beyond `size` are read and written.
 */
static void sshmem_ucx_memheap_wordcopy(void *dst, void *src, size_t size)
{
    const size_t nwords = (size + sizeof(uint64_t) - 1) / sizeof(uint64_t);
    uint64_t *out = (uint64_t*)dst;
    uint64_t *in  = (uint64_t*)src;
    size_t w;

    for (w = 0; w < nwords; ++w) {
        out[w] = in[w];
    }

    opal_atomic_wmb();
}
351
/*
 * Allocate or resize a block inside segment `s` using the segment's shadow
 * allocator, which is created on first use.
 *
 * s        segment to allocate from
 * size     requested size in bytes; 0 still yields one element
 * old_ptr  block to resize, or NULL for a fresh allocation
 * new_ptr  receives the address of the resulting block
 *
 * Returns OSHMEM_SUCCESS, OSHMEM_ERR_OUT_OF_RESOURCE when `size` exceeds the
 * segment size or the shadow allocator cannot be created, or the error
 * reported by the shadow allocator.
 */
static int sshmem_ucx_memheap_realloc(map_segment_t *s, size_t size,
                                      void* old_ptr, void** new_ptr)
{
    mca_sshmem_ucx_segment_context_t *seg_ctx = s->context;
    unsigned want_count, new_index, prev_index, prev_count;
    size_t elem_count, prev_bytes;
    int in_place;
    int rc;

    if (size > s->seg_size) {
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    /* the shadow allocator is created lazily.
     * NOTE(review): it is sized with seg_size (bytes) while indices are in
     * ALLOC_ELEM_SIZE units - confirm the unit expected by
     * sshmem_ucx_shadow_create() */
    if (!seg_ctx->shadow_allocator) {
        seg_ctx->shadow_allocator = sshmem_ucx_shadow_create(s->seg_size);
        if (!seg_ctx->shadow_allocator) {
            return OSHMEM_ERR_OUT_OF_RESOURCE;
        }
    }

    /* round up to whole elements; a zero-size request must still return a
     * unique pointer, so it gets one element */
    elem_count = (size + ALLOC_ELEM_SIZE - 1) / ALLOC_ELEM_SIZE;
    if (0 == elem_count) {
        elem_count = 1;
    }
    want_count = elem_count;

    /* fresh allocation: nothing to copy or release */
    if (!old_ptr) {
        rc = sshmem_ucx_shadow_alloc(seg_ctx->shadow_allocator, want_count,
                                     &new_index);
        if (rc != OSHMEM_SUCCESS) {
            return rc;
        }

        *new_ptr = sshmem_ucx_memheap_index2ptr(s, new_index);
        return OSHMEM_SUCCESS;
    }

    /* resize of an existing block */
    prev_index = sshmem_ucx_memheap_ptr2index(s, old_ptr);
    rc = sshmem_ucx_shadow_realloc(seg_ctx->shadow_allocator, want_count,
                                   prev_index, &new_index, &in_place);
    if (rc != OSHMEM_SUCCESS) {
        return rc;
    }

    *new_ptr = sshmem_ucx_memheap_index2ptr(s, new_index);

    if (!in_place) {
        /* the shadow allocator did not report an in-place resize: copy the
         * payload to the new block, then release the old one. The old size
         * must be read before the old block is freed. */
        prev_count = sshmem_ucx_shadow_size(seg_ctx->shadow_allocator, prev_index);
        prev_bytes = prev_count * ALLOC_ELEM_SIZE;
        sshmem_ucx_memheap_wordcopy(*new_ptr, old_ptr,
                                    (size < prev_bytes) ? size : prev_bytes);
        sshmem_ucx_shadow_free(seg_ctx->shadow_allocator, prev_index);
    }

    return OSHMEM_SUCCESS;
}
400
/*
 * Return a block of segment `s` to the shadow allocator. A NULL `ptr` is a
 * no-op that reports OSHMEM_SUCCESS; otherwise the result of
 * sshmem_ucx_shadow_free() is returned.
 */
static int sshmem_ucx_memheap_free(map_segment_t *s, void* ptr)
{
    mca_sshmem_ucx_segment_context_t *seg_ctx = s->context;

    if (ptr) {
        return sshmem_ucx_shadow_free(seg_ctx->shadow_allocator,
                                      sshmem_ucx_memheap_ptr2index(s, ptr));
    }

    return OSHMEM_SUCCESS;
}
412