1 /*
2  * Copyright (c) 2017      Mellanox Technologies, Inc.
3  *                         All rights reserved.
4  * $COPYRIGHT$
5  *
6  * Additional copyrights may follow
7  *
8  * $HEADER$
9  */
10 
11 #include "oshmem_config.h"
12 
13 #include "opal/constants.h"
14 #include "opal/util/output.h"
15 #include "opal/util/path.h"
16 #include "opal/util/show_help.h"
17 #include "orte/util/show_help.h"
18 
19 #include "oshmem/proc/proc.h"
20 #include "oshmem/mca/sshmem/sshmem.h"
21 #include "oshmem/include/shmemx.h"
22 #include "oshmem/mca/sshmem/base/base.h"
23 #include "oshmem/util/oshmem_util.h"
24 #include "oshmem/mca/spml/ucx/spml_ucx.h"
25 
26 #include "sshmem_ucx.h"
27 
28 //#include <ucs/sys/math.h>
29 
30 #if HAVE_UCX_DEVICE_MEM
31 #include <ucp/core/ucp_resource.h>
32 #include <uct/ib/base/ib_alloc.h>
33 #endif
34 
/* Size, in bytes, of one allocation unit used by the memheap helpers below. */
#define ALLOC_ELEM_SIZE (sizeof(uint64_t))

/*
 * Smaller / larger of two values.
 * NOTE: each argument may be evaluated twice, so do not pass expressions
 * with side effects.
 */
#define min(a,b) (((a) < (b)) ? (a) : (b))
#define max(a,b) (((a) > (b)) ? (a) : (b))
38 
39 /* ////////////////////////////////////////////////////////////////////////// */
/* local functions */
42 static int
43 module_init(void);
44 
45 static int
46 segment_create(map_segment_t *ds_buf,
47                const char *file_name,
48                size_t size, long hint);
49 
50 static void *
51 segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey);
52 
53 static int
54 segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey);
55 
56 static int
57 segment_unlink(map_segment_t *ds_buf);
58 
59 static int
60 module_finalize(void);
61 
62 static int sshmem_ucx_memheap_realloc(map_segment_t *s, size_t size,
63                                       void* old_ptr, void** new_ptr);
64 
65 static int sshmem_ucx_memheap_free(map_segment_t *s, void* ptr);
66 
67 /*
68  * ucx shmem module
69  */
70 mca_sshmem_ucx_module_t mca_sshmem_ucx_module = {
71     /* super */
72     {
73         module_init,
74         segment_create,
75         segment_attach,
76         segment_detach,
77         segment_unlink,
78         module_finalize
79     }
80 };
81 
82 static int
module_init(void)83 module_init(void)
84 {
85     /* nothing to do */
86     return OSHMEM_SUCCESS;
87 }
88 
89 /* ////////////////////////////////////////////////////////////////////////// */
90 static int
module_finalize(void)91 module_finalize(void)
92 {
93     /* nothing to do */
94     return OSHMEM_SUCCESS;
95 }
96 
97 /* ////////////////////////////////////////////////////////////////////////// */
98 
99 static segment_allocator_t sshmem_ucx_allocator = {
100     .sa_realloc = sshmem_ucx_memheap_realloc,
101     .sa_free    = sshmem_ucx_memheap_free
102 };
103 
104 static int
segment_create_internal(map_segment_t * ds_buf,void * address,size_t size,unsigned flags,long hint,void * dev_mem)105 segment_create_internal(map_segment_t *ds_buf, void *address, size_t size,
106                         unsigned flags, long hint, void *dev_mem)
107 {
108     mca_sshmem_ucx_segment_context_t *ctx;
109     int rc = OSHMEM_SUCCESS;
110     mca_spml_ucx_t *spml = (mca_spml_ucx_t*)mca_spml.self;
111     ucp_mem_map_params_t mem_map_params;
112     ucp_mem_h mem_h;
113     ucs_status_t status;
114 
115     assert(ds_buf);
116 
117     /* init the contents of map_segment_t */
118     shmem_ds_reset(ds_buf);
119 
120     mem_map_params.field_mask = UCP_MEM_MAP_PARAM_FIELD_ADDRESS |
121                                 UCP_MEM_MAP_PARAM_FIELD_LENGTH |
122                                 UCP_MEM_MAP_PARAM_FIELD_FLAGS;
123 
124     mem_map_params.address    = address;
125     mem_map_params.length     = size;
126     mem_map_params.flags      = flags;
127 
128     status = ucp_mem_map(spml->ucp_context, &mem_map_params, &mem_h);
129     if (UCS_OK != status) {
130         SSHMEM_ERROR("ucp_mem_map() failed: %s\n", ucs_status_string(status));
131         rc = OSHMEM_ERROR;
132         goto out;
133     }
134 
135     if (!(flags & UCP_MEM_MAP_FIXED)) {
136         /* Memory was allocated at an arbitrary address; obtain it */
137         ucp_mem_attr_t mem_attr;
138         mem_attr.field_mask = UCP_MEM_ATTR_FIELD_ADDRESS;
139         status = ucp_mem_query(mem_h, &mem_attr);
140         if (status != UCS_OK) {
141             SSHMEM_ERROR("ucp_mem_query() failed: %s\n", ucs_status_string(status));
142             ucp_mem_unmap(spml->ucp_context, mem_h);
143             rc = OSHMEM_ERROR;
144             goto out;
145         }
146 
147         ds_buf->super.va_base = mem_attr.address;
148     } else {
149         ds_buf->super.va_base = mem_map_params.address;
150     }
151 
152     ctx = calloc(1, sizeof(*ctx));
153     if (!ctx) {
154         ucp_mem_unmap(spml->ucp_context, mem_h);
155         rc = OSHMEM_ERR_OUT_OF_RESOURCE;
156         goto out;
157     }
158 
159     ds_buf->seg_size      = size;
160     ds_buf->super.va_end  = (void*)((uintptr_t)ds_buf->super.va_base + ds_buf->seg_size);
161     ds_buf->context       = ctx;
162     ds_buf->type          = MAP_SEGMENT_ALLOC_UCX;
163     ds_buf->alloc_hints   = hint;
164     ctx->ucp_memh         = mem_h;
165     ctx->dev_mem          = dev_mem;
166     if (hint) {
167         ds_buf->allocator = &sshmem_ucx_allocator;
168     }
169 
170 out:
171     OPAL_OUTPUT_VERBOSE(
172           (70, oshmem_sshmem_base_framework.framework_output,
173            "%s: %s: create %s "
174            "(id: %d, addr: %p size: %lu)\n",
175            mca_sshmem_ucx_component.super.base_version.mca_type_name,
176            mca_sshmem_ucx_component.super.base_version.mca_component_name,
177            (rc ? "failure" : "successful"),
178            ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size)
179       );
180     return rc;
181 }
182 
183 #if HAVE_UCX_DEVICE_MEM
alloc_device_mem(mca_spml_ucx_t * spml,size_t size,void ** address_p)184 static uct_ib_device_mem_h alloc_device_mem(mca_spml_ucx_t *spml, size_t size,
185                                             void **address_p)
186 {
187     uct_ib_device_mem_h dev_mem = NULL;
188     ucs_status_t status;
189     uct_md_h uct_md;
190     void *address;
191     size_t length;
192 
193     uct_md = ucp_context_find_tl_md(spml->ucp_context, "mlx5");
194     if (uct_md == NULL) {
195         SSHMEM_VERBOSE(1, "ucp_context_find_tl_md() returned NULL\n");
196         return NULL;
197     }
198 
199     /* If found a matching memory domain, allocate device memory on it */
200     length  = size;
201     address = NULL;
202     status = uct_ib_md_alloc_device_mem(uct_md, &length, &address,
203                                         UCT_MD_MEM_ACCESS_ALL, "sshmem_seg",
204                                         &dev_mem);
205     if (status != UCS_OK) {
206         /* If could not allocate device memory - fallback to mmap (since some
207          * PEs in the job may succeed and while others failed */
208         SSHMEM_VERBOSE(1, "uct_ib_md_alloc_dm() failed: %s\n",
209                        ucs_status_string(status));
210         return NULL;
211     }
212 
213     SSHMEM_VERBOSE(3, "uct_ib_md_alloc_dm() returned address %p\n", address);
214     *address_p = address;
215     return dev_mem;
216 }
217 #endif
218 
219 static int
segment_create(map_segment_t * ds_buf,const char * file_name,size_t size,long hint)220 segment_create(map_segment_t *ds_buf,
221                const char *file_name,
222                size_t size, long hint)
223 {
224     mca_spml_ucx_t *spml = (mca_spml_ucx_t*)mca_spml.self;
225     unsigned flags;
226     int ret;
227 
228 #if HAVE_UCX_DEVICE_MEM
229     int ret = OSHMEM_ERROR;
230     if (hint & SHMEM_HINT_DEVICE_NIC_MEM) {
231         if (size > UINT_MAX) {
232             return OSHMEM_ERR_BAD_PARAM;
233         }
234 
235         void *dev_mem_address;
236         uct_ib_device_mem_h dev_mem = alloc_device_mem(spml, size,
237                                                        &dev_mem_address);
238         if (dev_mem != NULL) {
239             ret = segment_create_internal(ds_buf, dev_mem_address, size, 0,
240                                           hint, dev_mem);
241             if (ret == OSHMEM_SUCCESS) {
242                 return OSHMEM_SUCCESS;
243             } else if (dev_mem != NULL) {
244                 uct_ib_md_release_device_mem(dev_mem);
245                 /* fallback to regular allocation */
246             }
247         }
248     }
249 #endif
250 
251     flags = UCP_MEM_MAP_ALLOCATE | (spml->heap_reg_nb ? UCP_MEM_MAP_NONBLOCK : 0);
252     if (hint) {
253         return segment_create_internal(ds_buf, NULL, size, flags, hint, NULL);
254     } else {
255         return segment_create_internal(ds_buf, mca_sshmem_base_start_address,
256                                        size, flags | UCP_MEM_MAP_FIXED, hint,
257                                        NULL);
258     }
259 }
260 
261 static void *
segment_attach(map_segment_t * ds_buf,sshmem_mkey_t * mkey)262 segment_attach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
263 {
264     assert(ds_buf);
265     assert(mkey->va_base == 0);
266 
267     OPAL_OUTPUT((oshmem_sshmem_base_framework.framework_output,
268                 "can not attach to ucx segment"));
269     oshmem_shmem_abort(-1);
270     return NULL;
271 }
272 
273 static int
segment_detach(map_segment_t * ds_buf,sshmem_mkey_t * mkey)274 segment_detach(map_segment_t *ds_buf, sshmem_mkey_t *mkey)
275 {
276     OPAL_OUTPUT_VERBOSE(
277         (70, oshmem_sshmem_base_framework.framework_output,
278          "%s: %s: detaching "
279             "(id: %d, addr: %p size: %lu)\n",
280             mca_sshmem_ucx_component.super.base_version.mca_type_name,
281             mca_sshmem_ucx_component.super.base_version.mca_component_name,
282             ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size)
283     );
284 
285     /* reset the contents of the map_segment_t associated with this
286      * shared memory segment.
287      */
288     shmem_ds_reset(ds_buf);
289 
290     return OSHMEM_SUCCESS;
291 }
292 
293 static int
segment_unlink(map_segment_t * ds_buf)294 segment_unlink(map_segment_t *ds_buf)
295 {
296     mca_spml_ucx_t *spml = (mca_spml_ucx_t *)mca_spml.self;
297     mca_sshmem_ucx_segment_context_t *ctx = ds_buf->context;
298 
299     if (ctx->shadow_allocator) {
300         sshmem_ucx_shadow_destroy(ctx->shadow_allocator);
301     }
302 
303     ucp_mem_unmap(spml->ucp_context, ctx->ucp_memh);
304 
305 #if HAVE_UCX_DEVICE_MEM
306     if (ctx->dev_mem) {
307         uct_ib_md_release_device_mem(ctx->dev_mem);
308     }
309 #endif
310 
311     ds_buf->context = NULL;
312     free(ctx);
313 
314     OPAL_OUTPUT_VERBOSE(
315         (70, oshmem_sshmem_base_framework.framework_output,
316          "%s: %s: unlinking "
317             "(id: %d, addr: %p size: %lu)\n",
318             mca_sshmem_ucx_component.super.base_version.mca_type_name,
319             mca_sshmem_ucx_component.super.base_version.mca_component_name,
320             ds_buf->seg_id, ds_buf->super.va_base, (unsigned long)ds_buf->seg_size)
321     );
322 
323     ds_buf->seg_id = MAP_SEGMENT_SHM_INVALID;
324     MAP_SEGMENT_INVALIDATE(ds_buf);
325 
326     return OSHMEM_SUCCESS;
327 }
328 
/*
 * Convert an element index into an address inside segment s: the segment
 * base plus index * ALLOC_ELEM_SIZE bytes.
 */
static void *sshmem_ucx_memheap_index2ptr(map_segment_t *s, unsigned index)
{
    char *seg_base = (char*)s->super.va_base;

    return seg_base + (index * ALLOC_ELEM_SIZE);
}
333 
/*
 * Convert an address inside segment s into an element index: the byte
 * offset from the segment base divided by ALLOC_ELEM_SIZE.
 */
static unsigned sshmem_ucx_memheap_ptr2index(map_segment_t *s, void *ptr)
{
    char *seg_base = (char*)s->super.va_base;

    return ((char*)ptr - seg_base) / ALLOC_ELEM_SIZE;
}
338 
/*
 * Copy size bytes from src to dst in 64-bit words, then issue a write
 * memory barrier. The byte count is rounded up to a whole number of
 * 64-bit words, so up to 7 bytes past size may be read and written.
 */
static void sshmem_ucx_memheap_wordcopy(void *dst, void *src, size_t size)
{
    const size_t nwords = (size + sizeof(uint64_t) - 1) / sizeof(uint64_t);
    uint64_t *out = (uint64_t*)dst;
    uint64_t *in  = (uint64_t*)src;
    size_t w;

    for (w = 0; w < nwords; ++w) {
        out[w] = in[w];
    }

    opal_atomic_wmb();
}
351 
/**
 * Allocate or resize a block inside segment @a s using the segment's shadow
 * allocator, which is created on first use.
 *
 * @param s        segment to allocate from.
 * @param size     requested size in bytes; rounded up to a whole number of
 *                 ALLOC_ELEM_SIZE elements, with a minimum of one element.
 * @param old_ptr  existing block to resize, or NULL for a fresh allocation.
 * @param new_ptr  receives the address of the resulting block on success.
 *
 * @return OSHMEM_SUCCESS on success, OSHMEM_ERR_OUT_OF_RESOURCE when @a size
 *         exceeds the segment size or the shadow allocator cannot be
 *         created, otherwise the error returned by the shadow allocator.
 */
static int sshmem_ucx_memheap_realloc(map_segment_t *s, size_t size,
                                      void* old_ptr, void** new_ptr)
{
    mca_sshmem_ucx_segment_context_t *ctx = s->context;
    unsigned nelems, new_index, old_index, old_nelems;
    int in_place;
    int rc;

    if (size > s->seg_size) {
        return OSHMEM_ERR_OUT_OF_RESOURCE;
    }

    /* The shadow allocator is created lazily on the first request */
    if (NULL == ctx->shadow_allocator) {
        ctx->shadow_allocator = sshmem_ucx_shadow_create(s->seg_size);
        if (NULL == ctx->shadow_allocator) {
            return OSHMEM_ERR_OUT_OF_RESOURCE;
        }
    }

    /* A zero-size request must still yield a unique pointer, so at least
     * one element is always allocated */
    nelems = max((size + ALLOC_ELEM_SIZE - 1) / ALLOC_ELEM_SIZE, 1);

    if (NULL != old_ptr) {
        old_index = sshmem_ucx_memheap_ptr2index(s, old_ptr);
        rc        = sshmem_ucx_shadow_realloc(ctx->shadow_allocator, nelems,
                                              old_index, &new_index, &in_place);
    } else {
        rc = sshmem_ucx_shadow_alloc(ctx->shadow_allocator, nelems, &new_index);
    }

    if (OSHMEM_SUCCESS != rc) {
        return rc;
    }

    *new_ptr = sshmem_ucx_memheap_index2ptr(s, new_index);

    /* Fresh allocation, or block resized where it was: nothing to move */
    if ((NULL == old_ptr) || in_place) {
        return OSHMEM_SUCCESS;
    }

    /* The block moved: copy the surviving bytes, then release the old one */
    old_nelems = sshmem_ucx_shadow_size(ctx->shadow_allocator, old_index);
    sshmem_ucx_memheap_wordcopy(*new_ptr, old_ptr,
                                min(size, old_nelems * ALLOC_ELEM_SIZE));
    sshmem_ucx_shadow_free(ctx->shadow_allocator, old_index);

    return OSHMEM_SUCCESS;
}
400 
/**
 * Release a block previously obtained from sshmem_ucx_memheap_realloc().
 *
 * @param s    segment the block belongs to.
 * @param ptr  block to release; NULL is accepted and ignored.
 *
 * @return OSHMEM_SUCCESS for a NULL pointer, otherwise the result of
 *         sshmem_ucx_shadow_free().
 */
static int sshmem_ucx_memheap_free(map_segment_t *s, void* ptr)
{
    mca_sshmem_ucx_segment_context_t *ctx = s->context;
    unsigned index;

    if (NULL == ptr) {
        return OSHMEM_SUCCESS;
    }

    index = sshmem_ucx_memheap_ptr2index(s, ptr);
    return sshmem_ucx_shadow_free(ctx->shadow_allocator, index);
}
412