1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
4 * University Research and Technology
5 * Corporation. All rights reserved.
6 * Copyright (c) 2004-2011 The University of Tennessee and The University
7 * of Tennessee Research Foundation. All rights
8 * reserved.
9 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
10 * University of Stuttgart. All rights reserved.
11 * Copyright (c) 2004-2005 The Regents of the University of California.
12 * All rights reserved.
13 * Copyright (c) 2007-2015 Cisco Systems, Inc. All rights reserved.
14 * Copyright (c) 2006-2007 Voltaire All rights reserved.
15 * Copyright (c) 2014-2017 Intel, Inc. All rights reserved.
16 * Copyright (c) 2015-2018 Research Organization for Information Science
17 * and Technology (RIST). All rights reserved.
18 * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
19 * Copyright (c) 2016-2017 Los Alamos National Security, LLC. All rights
20 * reserved.
21 *
22 * $COPYRIGHT$
23 *
24 * Additional copyrights may follow
25 *
26 * $HEADER$
27 */
28
29 #include "opal_config.h"
30
31 #include "opal/util/arch.h"
32 #include "opal/mca/pmix/pmix.h"
33
34 #include "btl_openib.h"
35 #include "btl_openib_proc.h"
36 #include "connect/base.h"
37 #include "connect/connect.h"
38
39 static void mca_btl_openib_proc_btl_construct(mca_btl_openib_proc_btlptr_t* elem);
40 static void mca_btl_openib_proc_btl_destruct(mca_btl_openib_proc_btlptr_t* elem);
41
42 OBJ_CLASS_INSTANCE(mca_btl_openib_proc_btlptr_t,
43 opal_list_item_t, mca_btl_openib_proc_btl_construct,
44 mca_btl_openib_proc_btl_destruct);
45
/*
 * Constructor for a BTL-pointer list element: the element starts out
 * referring to no BTL module.
 */
static void mca_btl_openib_proc_btl_construct(mca_btl_openib_proc_btlptr_t *elem)
{
    elem->openib_btl = NULL;
}
50
/*
 * Destructor for a BTL-pointer list element.  The element only
 * references the BTL module, so the pointer is cleared and nothing
 * is freed.
 */
static void mca_btl_openib_proc_btl_destruct(mca_btl_openib_proc_btlptr_t *elem)
{
    elem->openib_btl = NULL;
}
55
56 static void mca_btl_openib_proc_construct(mca_btl_openib_proc_t* proc);
57 static void mca_btl_openib_proc_destruct(mca_btl_openib_proc_t* proc);
58
59 OBJ_CLASS_INSTANCE(mca_btl_openib_proc_t,
60 opal_list_item_t, mca_btl_openib_proc_construct,
61 mca_btl_openib_proc_destruct);
62
/*
 * Constructor for an IB proc instance: no peer attached, no ports,
 * no endpoints, plus a freshly constructed lock and an empty list of
 * BTL modules that have registered with this proc.
 *
 * Declared "static" to match the forward declaration above, and the
 * pointer members are cleared with NULL rather than a bare 0.
 */
static void mca_btl_openib_proc_construct(mca_btl_openib_proc_t* ib_proc)
{
    ib_proc->proc_opal = NULL;
    ib_proc->proc_ports = NULL;
    ib_proc->proc_port_count = 0;
    ib_proc->proc_endpoints = NULL;
    ib_proc->proc_endpoint_count = 0;
    OBJ_CONSTRUCT(&ib_proc->proc_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&ib_proc->openib_btls, opal_list_t);
}
73
74 /*
75 * Cleanup ib proc instance
76 */
77
mca_btl_openib_proc_destruct(mca_btl_openib_proc_t * ib_proc)78 void mca_btl_openib_proc_destruct(mca_btl_openib_proc_t* ib_proc)
79 {
80 /* release resources */
81 if(NULL != ib_proc->proc_endpoints) {
82 free(ib_proc->proc_endpoints);
83 }
84 if (NULL != ib_proc->proc_ports) {
85 int i, j;
86 for (i = 0; i < ib_proc->proc_port_count; ++i) {
87 for (j = 0; j < ib_proc->proc_ports[i].pm_cpc_data_count; ++j) {
88 if (NULL != ib_proc->proc_ports[i].pm_cpc_data[j].cbm_modex_message) {
89 free(ib_proc->proc_ports[i].pm_cpc_data[j].cbm_modex_message);
90 }
91 }
92 }
93 free(ib_proc->proc_ports);
94 }
95 OBJ_DESTRUCT(&ib_proc->proc_lock);
96
97 OPAL_LIST_DESTRUCT(&ib_proc->openib_btls);
98 }
99
100
101 /*
102 * Look for an existing IB process instances based on the associated
103 * opal_proc_t instance.
104 */
ibproc_lookup_no_lock(opal_proc_t * proc)105 static mca_btl_openib_proc_t* ibproc_lookup_no_lock(opal_proc_t* proc)
106 {
107 mca_btl_openib_proc_t* ib_proc;
108
109 OPAL_LIST_FOREACH(ib_proc, &mca_btl_openib_component.ib_procs, mca_btl_openib_proc_t) {
110 if(ib_proc->proc_opal == proc) {
111 return ib_proc;
112 }
113 }
114 return NULL;
115 }
116
ibproc_lookup_and_lock(opal_proc_t * proc)117 static mca_btl_openib_proc_t* ibproc_lookup_and_lock(opal_proc_t* proc)
118 {
119 mca_btl_openib_proc_t* ib_proc;
120
121 /* get the process from the list */
122 opal_mutex_lock(&mca_btl_openib_component.ib_lock);
123 ib_proc = ibproc_lookup_no_lock(proc);
124 opal_mutex_unlock(&mca_btl_openib_component.ib_lock);
125 if( NULL != ib_proc ){
126 /* if we were able to find it - lock it.
127 * NOTE: we want to lock it outside of list locked region */
128 opal_mutex_lock(&ib_proc->proc_lock);
129 }
130 return ib_proc;
131 }
132
/*
 * Read a single byte from the cursor *src into *value and advance the
 * cursor by one.
 */
static inline void unpack8(char **src, uint8_t *value)
{
    char *cursor = *src;

    /* Copy one character */
    *value = (uint8_t) cursor[0];
    /* Move the src ahead one */
    *src = cursor + 1;
}
140
141 /*
142 * Create a IB process structure. There is a one-to-one correspondence
143 * between a opal_proc_t and a mca_btl_openib_proc_t instance. We
144 * cache additional data (specifically the list of
145 * mca_btl_openib_endpoint_t instances, and published addresses)
146 * associated w/ a given destination on this datastructure.
147 */
148
mca_btl_openib_proc_get_locked(opal_proc_t * proc)149 mca_btl_openib_proc_t* mca_btl_openib_proc_get_locked(opal_proc_t* proc)
150 {
151 mca_btl_openib_proc_t *ib_proc = NULL, *ib_proc_ret = NULL;
152 size_t msg_size;
153 uint32_t size;
154 int rc, i, j;
155 void *message;
156 char *offset;
157 int modex_message_size;
158 mca_btl_openib_modex_message_t dummy;
159 bool is_new = false;
160
161 /* Check if we have already created a IB proc
162 * structure for this ompi process */
163 ib_proc = ibproc_lookup_and_lock(proc);
164 if (NULL != ib_proc) {
165 /* Gotcha! */
166 return ib_proc;
167 }
168
169 /* All initialization has to be an atomic operation. we do the following assumption:
170 * - we let all concurent threads to try to do the initialization;
171 * - when one has finished it locks ib_lock and checks if corresponding
172 * process is still missing;
173 * - if so - new proc is added, otherwise - initialized proc struct is released.
174 */
175
176 /* First time, gotta create a new IB proc
177 * out of the opal_proc ... */
178 ib_proc = OBJ_NEW(mca_btl_openib_proc_t);
179 if (NULL == ib_proc) {
180 return NULL;
181 }
182
183 /* Initialize number of peer */
184 ib_proc->proc_endpoint_count = 0;
185 ib_proc->proc_opal = proc;
186
187 /* query for the peer address info */
188 OPAL_MODEX_RECV(rc, &mca_btl_openib_component.super.btl_version,
189 &proc->proc_name, &message, &msg_size);
190 if (OPAL_SUCCESS != rc) {
191 BTL_VERBOSE(("[%s:%d] opal_modex_recv failed for peer %s",
192 __FILE__, __LINE__,
193 OPAL_NAME_PRINT(proc->proc_name)));
194 goto no_err_exit;
195 }
196 if (0 == msg_size) {
197 goto no_err_exit;
198 }
199
200 /* Message was packed in btl_openib_component.c; the format is
201 listed in a comment in that file */
202 modex_message_size = ((char *) &(dummy.end)) - ((char*) &dummy);
203
204 /* Unpack the number of modules in the message */
205 offset = (char *) message;
206 unpack8(&offset, &(ib_proc->proc_port_count));
207 BTL_VERBOSE(("unpack: %d btls", ib_proc->proc_port_count));
208 if (ib_proc->proc_port_count > 0) {
209 ib_proc->proc_ports = (mca_btl_openib_proc_modex_t *)
210 malloc(sizeof(mca_btl_openib_proc_modex_t) *
211 ib_proc->proc_port_count);
212 } else {
213 ib_proc->proc_ports = NULL;
214 }
215
216 /* Loop over unpacking all the ports */
217 for (i = 0; i < ib_proc->proc_port_count; i++) {
218
219 /* Unpack the modex comment message struct */
220 size = modex_message_size;
221 memcpy(&(ib_proc->proc_ports[i].pm_port_info), offset, size);
222 #if !defined(WORDS_BIGENDIAN) && OPAL_ENABLE_HETEROGENEOUS_SUPPORT
223 MCA_BTL_OPENIB_MODEX_MSG_NTOH(ib_proc->proc_ports[i].pm_port_info);
224 #endif
225 offset += size;
226 BTL_VERBOSE(("unpacked btl %d: modex message, offset now %d",
227 i, (int)(offset-((char*)message))));
228
229 /* Unpack the number of CPCs that follow */
230 unpack8(&offset, &(ib_proc->proc_ports[i].pm_cpc_data_count));
231 BTL_VERBOSE(("unpacked btl %d: number of cpcs to follow %d (offset now %d)",
232 i, ib_proc->proc_ports[i].pm_cpc_data_count,
233 (int)(offset-((char*)message))));
234 ib_proc->proc_ports[i].pm_cpc_data = (opal_btl_openib_connect_base_module_data_t *)
235 calloc(ib_proc->proc_ports[i].pm_cpc_data_count,
236 sizeof(opal_btl_openib_connect_base_module_data_t));
237 if (NULL == ib_proc->proc_ports[i].pm_cpc_data) {
238 goto err_exit;
239 }
240
241 /* Unpack the CPCs */
242 for (j = 0; j < ib_proc->proc_ports[i].pm_cpc_data_count; ++j) {
243 uint8_t u8;
244 opal_btl_openib_connect_base_module_data_t *cpcd;
245 cpcd = ib_proc->proc_ports[i].pm_cpc_data + j;
246 unpack8(&offset, &u8);
247 BTL_VERBOSE(("unpacked btl %d: cpc %d: index %d (offset now %d)",
248 i, j, u8, (int)(offset-(char*)message)));
249 cpcd->cbm_component =
250 opal_btl_openib_connect_base_get_cpc_byindex(u8);
251 BTL_VERBOSE(("unpacked btl %d: cpc %d: component %s",
252 i, j, cpcd->cbm_component->cbc_name));
253
254 unpack8(&offset, &cpcd->cbm_priority);
255 unpack8(&offset, &cpcd->cbm_modex_message_len);
256 BTL_VERBOSE(("unpacked btl %d: cpc %d: priority %d, msg len %d (offset now %d)",
257 i, j, cpcd->cbm_priority,
258 cpcd->cbm_modex_message_len,
259 (int)(offset-(char*)message)));
260 if (cpcd->cbm_modex_message_len > 0) {
261 cpcd->cbm_modex_message = malloc(cpcd->cbm_modex_message_len);
262 if (NULL == cpcd->cbm_modex_message) {
263 BTL_ERROR(("Failed to malloc"));
264 goto err_exit;
265 }
266 memcpy(cpcd->cbm_modex_message, offset,
267 cpcd->cbm_modex_message_len);
268 offset += cpcd->cbm_modex_message_len;
269 BTL_VERBOSE(("unpacked btl %d: cpc %d: blob unpacked %d %x (offset now %d)",
270 i, j,
271 ((uint32_t*)cpcd->cbm_modex_message)[0],
272 ((uint32_t*)cpcd->cbm_modex_message)[1],
273 (int)(offset-((char*)message))));
274 }
275 }
276 }
277
278 if (0 == ib_proc->proc_port_count) {
279 ib_proc->proc_endpoints = NULL;
280 } else {
281 ib_proc->proc_endpoints = (volatile mca_btl_base_endpoint_t**)
282 malloc(ib_proc->proc_port_count *
283 sizeof(mca_btl_base_endpoint_t*));
284 }
285 if (NULL == ib_proc->proc_endpoints) {
286 goto err_exit;
287 }
288
289 BTL_VERBOSE(("unpacking done!"));
290
291 /* Finally add this process to the initialized procs list */
292 opal_mutex_lock(&mca_btl_openib_component.ib_lock);
293
294 ib_proc_ret = ibproc_lookup_no_lock(proc);
295 if (NULL == ib_proc_ret) {
296 /* if process can't be found in this list - insert it locked
297 * it is safe to lock ib_proc here because this thread is
298 * the only one who knows about it so far */
299 opal_mutex_lock(&ib_proc->proc_lock);
300 opal_list_append(&mca_btl_openib_component.ib_procs, &ib_proc->super);
301 ib_proc_ret = ib_proc;
302 is_new = true;
303 } else {
304 /* otherwise - release module_proc */
305 OBJ_RELEASE(ib_proc);
306 }
307 opal_mutex_unlock(&mca_btl_openib_component.ib_lock);
308
309 /* if we haven't insert the process - lock it here so we
310 * won't lock mca_btl_openib_component.ib_lock */
311 if( !is_new ){
312 opal_mutex_lock(&ib_proc_ret->proc_lock);
313 }
314
315 return ib_proc_ret;
316
317 err_exit:
318
319 BTL_ERROR(("%d: error exit from mca_btl_openib_proc_create", OPAL_PROC_MY_NAME.vpid));
320
321 no_err_exit:
322
323 OBJ_RELEASE(ib_proc);
324 return NULL;
325 }
326
/*
 * Remove "endpoint" from the IB proc associated with "proc".
 *
 * The matching slot in the endpoint array is set to NULL; the
 * endpoint count is only decremented when the removed endpoint
 * occupied the last slot.
 *
 * Returns OPAL_SUCCESS when the endpoint was found and removed,
 * OPAL_ERR_NOT_FOUND when either no IB proc exists for "proc" or the
 * endpoint is not attached to it.  The proc lock taken by the lookup
 * is released on every path (it used to stay held when the endpoint
 * was not found).
 */
int mca_btl_openib_proc_remove(opal_proc_t *proc,
                               mca_btl_base_endpoint_t *endpoint)
{
    size_t i;
    int rc = OPAL_ERR_NOT_FOUND;
    mca_btl_openib_proc_t* ib_proc = NULL;

    /* Remove endpoint from the openib BTL version of the proc as
       well */
    ib_proc = ibproc_lookup_and_lock(proc);
    if (NULL == ib_proc) {
        /* nothing found, hence nothing was locked */
        return rc;
    }

    for (i = 0; i < ib_proc->proc_endpoint_count; ++i) {
        if (ib_proc->proc_endpoints[i] == endpoint) {
            ib_proc->proc_endpoints[i] = NULL;
            if (i == ib_proc->proc_endpoint_count - 1) {
                --ib_proc->proc_endpoint_count;
            }
            rc = OPAL_SUCCESS;
            break;
        }
    }

    opal_mutex_unlock(&ib_proc->proc_lock);
    return rc;
}
351
352 /*
353 * Note that this routine must be called with the lock on the process
354 * already held. Insert a btl instance into the proc array and assign
355 * it an address.
356 */
mca_btl_openib_proc_insert(mca_btl_openib_proc_t * module_proc,mca_btl_base_endpoint_t * module_endpoint)357 int mca_btl_openib_proc_insert(mca_btl_openib_proc_t* module_proc,
358 mca_btl_base_endpoint_t* module_endpoint)
359 {
360 /* insert into endpoint array */
361
362
363 #ifndef WORDS_BIGENDIAN
364 /* if we are little endian and our peer is not so lucky, then we
365 need to put all information sent to him in big endian (aka
366 Network Byte Order) and expect all information received to
367 be in NBO. Since big endian machines always send and receive
368 in NBO, we don't care so much about that case. */
369 if (module_proc->proc_opal->proc_arch & OPAL_ARCH_ISBIGENDIAN) {
370 module_endpoint->nbo = true;
371 }
372 #endif
373
374 /* only allow eager rdma if the peers agree on the size of a long */
375 if((module_proc->proc_opal->proc_arch & OPAL_ARCH_LONGISxx) !=
376 (opal_proc_local_get()->proc_arch & OPAL_ARCH_LONGISxx)) {
377 module_endpoint->use_eager_rdma = false;
378 }
379
380 module_endpoint->endpoint_proc = module_proc;
381 module_proc->proc_endpoints[module_proc->proc_endpoint_count++] = module_endpoint;
382 return OPAL_SUCCESS;
383 }
384
/*
 * Record that "openib_btl" has touched "ib_proc".
 *
 * Returns OPAL_ERR_RESOURCE_BUSY when the BTL is already on the
 * proc's list (a normal, expected outcome), OPAL_ERR_OUT_OF_RESOURCE
 * when a new list element cannot be allocated, and OPAL_SUCCESS once
 * the BTL has been appended.
 */
int mca_btl_openib_proc_reg_btl(mca_btl_openib_proc_t* ib_proc,
                                mca_btl_openib_module_t* openib_btl)
{
    mca_btl_openib_proc_btlptr_t *entry;

    /* already registered? */
    OPAL_LIST_FOREACH(entry, &ib_proc->openib_btls, mca_btl_openib_proc_btlptr_t) {
        if (openib_btl == entry->openib_btl) {
            return OPAL_ERR_RESOURCE_BUSY;
        }
    }

    entry = OBJ_NEW(mca_btl_openib_proc_btlptr_t);
    if (NULL != entry) {
        entry->openib_btl = openib_btl;
        opal_list_append(&ib_proc->openib_btls, &entry->super);
        return OPAL_SUCCESS;
    }

    return OPAL_ERR_OUT_OF_RESOURCE;
}
405