1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3 * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
4 * reserved.
5 * Copyright (c) 2014 Research Organization for Information Science
6 * and Technology (RIST). All rights reserved.
7 * 2014 Mellanox Technologies, Inc.
8 * All rights reserved.
9 * $COPYRIGHT$
10 *
11 * Additional copyrights may follow
12 *
13 * $HEADER$
14 *
15 */
16
17 #include "orte_config.h"
18 #include "orte/types.h"
19 #include "opal/types.h"
20
21 #include "orte/util/name_fns.h"
22 #include "orte/runtime/orte_globals.h"
23 #include "orte/util/proc_info.h"
24 #include "orte/util/show_help.h"
25
26 #include "orte/mca/routed/routed.h"
27
28 #include "oob_ud.h"
29 #include "oob_ud_send.h"
30
/* Smaller of two values.  This is a function-like macro, so either
 * argument may be evaluated more than once: do not pass expressions
 * with side effects. */
#define min(x, y) (((x) < (y)) ? (x) : (y))
32
33 static int mca_oob_ud_module_init (void);
34 static void mca_oob_ud_module_fini (mca_oob_ud_peer_t **peer);
35 static int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri);
36 static void mca_oob_ud_send_nb(orte_rml_send_t *msg);
37 static void mca_oob_ud_ping(const orte_process_name_t *proc);
38
39 mca_oob_ud_module_t mca_oob_ud_module = {
40 {
41 mca_oob_ud_module_init,
42 mca_oob_ud_module_fini,
43
44 mca_oob_ud_set_addr,
45
46 mca_oob_ud_ping,
47
48 mca_oob_ud_send_nb
49 }
50 };
51
/*
 * Non-blocking send entry point of the module.
 *
 * Emits a level-2 verbose trace naming the destination (msg->dst) and
 * then hands the message to ORTE_ACTIVATE_UD_POST_SEND together with
 * mca_oob_ud_process_send_nb.
 *
 * msg: the message to be sent
 */
static void mca_oob_ud_send_nb (orte_rml_send_t *msg)
{
    opal_output_verbose (2, orte_oob_base_framework.framework_output,
                         "%s oob:ud:send_nb to peer %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&msg->dst));

    /* push this into our event base for processing */
    ORTE_ACTIVATE_UD_POST_SEND(msg, mca_oob_ud_process_send_nb);
}
61
mca_oob_ud_ping(const orte_process_name_t * proc)62 static void mca_oob_ud_ping(const orte_process_name_t *proc) {
63 opal_output_verbose(2, orte_oob_base_framework.framework_output,
64 "%s oob:ud:ping proc %s",
65 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
66 ORTE_NAME_PRINT(proc));
67
68 /* push this into our event base for processing */
69 ORTE_ACTIVATE_UD_PING(proc, mca_oob_ud_process_ping);
70 }
71
72 /* uri must be at least 27 bytes in size */
mca_oob_ud_port_get_uri(mca_oob_ud_port_t * port,char * uri)73 void mca_oob_ud_port_get_uri (mca_oob_ud_port_t *port, char *uri)
74 {
75 sprintf (uri, "ud://%u.%u.%u", port->listen_qp.ib_qp->qp_num,
76 port->lid, port->port_num);
77 }
78
mca_oob_ud_set_addr(const orte_process_name_t * name,const char * uri)79 static int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri)
80 {
81 mca_oob_ud_peer_t *peer = NULL;
82 int rc;
83
84 opal_output_verbose(5, orte_oob_base_framework.framework_output,
85 "%s oob:ud:set_addr: setting location for peer %s from %s",
86 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name), uri);
87
88 (void) mca_oob_ud_peer_lookup (name, &peer);
89
90 if (NULL == uri) {
91 if (NULL != peer) {
92 mca_oob_ud_peer_release (peer);
93 }
94
95 peer = NULL;
96 } else if (NULL == peer) {
97 peer = mca_oob_ud_peer_from_uri (uri);
98 if (NULL == peer) {
99 return ORTE_ERR_BAD_PARAM;
100 }
101 } else {
102 rc = mca_oob_ud_peer_update_with_uri (peer, uri);
103
104 if (ORTE_SUCCESS != rc) {
105 return rc;
106 }
107 }
108
109 if (NULL != peer) {
110 peer->peer_name = *name;
111 peer->needs_notification = true;
112 }
113
114 opal_proc_table_set_value(&mca_oob_ud_module.peers,
115 *name, (void *)peer);
116
117 return ORTE_SUCCESS;
118 }
119
/*
 * Post a single receive work request on the port's listen QP.
 *
 * port:    port owning the GRH/message buffers and the listen QP
 * msg_num: index of the receive slot; selects the slice of each buffer
 *          and is encoded into the work request id
 *
 * The request uses two scatter/gather entries: one ibv_grh-sized slice
 * of port->grh_buf followed by one mtu-sized slice of port->msg_buf.
 *
 * Returns whatever mca_oob_ud_qp_post_recv returns.
 */
int mca_oob_ud_port_post_one_recv (mca_oob_ud_port_t *port, int msg_num)
{
    struct ibv_sge sge[2];
    struct ibv_recv_wr wr;
    char *grh_buf;
    char *msg_buf;

    /* locate the slices of both buffers that belong to slot msg_num */
    grh_buf = port->grh_buf.ptr + msg_num * sizeof (struct ibv_grh);
    msg_buf = port->msg_buf.ptr + msg_num * port->mtu;

    /* entry 0: GRH */
    mca_oob_ud_fill_sge(&sge[0], grh_buf, sizeof (struct ibv_grh), port->grh_buf.mr->lkey);

    /* entry 1: message */
    mca_oob_ud_fill_sge(&sge[1], msg_buf, port->mtu, port->msg_buf.mr->lkey);

    mca_oob_ud_fill_recv_wr (&wr, sge, 2);

    /* tag the request as a receive and remember which slot it uses */
    wr.wr_id = MCA_OOB_UD_RECV_WR | (uint64_t)msg_num;

    return mca_oob_ud_qp_post_recv (&port->listen_qp, &wr);
}
138
139 static bool module_has_been_inited = false;
140
mca_oob_ud_module_init(void)141 static int mca_oob_ud_module_init (void)
142 {
143 /* protect against repeat inits */
144 if (module_has_been_inited) {
145 return ORTE_SUCCESS;
146 }
147 module_has_been_inited = true;
148
149 OBJ_CONSTRUCT(&mca_oob_ud_module.peers, opal_proc_table_t);
150 opal_proc_table_init (&mca_oob_ud_module.peers, 16, 1024);
151
152 return ORTE_SUCCESS;
153 }
154
/*
 * Module finalize entry point: releases every peer stored in
 * mca_oob_ud_module.peers, empties the table and destructs it.
 *
 * peer: caller-provided scratch slot; each table value is written
 *       through it while the table is walked
 *
 * Does nothing unless the module is currently initialized, so the peer
 * table is never destructed without having been constructed (fini
 * without init, or fini called twice).  The init flag is cleared at the
 * end so that a later mca_oob_ud_module_init rebuilds the table instead
 * of treating the destructed one as ready.
 */
static void mca_oob_ud_module_fini (mca_oob_ud_peer_t **peer)
{
    opal_process_name_t key;
    void *node1, *node2;
    int rc;

    /* the peer table only exists between init and fini */
    if (!module_has_been_inited) {
        return;
    }

    /* walk the whole table, releasing every non-NULL peer */
    rc = opal_proc_table_get_first_key (&mca_oob_ud_module.peers, &key,
                                        (void **) peer, &node1, &node2);
    while (OPAL_SUCCESS == rc) {
        if (NULL != *peer) {
            mca_oob_ud_peer_release (*peer);
        }
        rc = opal_proc_table_get_next_key (&mca_oob_ud_module.peers, &key,
                                           (void **) peer, node1, &node1, node2, &node2);
    }

    opal_proc_table_remove_all(&mca_oob_ud_module.peers);

    OBJ_DESTRUCT(&mca_oob_ud_module.peers);

    /* let a subsequent init construct the table again */
    module_has_been_inited = false;
}
179
mca_oob_ud_register_iov(struct iovec * iov,int count,struct ibv_mr ** ib_mr,struct ibv_pd * ib_pd,unsigned int mtu,int * sge_countp,int * wr_countp,int * data_lenp)180 int mca_oob_ud_register_iov (struct iovec *iov, int count, struct ibv_mr **ib_mr,
181 struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp,
182 int *wr_countp, int *data_lenp)
183 {
184 int data_len, iov_index, sge_count;
185 unsigned int packet_size = 0;
186
187 opal_output_verbose (80, orte_oob_base_framework.framework_output,
188 "%s oob:ud:register_iov registering memory", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
189
190 *wr_countp = 0;
191 *data_lenp = 0;
192 *sge_countp = 0;
193
194 for (iov_index = 0, data_len = 0, sge_count = 0 ; iov_index < count ; ++iov_index) {
195 unsigned int iov_left = iov[iov_index].iov_len;
196
197 data_len += iov_left;
198
199 sge_count++;
200
201 do {
202 unsigned int to_trans = min (iov_left, mtu - packet_size);
203
204 packet_size = (to_trans < iov_left) ? 0 : packet_size + to_trans;
205 iov_left -= to_trans;
206
207 if (0 == packet_size && iov_left) {
208 sge_count++;
209 }
210 } while (iov_left);
211
212 /* register buffers */
213 if (NULL == ib_mr[iov_index]) {
214 ib_mr[iov_index] = ibv_reg_mr (ib_pd,
215 iov[iov_index].iov_base,
216 iov[iov_index].iov_len,
217 IBV_ACCESS_LOCAL_WRITE |
218 IBV_ACCESS_REMOTE_WRITE);
219 if (NULL == ib_mr[iov_index]) {
220 /* Ruh-roh */
221 orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
222 orte_process_info.nodename, iov[iov_index].iov_base,
223 iov[iov_index].iov_len,strerror(errno));
224 return ORTE_ERR_OUT_OF_RESOURCE;
225 }
226 }
227 }
228
229 *wr_countp = (data_len + mtu - 1) / mtu;
230 *sge_countp = sge_count;
231 *data_lenp = data_len;
232
233 return ORTE_SUCCESS;
234 }
235
mca_oob_ud_register_buf(char * buf,int size,struct ibv_mr ** ib_mr_buf,struct ibv_pd * ib_pd,unsigned int mtu,int * sge_countp,int * wr_countp)236 int mca_oob_ud_register_buf (char *buf, int size, struct ibv_mr **ib_mr_buf,
237 struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp, int *wr_countp)
238 {
239 int sge_count = 0;
240 unsigned int packet_size = 0;
241
242 opal_output_verbose (80, orte_oob_base_framework.framework_output,
243 "%s oob:ud:mca_oob_ud_register_buf registering memory", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
244
245 *wr_countp = 0;
246 *sge_countp = 0;
247
248 unsigned int iov_left = size;
249
250 sge_count++;
251
252 do {
253 unsigned int to_trans = min (iov_left, mtu - packet_size);
254
255 packet_size = (to_trans < iov_left) ? 0 : packet_size + to_trans;
256 iov_left -= to_trans;
257
258 if (0 == packet_size && iov_left) {
259 sge_count++;
260 }
261 } while (iov_left);
262
263 /* register buffers */
264 if (NULL == *ib_mr_buf) {
265 *ib_mr_buf = ibv_reg_mr (ib_pd, buf, size,
266 IBV_ACCESS_LOCAL_WRITE |
267 IBV_ACCESS_REMOTE_WRITE);
268 if (NULL == *ib_mr_buf) {
269 orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
270 orte_process_info.nodename, buf, size, strerror(errno));
271 return ORTE_ERR_OUT_OF_RESOURCE;
272 }
273 }
274
275 *wr_countp = (size + mtu - 1) / mtu;
276 *sge_countp = sge_count;
277
278 return ORTE_SUCCESS;
279 }
280