1 /* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */
2 /*
3  * Copyright (c) 2011-2012 Los Alamos National Security, LLC. All rights
4  *                         reserved.
5  * Copyright (c) 2014      Research Organization for Information Science
6  *                         and Technology (RIST). All rights reserved.
7  *               2014      Mellanox Technologies, Inc.
8  *                         All rights reserved.
9  * $COPYRIGHT$
10  *
11  * Additional copyrights may follow
12  *
13  * $HEADER$
14  *
15  */
16 
17 #include "orte_config.h"
18 #include "orte/types.h"
19 #include "opal/types.h"
20 
21 #include "orte/util/name_fns.h"
22 #include "orte/runtime/orte_globals.h"
23 #include "orte/util/proc_info.h"
24 #include "orte/util/show_help.h"
25 
26 #include "orte/mca/routed/routed.h"
27 
28 #include "oob_ud.h"
29 #include "oob_ud_send.h"
30 
/* Smaller of two values.  Each argument may be evaluated twice, so do not
 * pass expressions with side effects. */
#define min(a, b) (((a) < (b)) ? (a) : (b))
32 
33 static int mca_oob_ud_module_init (void);
34 static void mca_oob_ud_module_fini (mca_oob_ud_peer_t **peer);
35 static int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri);
36 static void mca_oob_ud_send_nb(orte_rml_send_t *msg);
37 static void mca_oob_ud_ping(const orte_process_name_t *proc);
38 
39 mca_oob_ud_module_t mca_oob_ud_module = {
40     {
41         mca_oob_ud_module_init,
42         mca_oob_ud_module_fini,
43 
44         mca_oob_ud_set_addr,
45 
46         mca_oob_ud_ping,
47 
48         mca_oob_ud_send_nb
49     }
50 };
51 
/*
 * Non-blocking send entry point.
 *
 * Logs the destination of msg at verbosity 2 and then hands the message to
 * ORTE_ACTIVATE_UD_POST_SEND with mca_oob_ud_process_send_nb as the handler.
 * Nothing is reported back to the caller from here.
 */
static void mca_oob_ud_send_nb (orte_rml_send_t *msg)
{
    opal_output_verbose (2, orte_oob_base_framework.framework_output,
                         "%s oob:ud:send_nb to peer %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(&msg->dst));

    /* defer the actual work: push the message into our event base */
    ORTE_ACTIVATE_UD_POST_SEND(msg, mca_oob_ud_process_send_nb);
}
61 
mca_oob_ud_ping(const orte_process_name_t * proc)62 static void mca_oob_ud_ping(const orte_process_name_t *proc) {
63     opal_output_verbose(2, orte_oob_base_framework.framework_output,
64                         "%s oob:ud:ping proc %s",
65                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
66                         ORTE_NAME_PRINT(proc));
67 
68     /* push this into our event base for processing */
69     ORTE_ACTIVATE_UD_PING(proc, mca_oob_ud_process_ping);
70 }
71 
72 /* uri must be at least 27 bytes in size */
mca_oob_ud_port_get_uri(mca_oob_ud_port_t * port,char * uri)73 void mca_oob_ud_port_get_uri (mca_oob_ud_port_t *port, char *uri)
74 {
75     sprintf (uri, "ud://%u.%u.%u", port->listen_qp.ib_qp->qp_num,
76              port->lid, port->port_num);
77 }
78 
mca_oob_ud_set_addr(const orte_process_name_t * name,const char * uri)79 static int mca_oob_ud_set_addr (const orte_process_name_t *name, const char *uri)
80 {
81     mca_oob_ud_peer_t *peer = NULL;
82     int rc;
83 
84     opal_output_verbose(5, orte_oob_base_framework.framework_output,
85                          "%s oob:ud:set_addr: setting location for peer %s from %s",
86                          ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(name), uri);
87 
88     (void) mca_oob_ud_peer_lookup (name, &peer);
89 
90     if (NULL == uri) {
91         if (NULL != peer) {
92             mca_oob_ud_peer_release (peer);
93         }
94 
95         peer = NULL;
96     } else if (NULL == peer) {
97         peer = mca_oob_ud_peer_from_uri (uri);
98         if (NULL == peer) {
99             return ORTE_ERR_BAD_PARAM;
100         }
101     } else {
102         rc = mca_oob_ud_peer_update_with_uri (peer, uri);
103 
104         if (ORTE_SUCCESS != rc) {
105             return rc;
106         }
107     }
108 
109     if (NULL != peer) {
110         peer->peer_name = *name;
111         peer->needs_notification = true;
112     }
113 
114     opal_proc_table_set_value(&mca_oob_ud_module.peers,
115                               *name, (void *)peer);
116 
117     return ORTE_SUCCESS;
118 }
119 
/*
 * Post one receive work request on the listen queue pair of a port.
 *
 * port     port whose grh_buf / msg_buf regions and listen_qp are used
 * msg_num  index of the receive slot; selects the offset into both buffers
 *          and is OR-ed into the work request id
 *
 * The request carries two scatter/gather entries: the GRH slot
 * (sizeof (struct ibv_grh) bytes) followed by the message slot (port->mtu
 * bytes).  Returns whatever mca_oob_ud_qp_post_recv returns.
 */
int mca_oob_ud_port_post_one_recv (mca_oob_ud_port_t *port, int msg_num)
{
    struct ibv_sge sge[2];
    struct ibv_recv_wr wr;
    char *grh_slot = port->grh_buf.ptr + msg_num * sizeof (struct ibv_grh);
    char *msg_slot = port->msg_buf.ptr + msg_num * port->mtu;

    /* entry 0: GRH, entry 1: message payload */
    mca_oob_ud_fill_sge(&sge[0], grh_slot, sizeof (struct ibv_grh), port->grh_buf.mr->lkey);
    mca_oob_ud_fill_sge(&sge[1], msg_slot, port->mtu, port->msg_buf.mr->lkey);

    mca_oob_ud_fill_recv_wr (&wr, sge, 2);

    /* tag the request so the slot can be recognised from the work request id */
    wr.wr_id = MCA_OOB_UD_RECV_WR | (uint64_t)msg_num;

    return mca_oob_ud_qp_post_recv (&port->listen_qp, &wr);
}
138 
139 static bool module_has_been_inited = false;
140 
mca_oob_ud_module_init(void)141 static int mca_oob_ud_module_init (void)
142 {
143     /* protect against repeat inits */
144     if (module_has_been_inited) {
145         return ORTE_SUCCESS;
146     }
147     module_has_been_inited = true;
148 
149     OBJ_CONSTRUCT(&mca_oob_ud_module.peers, opal_proc_table_t);
150     opal_proc_table_init (&mca_oob_ud_module.peers, 16, 1024);
151 
152     return ORTE_SUCCESS;
153 }
154 
/*
 * Module teardown.
 *
 * Walks every entry of mca_oob_ud_module.peers, releasing each non-NULL
 * peer, then empties and destructs the table.
 *
 * peer  scratch location that receives each table value during the walk
 */
static void mca_oob_ud_module_fini (mca_oob_ud_peer_t **peer)
{
    opal_process_name_t key;
    void *node1, *node2;
    int rc;

    rc = opal_proc_table_get_first_key (&mca_oob_ud_module.peers, &key,
                                        (void **) peer, &node1, &node2);
    while (OPAL_SUCCESS == rc) {
        /* the table may hold NULL values; only real peers are released */
        if (NULL != *peer) {
            mca_oob_ud_peer_release (*peer);
        }

        rc = opal_proc_table_get_next_key (&mca_oob_ud_module.peers, &key,
                                           (void **) peer, node1, &node1, node2, &node2);
    }

    opal_proc_table_remove_all(&mca_oob_ud_module.peers);

    OBJ_DESTRUCT(&mca_oob_ud_module.peers);
}
179 
mca_oob_ud_register_iov(struct iovec * iov,int count,struct ibv_mr ** ib_mr,struct ibv_pd * ib_pd,unsigned int mtu,int * sge_countp,int * wr_countp,int * data_lenp)180 int mca_oob_ud_register_iov (struct iovec *iov, int count, struct ibv_mr **ib_mr,
181                              struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp,
182                              int *wr_countp, int *data_lenp)
183 {
184     int data_len, iov_index, sge_count;
185     unsigned int packet_size = 0;
186 
187     opal_output_verbose (80, orte_oob_base_framework.framework_output,
188                          "%s oob:ud:register_iov registering memory", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
189 
190     *wr_countp  = 0;
191     *data_lenp  = 0;
192     *sge_countp = 0;
193 
194     for (iov_index = 0, data_len = 0, sge_count = 0 ; iov_index < count ; ++iov_index) {
195         unsigned int iov_left = iov[iov_index].iov_len;
196 
197         data_len += iov_left;
198 
199         sge_count++;
200 
201         do {
202             unsigned int to_trans = min (iov_left, mtu - packet_size);
203 
204             packet_size = (to_trans < iov_left) ? 0 : packet_size + to_trans;
205             iov_left    -= to_trans;
206 
207             if (0 == packet_size && iov_left) {
208                 sge_count++;
209             }
210         } while (iov_left);
211 
212         /* register buffers */
213         if (NULL == ib_mr[iov_index]) {
214             ib_mr[iov_index] = ibv_reg_mr (ib_pd,
215                                            iov[iov_index].iov_base,
216                                            iov[iov_index].iov_len,
217                                            IBV_ACCESS_LOCAL_WRITE |
218                                            IBV_ACCESS_REMOTE_WRITE);
219             if (NULL == ib_mr[iov_index]) {
220                 /* Ruh-roh */
221                 orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
222                        orte_process_info.nodename, iov[iov_index].iov_base,
223                        iov[iov_index].iov_len,strerror(errno));
224                 return ORTE_ERR_OUT_OF_RESOURCE;
225             }
226         }
227     }
228 
229     *wr_countp  = (data_len + mtu - 1) / mtu;
230     *sge_countp = sge_count;
231     *data_lenp  = data_len;
232 
233     return ORTE_SUCCESS;
234 }
235 
mca_oob_ud_register_buf(char * buf,int size,struct ibv_mr ** ib_mr_buf,struct ibv_pd * ib_pd,unsigned int mtu,int * sge_countp,int * wr_countp)236 int mca_oob_ud_register_buf (char *buf, int size, struct ibv_mr **ib_mr_buf,
237                              struct ibv_pd *ib_pd, unsigned int mtu, int *sge_countp, int *wr_countp)
238 {
239     int sge_count = 0;
240     unsigned int packet_size = 0;
241 
242     opal_output_verbose (80, orte_oob_base_framework.framework_output,
243                          "%s oob:ud:mca_oob_ud_register_buf registering memory", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
244 
245     *wr_countp  = 0;
246     *sge_countp = 0;
247 
248     unsigned int iov_left = size;
249 
250     sge_count++;
251 
252     do {
253         unsigned int to_trans = min (iov_left, mtu - packet_size);
254 
255         packet_size = (to_trans < iov_left) ? 0 : packet_size + to_trans;
256         iov_left    -= to_trans;
257 
258         if (0 == packet_size && iov_left) {
259             sge_count++;
260         }
261     } while (iov_left);
262 
263     /* register buffers */
264     if (NULL == *ib_mr_buf) {
265         *ib_mr_buf = ibv_reg_mr (ib_pd, buf, size,
266                                 IBV_ACCESS_LOCAL_WRITE |
267                                 IBV_ACCESS_REMOTE_WRITE);
268         if (NULL == *ib_mr_buf) {
269             orte_show_help("help-oob-ud.txt", "reg-mr-failed", true,
270                        orte_process_info.nodename, buf, size, strerror(errno));
271             return ORTE_ERR_OUT_OF_RESOURCE;
272         }
273     }
274 
275     *wr_countp  = (size + mtu - 1) / mtu;
276     *sge_countp = sge_count;
277 
278     return ORTE_SUCCESS;
279 }
280