1 /*
2  * Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
3  * Copyright (c) 2017      Amazon.com, Inc. or its affiliates.
4  *                         All Rights reserved.
5  * Portions of this software copied from libfabric
6  * (https://github.com/ofiwg/libfabric)
7  *
8  * LICENSE_BEGIN
9  *
10  * BSD license:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
28  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
29  * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
32  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
33  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
35  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  *
38  * LICENSE_END
39  *
40  *
41  */
42 
43 #include "opal_config.h"
44 
45 #include <errno.h>
46 #include <arpa/inet.h>
47 #include <time.h>
48 #ifdef HAVE_NETINET_IN_H
49 #include <netinet/in.h>
50 #endif
51 
52 #include "libnl_utils.h"
53 
54 /* Adapt this copied code for Open MPI */
55 #include "opal/util/output.h"
56 
57 
58 static struct nla_policy route_policy[RTA_MAX+1] = {
59 	[RTA_IIF]	= { .type = NLA_STRING,
60 			    .maxlen = IFNAMSIZ, },
61 	[RTA_OIF]	= { .type = NLA_U32 },
62 	[RTA_PRIORITY]	= { .type = NLA_U32 },
63 	[RTA_FLOW]	= { .type = NLA_U32 },
64 	[RTA_MP_ALGO]	= { .type = NLA_U32 },
65 	[RTA_CACHEINFO]	= { .minlen = sizeof(struct rta_cacheinfo) },
66 	[RTA_METRICS]	= { .type = NLA_NESTED },
67 	[RTA_MULTIPATH]	= { .type = NLA_NESTED },
68 };
69 
opal_reachable_netlink_is_nlreply_expected(struct opal_reachable_netlink_sk * unlsk,struct nlmsghdr * nlm_hdr)70 static int opal_reachable_netlink_is_nlreply_expected(struct opal_reachable_netlink_sk *unlsk,
71 						      struct nlmsghdr *nlm_hdr)
72 {
73 #if OPAL_ENABLE_DEBUG
74     if (nlm_hdr->nlmsg_pid != nl_socket_get_local_port(unlsk->nlh)
75         || nlm_hdr->nlmsg_seq != unlsk->seq) {
76         opal_output(0, "Not an expected reply msg pid: %u local pid: %u msg seq: %u expected seq: %u\n",
77                     nlm_hdr->nlmsg_pid,
78                     nl_socket_get_local_port(unlsk->nlh),
79                     nlm_hdr->nlmsg_seq, unlsk->seq);
80         return 0;
81     }
82 #endif
83 
84     return 1;
85 }
86 
opal_reachable_netlink_is_nlreply_err(struct nlmsghdr * nlm_hdr)87 static int opal_reachable_netlink_is_nlreply_err(struct nlmsghdr *nlm_hdr)
88 {
89     if (nlm_hdr->nlmsg_type == NLMSG_ERROR) {
90         struct nlmsgerr *e = (struct nlmsgerr *)nlmsg_data(nlm_hdr);
91         if (nlm_hdr->nlmsg_len >= (__u32)NLMSG_SIZE(sizeof(*e)))
92             opal_output_verbose(20, 0,
93                                 "Received a netlink error message");
94         else
95             opal_output_verbose(20, 0,
96                                 "Received a truncated netlink error message\n");
97         return 1;
98     }
99 
100     return 0;
101 }
102 
opal_reachable_netlink_send_query(struct opal_reachable_netlink_sk * unlsk,struct nl_msg * msg,int protocol,int flag)103 static int opal_reachable_netlink_send_query(struct opal_reachable_netlink_sk *unlsk,
104 					     struct nl_msg *msg,
105 					     int protocol, int flag)
106 {
107     struct nlmsghdr *nlhdr;
108 
109     nlhdr = nlmsg_hdr(msg);
110     nlhdr->nlmsg_pid = nl_socket_get_local_port(unlsk->nlh);
111     nlhdr->nlmsg_seq = ++unlsk->seq;
112     nlmsg_set_proto(msg, protocol);
113     nlhdr->nlmsg_flags = flag;
114 
115     return nl_send(unlsk->nlh, msg);
116 }
117 
/*
 * Put a one-second receive timeout (SO_RCVTIMEO) on the netlink socket
 * behind `nlh`.
 *
 * Returns the setsockopt() result unchanged; a negative value means the
 * option could not be set (reported via opal_output in debug builds only).
 */
static int opal_reachable_netlink_set_rcvsk_timer(NL_HANDLE *nlh)
{
    struct timeval one_second = { .tv_sec = 1, .tv_usec = 0 };
    int rc;

    rc = setsockopt(nl_socket_get_fd(nlh), SOL_SOCKET, SO_RCVTIMEO,
                    &one_second, sizeof(one_second));

#if OPAL_ENABLE_DEBUG
    if (rc < 0) {
        opal_output(0, "Failed to set SO_RCVTIMEO for nl socket");
    }
#endif

    return rc;
}
135 
opal_reachable_netlink_sk_alloc(struct opal_reachable_netlink_sk ** p_sk,int protocol)136 static int opal_reachable_netlink_sk_alloc(struct opal_reachable_netlink_sk **p_sk, int protocol)
137 {
138     struct opal_reachable_netlink_sk *unlsk;
139     NL_HANDLE *nlh;
140     int err;
141 
142     unlsk = calloc(1, sizeof(*unlsk));
143     if (!unlsk) {
144         opal_output(0, "Failed to allocate opal_reachable_netlink_sk struct\n");
145         return ENOMEM;
146     }
147 
148     nlh = NL_HANDLE_ALLOC();
149     if (!nlh) {
150         opal_output(0, "Failed to allocate nl handle\n");
151         err = ENOMEM;
152         goto err_free_unlsk;
153     }
154 
155     err = nl_connect(nlh, protocol);
156     if (err < 0) {
157         opal_output(0, "Failed to connnect netlink route socket error: %s\n",
158                     NL_GETERROR(err));
159         err = EINVAL;
160         goto err_free_nlh;
161     }
162 
163     NL_DISABLE_SEQ_CHECK(nlh);
164     err = opal_reachable_netlink_set_rcvsk_timer(nlh);
165     if (err < 0)
166         goto err_close_nlh;
167 
168     unlsk->nlh = nlh;
169     unlsk->seq = time(NULL);
170     *p_sk = unlsk;
171     return 0;
172 
173  err_close_nlh:
174     nl_close(nlh);
175  err_free_nlh:
176     NL_HANDLE_FREE(nlh);
177  err_free_unlsk:
178     free(unlsk);
179     return err;
180 }
181 
opal_reachable_netlink_sk_free(struct opal_reachable_netlink_sk * unlsk)182 static void opal_reachable_netlink_sk_free(struct opal_reachable_netlink_sk *unlsk)
183 {
184     nl_close(unlsk->nlh);
185     NL_HANDLE_FREE(unlsk->nlh);
186     free(unlsk);
187 }
188 
opal_reachable_netlink_rt_raw_parse_cb(struct nl_msg * msg,void * arg)189 static int opal_reachable_netlink_rt_raw_parse_cb(struct nl_msg *msg, void *arg)
190 {
191     struct opal_reachable_netlink_rt_cb_arg *lookup_arg = (struct opal_reachable_netlink_rt_cb_arg *)arg;
192     struct opal_reachable_netlink_sk *unlsk = lookup_arg->unlsk;
193     struct nlmsghdr *nlm_hdr = nlmsg_hdr(msg);
194     struct rtmsg *rtm;
195     struct nlattr *tb[RTA_MAX + 1];
196     int found = 0;
197     int err;
198 
199     INC_CB_MSGCNT(lookup_arg);
200 
201     if (!opal_reachable_netlink_is_nlreply_expected(unlsk, nlm_hdr)) {
202 #if OPAL_ENABLE_DEBUG
203         nl_msg_dump(msg, stderr);
204 #endif
205         return NL_SKIP;
206     }
207 
208     if (opal_reachable_netlink_is_nlreply_err(nlm_hdr)) {
209 #if OPAL_ENABLE_DEBUG
210         nl_msg_dump(msg, stderr);
211 #endif
212         return NL_SKIP;
213     }
214 
215     if (nlm_hdr->nlmsg_type != RTM_NEWROUTE) {
216 #if OPAL_ENABLE_DEBUG
217         char buf[128];
218         nl_nlmsgtype2str(nlm_hdr->nlmsg_type, buf, sizeof(buf));
219         opal_output(0, "Received an invalid route request reply message type: %s\n",
220                     buf);
221         nl_msg_dump(msg, stderr);
222 #endif
223         return NL_SKIP;
224     }
225 
226     rtm = nlmsg_data(nlm_hdr);
227     if (rtm->rtm_family != AF_INET
228 #if OPAL_ENABLE_IPV6
229 	&& rtm->rtm_family != AF_INET6
230 #endif
231 	) {
232 #if OPAL_ENABLE_DEBUG
233         opal_output(0, "RTM message contains invalid AF family: %u\n",
234                     rtm->rtm_family);
235         nl_msg_dump(msg, stderr);
236 #endif
237         return NL_SKIP;
238     }
239 
240     err = nlmsg_parse(nlm_hdr, sizeof(struct rtmsg), tb, RTA_MAX,
241                       route_policy);
242     if (err < 0) {
243 #if OPAL_ENABLE_DEBUG
244         opal_output(0, "nlmsg parse error %s\n", NL_GETERROR(err));
245         nl_msg_dump(msg, stderr);
246 #endif
247         return NL_SKIP;
248     }
249 
250     if (tb[RTA_OIF]) {
251         if (nla_get_u32(tb[RTA_OIF]) == (uint32_t)lookup_arg->oif)
252             found = 1;
253         else
254             /* usually, this means that there is a route to the remote
255                host, but that it's not through the given interface.  For
256                our purposes, that means it's not reachable. */
257             opal_output_verbose(20, 0, "Retrieved route has a different outgoing interface %d (expected %d)\n",
258 				nla_get_u32(tb[RTA_OIF]),
259 				lookup_arg->oif);
260     }
261 
262     if (found && tb[RTA_GATEWAY]) {
263         lookup_arg->has_gateway = 1;
264     }
265     lookup_arg->found = found;
266     return NL_STOP;
267 }
268 
opal_reachable_netlink_rt_lookup(uint32_t src_addr,uint32_t dst_addr,int outgoing_interface,int * has_gateway)269 int opal_reachable_netlink_rt_lookup(uint32_t src_addr,
270 				     uint32_t dst_addr,
271 				     int outgoing_interface,
272                                      int *has_gateway)
273 {
274     struct opal_reachable_netlink_sk *unlsk; /* netlink socket */
275     struct nl_msg *nlm; /* netlink message */
276     struct rtmsg rmsg; /* route message */
277     struct opal_reachable_netlink_rt_cb_arg arg; /* callback argument */
278     int err;
279 
280     /* allocate netlink socket */
281     unlsk = NULL;
282     err = opal_reachable_netlink_sk_alloc(&unlsk, NETLINK_ROUTE);
283     if (err)
284         return err;
285 
286     /* allocate route message */
287     memset(&rmsg, 0, sizeof(rmsg));
288     rmsg.rtm_family = AF_INET;
289     rmsg.rtm_dst_len = sizeof(dst_addr) * CHAR_BIT;
290     rmsg.rtm_src_len = sizeof(src_addr) * CHAR_BIT;
291 
292     /* allocate netlink message of type RTM_GETROUTE */
293     nlm = nlmsg_alloc_simple(RTM_GETROUTE, 0);
294     if (!nlm) {
295         opal_output(0, "Failed to alloc nl message, %s\n",
296                     NL_GETERROR(err));
297         err = ENOMEM;
298         goto out;
299     }
300 
301     /* append route message and addresses to netlink message.   */
302     nlmsg_append(nlm, &rmsg, sizeof(rmsg), NLMSG_ALIGNTO);
303     nla_put_u32(nlm, RTA_DST, dst_addr);
304     nla_put_u32(nlm, RTA_SRC, src_addr);
305 
306     /* query kernel */
307     err = opal_reachable_netlink_send_query(unlsk, nlm, NETLINK_ROUTE, NLM_F_REQUEST);
308     nlmsg_free(nlm);
309     if (err < 0) {
310         opal_output(0, "Failed to send RTM_GETROUTE query message, error %s\n",
311                     NL_GETERROR(err));
312         err = EINVAL;
313         goto out;
314     }
315 
316     /* Setup callback function */
317     memset(&arg, 0, sizeof(arg));
318     arg.oif = outgoing_interface;
319     arg.unlsk = unlsk;
320     err = nl_socket_modify_cb(unlsk->nlh, NL_CB_MSG_IN, NL_CB_CUSTOM,
321                               opal_reachable_netlink_rt_raw_parse_cb, &arg);
322     if (err != 0) {
323         opal_output(0, "Failed to setup callback function, error %s\n",
324                     NL_GETERROR(err));
325         err = EINVAL;
326         goto out;
327     }
328 
329     /* recieve results */
330     NL_RECVMSGS(unlsk->nlh, arg, EHOSTUNREACH, err, out);
331 
332     /* check whether a route was found */
333     if (arg.found) {
334         *has_gateway = arg.has_gateway;
335         err = 0;
336     } else {
337         *has_gateway = 0;
338         err = EHOSTUNREACH;
339     }
340 
341  out:
342     opal_reachable_netlink_sk_free(unlsk);
343     return err;
344 }
345 
346 
347 #if OPAL_ENABLE_IPV6
opal_reachable_netlink_rt_lookup6(struct in6_addr * src_addr,struct in6_addr * dst_addr,int outgoing_interface,int * has_gateway)348 int opal_reachable_netlink_rt_lookup6(struct in6_addr *src_addr,
349 				      struct in6_addr *dst_addr,
350 				      int outgoing_interface,
351 				      int *has_gateway)
352 {
353 
354     struct opal_reachable_netlink_sk *unlsk; /* netlink socket */
355     struct nl_msg *nlm; /* netlink message */
356     struct rtmsg rmsg; /* route message */
357     struct opal_reachable_netlink_rt_cb_arg arg; /* callback argument */
358     int err;
359 
360     /* allocate netlink socket */
361     unlsk = NULL;
362     err = opal_reachable_netlink_sk_alloc(&unlsk, NETLINK_ROUTE);
363     if (err)
364 	return err;
365 
366     /* allocate route message */
367     memset(&rmsg, 0, sizeof(rmsg));
368     rmsg.rtm_family = AF_INET6;
369     rmsg.rtm_dst_len = sizeof(*dst_addr) * CHAR_BIT;
370     rmsg.rtm_src_len = sizeof(*src_addr) * CHAR_BIT;
371 
372     /* allocate netlink message of type RTM_GETROUTE */
373     nlm = nlmsg_alloc_simple(RTM_GETROUTE, 0);
374     if (!nlm) {
375 	opal_output(0, "Failed to alloc nl message, %s\n",
376 		    NL_GETERROR(err));
377 	err = ENOMEM;
378 	goto out;
379     }
380 
381     /* append route message and addresses to netlink message.   */
382     nlmsg_append(nlm, &rmsg, sizeof(rmsg), NLMSG_ALIGNTO);
383     nla_put(nlm, RTA_DST, sizeof(dst_addr->s6_addr), &(dst_addr->s6_addr));
384     nla_put(nlm, RTA_SRC, sizeof(src_addr->s6_addr), &(src_addr->s6_addr));
385 
386     /* query kernel */
387     err = opal_reachable_netlink_send_query(unlsk, nlm, NETLINK_ROUTE, NLM_F_REQUEST);
388     nlmsg_free(nlm);
389     if (err < 0) {
390 	opal_output(0, "Failed to send RTM_GETROUTE query message, error %s\n",
391 		    NL_GETERROR(err));
392 	err = EINVAL;
393 	goto out;
394     }
395 
396     /* Setup callback function */
397     memset(&arg, 0, sizeof(arg));
398     arg.oif = outgoing_interface;
399     arg.unlsk = unlsk;
400     err = nl_socket_modify_cb(unlsk->nlh, NL_CB_MSG_IN, NL_CB_CUSTOM,
401 			      opal_reachable_netlink_rt_raw_parse_cb, &arg);
402     if (err != 0) {
403 	opal_output(0, "Failed to setup callback function, error %s\n",
404 		    NL_GETERROR(err));
405 	err = EINVAL;
406 	goto out;
407     }
408 
409     /* receive results */
410     NL_RECVMSGS(unlsk->nlh, arg, EHOSTUNREACH, err, out);
411 
412     /* check whether a route was found */
413     if (arg.found) {
414         *has_gateway = arg.has_gateway;
415 	err = 0;
416     } else {
417         *has_gateway = 0;
418 	err = EHOSTUNREACH;
419     }
420 
421  out:
422     opal_reachable_netlink_sk_free(unlsk);
423     return err;
424 }
425 #endif /* #if OPAL_ENABLE_IPV6 */
426