1 /*
2  * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved.
3  *
4  * LICENSE_BEGIN
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
28  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
29  * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
32  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
33  * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
35  * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36  * POSSIBILITY OF SUCH DAMAGE.
37  *
38  * LICENSE_END
39  *
40  *
41  */
42 #include <errno.h>
43 #include <arpa/inet.h>
44 #include <time.h>
45 #include <unistd.h>
46 
47 #include "libnl_utils.h"
48 #include "usnic_user_utils.h"
49 
/*
 * Debug helper: dump a netlink message to stderr with nl_msg_dump(),
 * but only when the compile-time log level (USNIC_LOG_LVL) is at least
 * USNIC_LOG_LVL_ERR.  Otherwise the macro expands to nothing, so every
 * call site must remain a valid statement when the call disappears.
 */
#if USNIC_LOG_LVL >= USNIC_LOG_LVL_ERR
#define usnic_nlmsg_dump(msg) nl_msg_dump(msg, stderr)
#else
#define usnic_nlmsg_dump(msg)
#endif
55 
56 /*
57  * Querying the routing tables via netlink is expensive, especially
58  * when many processes are doing so at the same time on a single
59  * server (e.g., in an MPI job).  As such, we cache netlink responses
60  * to alleviate pressure on the netlink kernel interface.
61  */
struct usd_nl_cache_entry {
	/* Value of time(NULL) when the entry was saved; compared against
	 * usd_nl_cache_timeout to decide when the entry is stale. */
	time_t timestamp;

	/* Lookup key: source address, destination address and interface
	 * index.  All three must match for a cache hit. */
	uint32_t src_ipaddr_be;
	uint32_t dest_ipaddr_be;
	int ifindex;
	/* Cached result: the next-hop address, and whether a route was
	 * found at all (0 means the lookup ended in EHOSTUNREACH). */
	uint32_t nh_addr;
	int reachable;

	/* For now, this cache is a simple linked list.  Eventually,
	 * this cache should be a better data structure, such as a
	 * hash table. */
	struct usd_nl_cache_entry *prev;
	struct usd_nl_cache_entry *next;
};
77 
/* Semi-arbitrarily set cache TTL to 2 minutes.  The value is added to
 * an entry's time(NULL) timestamp, so it is expressed in the same unit
 * as time_t. */
static time_t usd_nl_cache_timeout = 120;

/* Head of the doubly linked list of cached lookups; NULL means the
 * cache is empty.  New entries are pushed at the head. */
static struct usd_nl_cache_entry *cache = NULL;
82 
83 
/*
 * Attribute policy table passed to nlmsg_parse() when decoding route
 * replies.  It is indexed by RTA_* attribute type; attribute types that
 * are not listed here keep a zero-initialized policy entry.
 */
static struct nla_policy route_policy[RTA_MAX+1] = {
	[RTA_IIF]	= { .type = NLA_STRING,
			    .maxlen = IFNAMSIZ, },
	[RTA_OIF]	= { .type = NLA_U32 },
	[RTA_PRIORITY]	= { .type = NLA_U32 },
	[RTA_FLOW]	= { .type = NLA_U32 },
	[RTA_MP_ALGO]	= { .type = NLA_U32 },
	[RTA_CACHEINFO]	= { .minlen = sizeof(struct rta_cacheinfo) },
	[RTA_METRICS]	= { .type = NLA_NESTED },
	[RTA_MULTIPATH]	= { .type = NLA_NESTED },
};
95 
usnic_is_nlreply_expected(struct usnic_nl_sk * unlsk,struct nlmsghdr * nlm_hdr)96 static int usnic_is_nlreply_expected(struct usnic_nl_sk *unlsk,
97 					struct nlmsghdr *nlm_hdr)
98 {
99 	if (nlm_hdr->nlmsg_pid != nl_socket_get_local_port(unlsk->nlh)
100 		|| nlm_hdr->nlmsg_seq != unlsk->seq) {
101 		usnic_err("Not an expected reply msg pid: %u local pid: %u msg seq: %u expected seq: %u\n",
102 				nlm_hdr->nlmsg_pid,
103 				nl_socket_get_local_port(unlsk->nlh),
104 				nlm_hdr->nlmsg_seq, unlsk->seq);
105 		return 0;
106 	}
107 
108 	return 1;
109 }
110 
/*
 * Detect and report a netlink error reply.
 *
 * Returns 0 when the message is not of type NLMSG_ERROR.  Returns 1
 * when it is; in that case the error is logged, and arg->retry is set
 * to 1 if the error code is -ECONNREFUSED so the caller knows the
 * request should be attempted again.  A message too short to hold a
 * struct nlmsgerr is reported as truncated and still counts as an
 * error (return value 1).
 */
static int usnic_is_nlreply_err(struct nlmsghdr *nlm_hdr,
                                struct usnic_rt_cb_arg *arg)
{
	struct nlmsgerr *nlerr;

	if (nlm_hdr->nlmsg_type != NLMSG_ERROR) {
		return 0;
	}

	nlerr = (struct nlmsgerr *)nlmsg_data(nlm_hdr);
	if (nlm_hdr->nlmsg_len < (__u32)NLMSG_SIZE(sizeof(*nlerr))) {
		/* Payload too short to contain the error code; do not
		 * read it. */
		usnic_err(
			"Received a truncated netlink error message\n");
		return 1;
	}

	usnic_strerror(nlerr->error,
			"Received a netlink error message");

	/* Sometimes nl_send() succeeds, but the request fails because
	 * the kernel is temporarily out of resources.  In these cases,
	 * we should tell the caller that they should try again. */
	if (nlerr->error == -ECONNREFUSED) {
		arg->retry = 1;
	}

	return 1;
}
135 
usnic_nl_send_query(struct usnic_nl_sk * unlsk,struct nl_msg * msg,int protocol,int flag)136 static int usnic_nl_send_query(struct usnic_nl_sk *unlsk, struct nl_msg *msg,
137 				int protocol, int flag)
138 {
139 	int ret, retry;
140 	struct nlmsghdr *nlhdr;
141 
142 	nlhdr = nlmsg_hdr(msg);
143 	while (1) {
144 		nlhdr->nlmsg_pid = nl_socket_get_local_port(unlsk->nlh);
145 		nlhdr->nlmsg_seq = ++unlsk->seq;
146 		nlmsg_set_proto(msg, protocol);
147 		nlhdr->nlmsg_flags = flag;
148 
149 		/* Sometimes nl_send() can fail simply because the
150 		 * kernel is temporarily out of resources, and we
151 		 * should just try again.  libnl1 and libnl3 handle
152 		 * this case a little differently, so use the
153 		 * USD_NL_SEND() macro to hide the differences.  If
154 		 * retry comes back as true, then sleep a little and
155 		 * try again. */
156 		USD_NL_SEND(unlsk->nlh, msg, ret, retry);
157 		if (retry) {
158 			usleep(5);
159 			continue;
160 		}
161 		break;
162 	}
163 
164 	return ret;
165 }
166 
/*
 * Put a one-second receive timeout (SO_RCVTIMEO) on the file
 * descriptor behind the given netlink handle.
 *
 * Returns the setsockopt() result: 0 on success, a negative value on
 * failure (after logging via usnic_perr()).
 */
static int usnic_nl_set_rcvsk_timer(NL_HANDLE *nlh)
{
	struct timeval rcv_timeout = { .tv_sec = 1, .tv_usec = 0 };
	int rc;

	rc = setsockopt(nl_socket_get_fd(nlh), SOL_SOCKET, SO_RCVTIMEO,
			&rcv_timeout, sizeof(rcv_timeout));
	if (rc < 0) {
		usnic_perr("Failed to set SO_RCVTIMEO for nl socket");
	}

	return rc;
}
182 
usnic_nl_sk_alloc(struct usnic_nl_sk ** p_sk,int protocol)183 static int usnic_nl_sk_alloc(struct usnic_nl_sk **p_sk, int protocol)
184 {
185 	struct usnic_nl_sk *unlsk;
186 	NL_HANDLE *nlh;
187 	int err;
188 
189 	unlsk = calloc(1, sizeof(*unlsk));
190 	if (!unlsk) {
191 		usnic_err("Failed to allocate usnic_nl_sk struct\n");
192 		return ENOMEM;
193 	}
194 
195 	nlh = NL_HANDLE_ALLOC();
196 	if (!nlh) {
197 		usnic_err("Failed to allocate nl handle\n");
198 		err = ENOMEM;
199 		goto err_free_unlsk;
200 	}
201 
202 	err = nl_connect(nlh, protocol);
203 	if (err < 0) {
204 		usnic_err("Failed to connnect netlink route socket error: %s\n",
205 				NL_GETERROR(err));
206                 err = EINVAL;
207 		goto err_free_nlh;
208 	}
209 
210 	NL_DISABLE_SEQ_CHECK(nlh);
211 	err = usnic_nl_set_rcvsk_timer(nlh);
212 	if (err < 0)
213 		goto err_close_nlh;
214 
215 	unlsk->nlh = nlh;
216 	unlsk->seq = time(NULL);
217 	*p_sk = unlsk;
218 	return 0;
219 
220 err_close_nlh:
221 	nl_close(nlh);
222 err_free_nlh:
223 	NL_HANDLE_FREE(nlh);
224 err_free_unlsk:
225 	free(unlsk);
226 	return err;
227 }
228 
usnic_nl_sk_free(struct usnic_nl_sk * unlsk)229 static void usnic_nl_sk_free(struct usnic_nl_sk *unlsk)
230 {
231 	nl_close(unlsk->nlh);
232 	NL_HANDLE_FREE(unlsk->nlh);
233 	free(unlsk);
234 }
235 
usnic_rt_raw_parse_cb(struct nl_msg * msg,void * arg)236 static int usnic_rt_raw_parse_cb(struct nl_msg *msg, void *arg)
237 {
238 	struct usnic_rt_cb_arg *lookup_arg = (struct usnic_rt_cb_arg *)arg;
239 	struct usnic_nl_sk *unlsk = lookup_arg->unlsk;
240 	struct nlmsghdr *nlm_hdr = nlmsg_hdr(msg);
241 	struct rtmsg *rtm;
242 	struct nlattr *tb[RTA_MAX + 1];
243 	int found = 0;
244 	int err;
245 
246 	INC_CB_MSGCNT(lookup_arg);
247 
248 	if (!usnic_is_nlreply_expected(unlsk, nlm_hdr)) {
249 		usnic_nlmsg_dump(msg);
250 		return NL_SKIP;
251 	}
252 
253 	if (usnic_is_nlreply_err(nlm_hdr, lookup_arg)) {
254 		usnic_nlmsg_dump(msg);
255 		return NL_SKIP;
256 	}
257 
258 	if (nlm_hdr->nlmsg_type != RTM_NEWROUTE) {
259 		char buf[128];
260 		nl_nlmsgtype2str(nlm_hdr->nlmsg_type, buf, sizeof(buf));
261 		usnic_err("Received an invalid route request reply message type: %s\n",
262 				buf);
263 		usnic_nlmsg_dump(msg);
264 		return NL_SKIP;
265 	}
266 
267 	rtm = nlmsg_data(nlm_hdr);
268 	if (rtm->rtm_family != AF_INET) {
269 		usnic_err("RTM message contains invalid AF family: %u\n",
270 				rtm->rtm_family);
271 		usnic_nlmsg_dump(msg);
272 		return NL_SKIP;
273 	}
274 
275 	err = nlmsg_parse(nlm_hdr, sizeof(struct rtmsg), tb, RTA_MAX,
276 			  route_policy);
277 	if (err < 0) {
278 		usnic_err("nlmsg parse error %s\n", NL_GETERROR(err));
279 		usnic_nlmsg_dump(msg);
280 		return NL_SKIP;
281 	}
282 
283 	if (tb[RTA_OIF]) {
284 		if (nla_get_u32(tb[RTA_OIF]) == (uint32_t)lookup_arg->oif)
285 			found = 1;
286 		else
287 			usnic_err("Retrieved route has a different outgoing interface %d (expected %d)\n",
288 					nla_get_u32(tb[RTA_OIF]),
289 					lookup_arg->oif);
290 	}
291 
292 	if (found && tb[RTA_GATEWAY])
293 		lookup_arg->nh_addr = nla_get_u32(tb[RTA_GATEWAY]);
294 
295 	lookup_arg->found = found;
296 	return NL_STOP;
297 }
298 
299 
300 static struct usd_nl_cache_entry *
usd_nl_cache_lookup(uint32_t src_ipaddr_be,uint32_t dest_ipaddr_be,int ifindex)301 usd_nl_cache_lookup(uint32_t src_ipaddr_be, uint32_t dest_ipaddr_be, int ifindex)
302 {
303 	time_t now;
304 	struct usd_nl_cache_entry *nlce;
305 	struct usd_nl_cache_entry *stale;
306 
307 	now = time(NULL);
308 	for (nlce = cache; NULL != nlce; ) {
309 		/* While we're traversing the cache, we might as well
310 		 * remove stale entries */
311 		if (now > nlce->timestamp + usd_nl_cache_timeout) {
312 			stale = nlce;
313 			nlce = nlce->next;
314 
315 			if (stale->prev) {
316 				stale->prev->next = stale->next;
317 			}
318 			if (stale->next) {
319 				stale->next->prev = stale->prev;
320 			}
321 			if (cache == stale) {
322 				cache = nlce;
323 			}
324 			free(stale);
325 
326 			continue;
327 		}
328 
329 		if (nlce->src_ipaddr_be == src_ipaddr_be &&
330 			nlce->dest_ipaddr_be == dest_ipaddr_be &&
331 			nlce->ifindex == ifindex) {
332 			return nlce;
333 		}
334 
335 		nlce = nlce->next;
336 	}
337 
338 	return NULL;
339 }
340 
341 static void
usd_nl_cache_save(int32_t src_ipaddr_be,uint32_t dest_ipaddr_be,int ifindex,uint32_t nh_addr,int reachable)342 usd_nl_cache_save(int32_t src_ipaddr_be, uint32_t dest_ipaddr_be, int ifindex,
343 		uint32_t nh_addr, int reachable)
344 {
345 	struct usd_nl_cache_entry *nlce;
346 
347 	nlce = calloc(1, sizeof(*nlce));
348 	if (NULL == nlce) {
349 		return;
350 	}
351 
352 	nlce->timestamp = time(NULL);
353 	nlce->src_ipaddr_be = src_ipaddr_be;
354 	nlce->dest_ipaddr_be = dest_ipaddr_be;
355 	nlce->ifindex = ifindex;
356 	nlce->nh_addr = nh_addr;
357 	nlce->reachable = reachable;
358 
359 	nlce->next = cache;
360 	if (cache) {
361 		cache->prev = nlce;
362 	}
363 	cache = nlce;
364 }
365 
366 
usnic_nl_rt_lookup(uint32_t src_addr,uint32_t dst_addr,int oif,uint32_t * nh_addr)367 int usnic_nl_rt_lookup(uint32_t src_addr, uint32_t dst_addr, int oif,
368 			uint32_t *nh_addr)
369 {
370 	struct usnic_nl_sk	*unlsk;
371 	struct nl_msg		*nlm;
372 	struct rtmsg		rmsg;
373 	struct usnic_rt_cb_arg	arg;
374 	int			err;
375 
376 	/* See if we have this NL result cached */
377 	struct usd_nl_cache_entry *nlce;
378 	nlce = usd_nl_cache_lookup(src_addr, dst_addr, oif);
379 	if (nlce) {
380 		if (nlce->reachable) {
381 			*nh_addr = nlce->nh_addr;
382 			return 0;
383 		} else {
384 			return EHOSTUNREACH;
385 		}
386 	}
387 
388 retry:
389 	unlsk = NULL;
390 	err = usnic_nl_sk_alloc(&unlsk, NETLINK_ROUTE);
391 	if (err)
392 		return err;
393 
394 	memset(&rmsg, 0, sizeof(rmsg));
395 	rmsg.rtm_family = AF_INET;
396 	rmsg.rtm_dst_len = sizeof(dst_addr) * CHAR_BIT;
397 	rmsg.rtm_src_len = sizeof(src_addr) * CHAR_BIT;
398 
399 	nlm = nlmsg_alloc_simple(RTM_GETROUTE, 0);
400 	if (!nlm) {
401 		usnic_err("Failed to alloc nl message, %s\n",
402 				NL_GETERROR(err));
403 		err = ENOMEM;
404 		goto out;
405 	}
406 	nlmsg_append(nlm, &rmsg, sizeof(rmsg), NLMSG_ALIGNTO);
407 	nla_put_u32(nlm, RTA_DST, dst_addr);
408 	nla_put_u32(nlm, RTA_SRC, src_addr);
409 
410 	err = usnic_nl_send_query(unlsk, nlm, NETLINK_ROUTE, NLM_F_REQUEST);
411 	nlmsg_free(nlm);
412 	if (err < 0) {
413 		usnic_err("Failed to send RTM_GETROUTE query message, error %s\n",
414 				NL_GETERROR(err));
415                 err = EINVAL;
416 		goto out;
417 	}
418 
419 	memset(&arg, 0, sizeof(arg));
420 	arg.oif		= oif;
421 	arg.unlsk	= unlsk;
422 	err = nl_socket_modify_cb(unlsk->nlh, NL_CB_MSG_IN, NL_CB_CUSTOM,
423 					usnic_rt_raw_parse_cb, &arg);
424 	if (err != 0) {
425 		usnic_err("Failed to setup callback function, error %s\n",
426 				NL_GETERROR(err));
427                 err = EINVAL;
428 		goto out;
429 	}
430 
431 	/* Sometimes the recvmsg can fail because something is
432 	 * temporarily out of resources.  In this case, delay a little
433 	 * and try again. */
434 	do {
435 		err = 0;
436 		NL_RECVMSGS(unlsk->nlh, arg, EAGAIN, err, out);
437 		if (err == EAGAIN) {
438 			usleep(5);
439 		}
440 	} while (err == EAGAIN);
441 
442 	/* If we got a reply back that indicated that the kernel was
443 	 * too busy to handle this request, delay a little and try
444 	 * again. */
445         if (arg.retry) {
446             usleep(5);
447             goto retry;
448         }
449 
450 	if (arg.found) {
451 		*nh_addr = arg.nh_addr;
452 		err = 0;
453 	} else {
454 		err = EHOSTUNREACH;
455 	}
456 
457 	/* Save this result in the cache */
458 	usd_nl_cache_save(src_addr, dst_addr, oif,
459 			arg.nh_addr, arg.found);
460 
461 out:
462 	usnic_nl_sk_free(unlsk);
463 	return err;
464 }
465 
466