1 /*
2 * Copyright (c) 2014-2016, Cisco Systems, Inc. All rights reserved.
3 *
4 * LICENSE_BEGIN
5 *
6 * This software is available to you under a choice of one of two
7 * licenses. You may choose to be licensed under the terms of the GNU
8 * General Public License (GPL) Version 2, available from the file
9 * COPYING in the main directory of this source tree, or the
10 * BSD license below:
11 *
12 * Redistribution and use in source and binary forms, with or
13 * without modification, are permitted provided that the following
14 * conditions are met:
15 *
16 * - Redistributions of source code must retain the above
17 * copyright notice, this list of conditions and the following
18 * disclaimer.
19 *
20 * - Redistributions in binary form must reproduce the above
21 * copyright notice, this list of conditions and the following
22 * disclaimer in the documentation and/or other materials
23 * provided with the distribution.
24 *
25 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
26 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
27 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
28 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
29 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
30 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
31 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
32 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
33 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
35 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 * POSSIBILITY OF SUCH DAMAGE.
37 *
38 * LICENSE_END
39 *
40 *
41 */
42 #include <errno.h>
43 #include <arpa/inet.h>
44 #include <time.h>
45 #include <unistd.h>
46
47 #include "libnl_utils.h"
48 #include "usnic_user_utils.h"
49
/*
 * usnic_nlmsg_dump(msg): debugging aid used when a netlink reply is
 * rejected.  When the compile-time log level is at least
 * USNIC_LOG_LVL_ERR the message is dumped to stderr via nl_msg_dump();
 * otherwise the macro expands to nothing.
 */
#if USNIC_LOG_LVL >= USNIC_LOG_LVL_ERR
#define usnic_nlmsg_dump(msg) nl_msg_dump(msg, stderr)
#else
#define usnic_nlmsg_dump(msg)
#endif
55
56 /*
57 * Querying the routing tables via netlink is expensive, especially
58 * when many processes are doing so at the same time on a single
59 * server (e.g., in an MPI job). As such, we cache netlink responses
60 * to alleviate pressure on the netlink kernel interface.
61 */
struct usd_nl_cache_entry {
    /* time() value recorded when the entry was stored; compared
     * against the cache TTL to detect stale entries */
    time_t timestamp;

    /* Lookup key: source address, destination address and interface
     * index (the _be suffix presumably means network byte order --
     * confirm against callers) */
    uint32_t src_ipaddr_be;
    uint32_t dest_ipaddr_be;
    int ifindex;
    /* Cached result: next-hop address, and whether a matching route
     * was found (non-zero) or not (zero) */
    uint32_t nh_addr;
    int reachable;

    /* For now, this cache is a simple linked list.  Eventually,
     * this cache should be a better data structure, such as a
     * hash table. */
    struct usd_nl_cache_entry *prev;
    struct usd_nl_cache_entry *next;
};
77
/* Semi-arbitrarily set cache TTL to 2 minutes.  The value is in
 * seconds because it is added to time() timestamps. */
static time_t usd_nl_cache_timeout = 120;

/* Head of the doubly-linked list of cached route lookups; NULL when
 * the cache is empty.  New entries are inserted at the head. */
static struct usd_nl_cache_entry *cache = NULL;
82
83
/*
 * Attribute validation policy passed to nlmsg_parse() when decoding
 * route replies.  Only the attributes listed here carry an explicit
 * type/length constraint; all other slots are zero-initialized.
 */
static struct nla_policy route_policy[RTA_MAX+1] = {
    [RTA_IIF] = { .type = NLA_STRING,
                  .maxlen = IFNAMSIZ, },
    [RTA_OIF] = { .type = NLA_U32 },
    [RTA_PRIORITY] = { .type = NLA_U32 },
    [RTA_FLOW] = { .type = NLA_U32 },
    [RTA_MP_ALGO] = { .type = NLA_U32 },
    [RTA_CACHEINFO] = { .minlen = sizeof(struct rta_cacheinfo) },
    [RTA_METRICS] = { .type = NLA_NESTED },
    [RTA_MULTIPATH] = { .type = NLA_NESTED },
};
95
usnic_is_nlreply_expected(struct usnic_nl_sk * unlsk,struct nlmsghdr * nlm_hdr)96 static int usnic_is_nlreply_expected(struct usnic_nl_sk *unlsk,
97 struct nlmsghdr *nlm_hdr)
98 {
99 if (nlm_hdr->nlmsg_pid != nl_socket_get_local_port(unlsk->nlh)
100 || nlm_hdr->nlmsg_seq != unlsk->seq) {
101 usnic_err("Not an expected reply msg pid: %u local pid: %u msg seq: %u expected seq: %u\n",
102 nlm_hdr->nlmsg_pid,
103 nl_socket_get_local_port(unlsk->nlh),
104 nlm_hdr->nlmsg_seq, unlsk->seq);
105 return 0;
106 }
107
108 return 1;
109 }
110
/*
 * Detect and report a netlink error reply.
 *
 * Returns 0 when the message is not of type NLMSG_ERROR.  Otherwise
 * returns 1 after logging the error; a message too short to hold a
 * struct nlmsgerr is logged as truncated.  When the kernel reported
 * -ECONNREFUSED, arg->retry is set so the caller knows to try again.
 */
static int usnic_is_nlreply_err(struct nlmsghdr *nlm_hdr,
                                struct usnic_rt_cb_arg *arg)
{
    struct nlmsgerr *nl_err;

    if (nlm_hdr->nlmsg_type != NLMSG_ERROR)
        return 0;

    if (nlm_hdr->nlmsg_len < (__u32)NLMSG_SIZE(sizeof(*nl_err))) {
        usnic_err(
            "Received a truncated netlink error message\n");
        return 1;
    }

    nl_err = (struct nlmsgerr *)nlmsg_data(nlm_hdr);
    usnic_strerror(nl_err->error,
                   "Received a netlink error message");

    /* Sometimes nl_send() succeeds, but the request fails because
     * the kernel is temporarily out of resources.  In these cases,
     * tell the caller that it should try again. */
    if (nl_err->error == -ECONNREFUSED)
        arg->retry = 1;

    return 1;
}
135
usnic_nl_send_query(struct usnic_nl_sk * unlsk,struct nl_msg * msg,int protocol,int flag)136 static int usnic_nl_send_query(struct usnic_nl_sk *unlsk, struct nl_msg *msg,
137 int protocol, int flag)
138 {
139 int ret, retry;
140 struct nlmsghdr *nlhdr;
141
142 nlhdr = nlmsg_hdr(msg);
143 while (1) {
144 nlhdr->nlmsg_pid = nl_socket_get_local_port(unlsk->nlh);
145 nlhdr->nlmsg_seq = ++unlsk->seq;
146 nlmsg_set_proto(msg, protocol);
147 nlhdr->nlmsg_flags = flag;
148
149 /* Sometimes nl_send() can fail simply because the
150 * kernel is temporarily out of resources, and we
151 * should just try again. libnl1 and libnl3 handle
152 * this case a little differently, so use the
153 * USD_NL_SEND() macro to hide the differences. If
154 * retry comes back as true, then sleep a little and
155 * try again. */
156 USD_NL_SEND(unlsk->nlh, msg, ret, retry);
157 if (retry) {
158 usleep(5);
159 continue;
160 }
161 break;
162 }
163
164 return ret;
165 }
166
/*
 * Put a one-second receive timeout (SO_RCVTIMEO) on the netlink
 * socket's file descriptor.
 *
 * Returns the setsockopt() result: 0 on success, negative on failure
 * (in which case the failure is also logged).
 */
static int usnic_nl_set_rcvsk_timer(NL_HANDLE *nlh)
{
    struct timeval rcv_timeout = { .tv_sec = 1, .tv_usec = 0 };
    int rc;

    rc = setsockopt(nl_socket_get_fd(nlh), SOL_SOCKET, SO_RCVTIMEO,
                    &rcv_timeout, sizeof(rcv_timeout));
    if (rc < 0)
        usnic_perr("Failed to set SO_RCVTIMEO for nl socket");

    return rc;
}
182
usnic_nl_sk_alloc(struct usnic_nl_sk ** p_sk,int protocol)183 static int usnic_nl_sk_alloc(struct usnic_nl_sk **p_sk, int protocol)
184 {
185 struct usnic_nl_sk *unlsk;
186 NL_HANDLE *nlh;
187 int err;
188
189 unlsk = calloc(1, sizeof(*unlsk));
190 if (!unlsk) {
191 usnic_err("Failed to allocate usnic_nl_sk struct\n");
192 return ENOMEM;
193 }
194
195 nlh = NL_HANDLE_ALLOC();
196 if (!nlh) {
197 usnic_err("Failed to allocate nl handle\n");
198 err = ENOMEM;
199 goto err_free_unlsk;
200 }
201
202 err = nl_connect(nlh, protocol);
203 if (err < 0) {
204 usnic_err("Failed to connnect netlink route socket error: %s\n",
205 NL_GETERROR(err));
206 err = EINVAL;
207 goto err_free_nlh;
208 }
209
210 NL_DISABLE_SEQ_CHECK(nlh);
211 err = usnic_nl_set_rcvsk_timer(nlh);
212 if (err < 0)
213 goto err_close_nlh;
214
215 unlsk->nlh = nlh;
216 unlsk->seq = time(NULL);
217 *p_sk = unlsk;
218 return 0;
219
220 err_close_nlh:
221 nl_close(nlh);
222 err_free_nlh:
223 NL_HANDLE_FREE(nlh);
224 err_free_unlsk:
225 free(unlsk);
226 return err;
227 }
228
usnic_nl_sk_free(struct usnic_nl_sk * unlsk)229 static void usnic_nl_sk_free(struct usnic_nl_sk *unlsk)
230 {
231 nl_close(unlsk->nlh);
232 NL_HANDLE_FREE(unlsk->nlh);
233 free(unlsk);
234 }
235
usnic_rt_raw_parse_cb(struct nl_msg * msg,void * arg)236 static int usnic_rt_raw_parse_cb(struct nl_msg *msg, void *arg)
237 {
238 struct usnic_rt_cb_arg *lookup_arg = (struct usnic_rt_cb_arg *)arg;
239 struct usnic_nl_sk *unlsk = lookup_arg->unlsk;
240 struct nlmsghdr *nlm_hdr = nlmsg_hdr(msg);
241 struct rtmsg *rtm;
242 struct nlattr *tb[RTA_MAX + 1];
243 int found = 0;
244 int err;
245
246 INC_CB_MSGCNT(lookup_arg);
247
248 if (!usnic_is_nlreply_expected(unlsk, nlm_hdr)) {
249 usnic_nlmsg_dump(msg);
250 return NL_SKIP;
251 }
252
253 if (usnic_is_nlreply_err(nlm_hdr, lookup_arg)) {
254 usnic_nlmsg_dump(msg);
255 return NL_SKIP;
256 }
257
258 if (nlm_hdr->nlmsg_type != RTM_NEWROUTE) {
259 char buf[128];
260 nl_nlmsgtype2str(nlm_hdr->nlmsg_type, buf, sizeof(buf));
261 usnic_err("Received an invalid route request reply message type: %s\n",
262 buf);
263 usnic_nlmsg_dump(msg);
264 return NL_SKIP;
265 }
266
267 rtm = nlmsg_data(nlm_hdr);
268 if (rtm->rtm_family != AF_INET) {
269 usnic_err("RTM message contains invalid AF family: %u\n",
270 rtm->rtm_family);
271 usnic_nlmsg_dump(msg);
272 return NL_SKIP;
273 }
274
275 err = nlmsg_parse(nlm_hdr, sizeof(struct rtmsg), tb, RTA_MAX,
276 route_policy);
277 if (err < 0) {
278 usnic_err("nlmsg parse error %s\n", NL_GETERROR(err));
279 usnic_nlmsg_dump(msg);
280 return NL_SKIP;
281 }
282
283 if (tb[RTA_OIF]) {
284 if (nla_get_u32(tb[RTA_OIF]) == (uint32_t)lookup_arg->oif)
285 found = 1;
286 else
287 usnic_err("Retrieved route has a different outgoing interface %d (expected %d)\n",
288 nla_get_u32(tb[RTA_OIF]),
289 lookup_arg->oif);
290 }
291
292 if (found && tb[RTA_GATEWAY])
293 lookup_arg->nh_addr = nla_get_u32(tb[RTA_GATEWAY]);
294
295 lookup_arg->found = found;
296 return NL_STOP;
297 }
298
299
300 static struct usd_nl_cache_entry *
usd_nl_cache_lookup(uint32_t src_ipaddr_be,uint32_t dest_ipaddr_be,int ifindex)301 usd_nl_cache_lookup(uint32_t src_ipaddr_be, uint32_t dest_ipaddr_be, int ifindex)
302 {
303 time_t now;
304 struct usd_nl_cache_entry *nlce;
305 struct usd_nl_cache_entry *stale;
306
307 now = time(NULL);
308 for (nlce = cache; NULL != nlce; ) {
309 /* While we're traversing the cache, we might as well
310 * remove stale entries */
311 if (now > nlce->timestamp + usd_nl_cache_timeout) {
312 stale = nlce;
313 nlce = nlce->next;
314
315 if (stale->prev) {
316 stale->prev->next = stale->next;
317 }
318 if (stale->next) {
319 stale->next->prev = stale->prev;
320 }
321 if (cache == stale) {
322 cache = nlce;
323 }
324 free(stale);
325
326 continue;
327 }
328
329 if (nlce->src_ipaddr_be == src_ipaddr_be &&
330 nlce->dest_ipaddr_be == dest_ipaddr_be &&
331 nlce->ifindex == ifindex) {
332 return nlce;
333 }
334
335 nlce = nlce->next;
336 }
337
338 return NULL;
339 }
340
341 static void
usd_nl_cache_save(int32_t src_ipaddr_be,uint32_t dest_ipaddr_be,int ifindex,uint32_t nh_addr,int reachable)342 usd_nl_cache_save(int32_t src_ipaddr_be, uint32_t dest_ipaddr_be, int ifindex,
343 uint32_t nh_addr, int reachable)
344 {
345 struct usd_nl_cache_entry *nlce;
346
347 nlce = calloc(1, sizeof(*nlce));
348 if (NULL == nlce) {
349 return;
350 }
351
352 nlce->timestamp = time(NULL);
353 nlce->src_ipaddr_be = src_ipaddr_be;
354 nlce->dest_ipaddr_be = dest_ipaddr_be;
355 nlce->ifindex = ifindex;
356 nlce->nh_addr = nh_addr;
357 nlce->reachable = reachable;
358
359 nlce->next = cache;
360 if (cache) {
361 cache->prev = nlce;
362 }
363 cache = nlce;
364 }
365
366
usnic_nl_rt_lookup(uint32_t src_addr,uint32_t dst_addr,int oif,uint32_t * nh_addr)367 int usnic_nl_rt_lookup(uint32_t src_addr, uint32_t dst_addr, int oif,
368 uint32_t *nh_addr)
369 {
370 struct usnic_nl_sk *unlsk;
371 struct nl_msg *nlm;
372 struct rtmsg rmsg;
373 struct usnic_rt_cb_arg arg;
374 int err;
375
376 /* See if we have this NL result cached */
377 struct usd_nl_cache_entry *nlce;
378 nlce = usd_nl_cache_lookup(src_addr, dst_addr, oif);
379 if (nlce) {
380 if (nlce->reachable) {
381 *nh_addr = nlce->nh_addr;
382 return 0;
383 } else {
384 return EHOSTUNREACH;
385 }
386 }
387
388 retry:
389 unlsk = NULL;
390 err = usnic_nl_sk_alloc(&unlsk, NETLINK_ROUTE);
391 if (err)
392 return err;
393
394 memset(&rmsg, 0, sizeof(rmsg));
395 rmsg.rtm_family = AF_INET;
396 rmsg.rtm_dst_len = sizeof(dst_addr) * CHAR_BIT;
397 rmsg.rtm_src_len = sizeof(src_addr) * CHAR_BIT;
398
399 nlm = nlmsg_alloc_simple(RTM_GETROUTE, 0);
400 if (!nlm) {
401 usnic_err("Failed to alloc nl message, %s\n",
402 NL_GETERROR(err));
403 err = ENOMEM;
404 goto out;
405 }
406 nlmsg_append(nlm, &rmsg, sizeof(rmsg), NLMSG_ALIGNTO);
407 nla_put_u32(nlm, RTA_DST, dst_addr);
408 nla_put_u32(nlm, RTA_SRC, src_addr);
409
410 err = usnic_nl_send_query(unlsk, nlm, NETLINK_ROUTE, NLM_F_REQUEST);
411 nlmsg_free(nlm);
412 if (err < 0) {
413 usnic_err("Failed to send RTM_GETROUTE query message, error %s\n",
414 NL_GETERROR(err));
415 err = EINVAL;
416 goto out;
417 }
418
419 memset(&arg, 0, sizeof(arg));
420 arg.oif = oif;
421 arg.unlsk = unlsk;
422 err = nl_socket_modify_cb(unlsk->nlh, NL_CB_MSG_IN, NL_CB_CUSTOM,
423 usnic_rt_raw_parse_cb, &arg);
424 if (err != 0) {
425 usnic_err("Failed to setup callback function, error %s\n",
426 NL_GETERROR(err));
427 err = EINVAL;
428 goto out;
429 }
430
431 /* Sometimes the recvmsg can fail because something is
432 * temporarily out of resources. In this case, delay a little
433 * and try again. */
434 do {
435 err = 0;
436 NL_RECVMSGS(unlsk->nlh, arg, EAGAIN, err, out);
437 if (err == EAGAIN) {
438 usleep(5);
439 }
440 } while (err == EAGAIN);
441
442 /* If we got a reply back that indicated that the kernel was
443 * too busy to handle this request, delay a little and try
444 * again. */
445 if (arg.retry) {
446 usleep(5);
447 goto retry;
448 }
449
450 if (arg.found) {
451 *nh_addr = arg.nh_addr;
452 err = 0;
453 } else {
454 err = EHOSTUNREACH;
455 }
456
457 /* Save this result in the cache */
458 usd_nl_cache_save(src_addr, dst_addr, oif,
459 arg.nh_addr, arg.found);
460
461 out:
462 usnic_nl_sk_free(unlsk);
463 return err;
464 }
465
466