1 /*
2    ctdb system specific code to manage raw sockets on linux
3 
4    Copyright (C) Ronnie Sahlberg  2007
5    Copyright (C) Andrew Tridgell  2007
6    Copyright (C) Marc Dequènes (Duck) 2009
7    Copyright (C) Volker Lendecke 2012
8 
9    This program is free software; you can redistribute it and/or modify
10    it under the terms of the GNU General Public License as published by
11    the Free Software Foundation; either version 3 of the License, or
12    (at your option) any later version.
13 
14    This program is distributed in the hope that it will be useful,
15    but WITHOUT ANY WARRANTY; without even the implied warranty of
16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17    GNU General Public License for more details.
18 
19    You should have received a copy of the GNU General Public License
20    along with this program; if not, see <http://www.gnu.org/licenses/>.
21 */
22 
23 #include "replace.h"
24 
25 /*
26  * Use BSD struct tcphdr field names for portability.  Modern glibc
27  * makes them available by default via <netinet/tcp.h> but older glibc
28  * requires __FAVOR_BSD to be defined.
29  *
30  * __FAVOR_BSD is normally defined in <features.h> if _DEFAULT_SOURCE
31  * (new) or _BSD_SOURCE (now deprecated) is set and _GNU_SOURCE is not
32  * set.  Including "replace.h" above causes <features.h> to be
33  * indirectly included and this will not set __FAVOR_BSD because
34  * _GNU_SOURCE is set in Samba's "config.h" (which is included by
35  * "replace.h").
36  *
37  * Therefore, set __FAVOR_BSD by hand below.
38  */
39 #define __FAVOR_BSD 1
40 #include "system/network.h"
41 
42 #ifdef HAVE_NETINET_IF_ETHER_H
43 #include <netinet/if_ether.h>
44 #endif
45 #ifdef HAVE_NETINET_IP6_H
46 #include <netinet/ip6.h>
47 #endif
48 #ifdef HAVE_NETINET_ICMP6_H
49 #include <netinet/icmp6.h>
50 #endif
51 #ifdef HAVE_LINUX_IF_PACKET_H
52 #include <linux/if_packet.h>
53 #endif
54 
55 #ifndef ETHERTYPE_IP6
56 #define ETHERTYPE_IP6 0x86dd
57 #endif
58 
59 #include "lib/util/debug.h"
60 #include "lib/util/blocking.h"
61 
62 #include "protocol/protocol.h"
63 
64 #include "common/logging.h"
65 #include "common/system_socket.h"
66 
/*
 * Sum the 16-bit network order (big-endian) words of a buffer
 *
 * data: bytes to sum
 * n:    number of bytes in data
 *
 * Returns the unfolded 32-bit sum.  Folding the carries and taking the
 * one's complement is left to the caller.
 *
 * An odd trailing byte is counted as the high-order byte of a final
 * word that is padded with a zero byte.  All words are assembled
 * arithmetically so that the result does not depend on the byte order
 * of the host.
 */
static uint32_t uint16_checksum(const uint8_t *data, size_t n)
{
	uint32_t sum = 0;

	while (n >= 2) {
		sum += ((uint32_t)data[0] << 8) | (uint32_t)data[1];
		data += 2;
		n -= 2;
	}
	if (n == 1) {
		/* Pad the last byte with a zero low-order byte */
		sum += (uint32_t)data[0] << 8;
	}
	return sum;
}
86 
87 /*
88  * See if the given IP is currently on an interface
89  */
90 bool ctdb_sys_have_ip(ctdb_sock_addr *_addr)
91 {
92 	int s;
93 	int ret;
94 	ctdb_sock_addr __addr = *_addr;
95 	ctdb_sock_addr *addr = &__addr;
96 	socklen_t addrlen = 0;
97 
98 	switch (addr->sa.sa_family) {
99 	case AF_INET:
100 		addr->ip.sin_port = 0;
101 		addrlen = sizeof(struct sockaddr_in);
102 		break;
103 	case AF_INET6:
104 		addr->ip6.sin6_port = 0;
105 		addrlen = sizeof(struct sockaddr_in6);
106 		break;
107 	}
108 
109 	s = socket(addr->sa.sa_family, SOCK_STREAM, IPPROTO_TCP);
110 	if (s == -1) {
111 		return false;
112 	}
113 
114 	ret = bind(s, (struct sockaddr *)addr, addrlen);
115 
116 	close(s);
117 	return ret == 0;
118 }
119 
120 /*
121  * simple TCP checksum - assumes data is multiple of 2 bytes long
122  */
123 static uint16_t ip_checksum(uint8_t *data, size_t n, struct ip *ip)
124 {
125 	uint32_t sum = uint16_checksum(data, n);
126 	uint16_t sum2;
127 
128 	sum += uint16_checksum((uint8_t *)&ip->ip_src, sizeof(ip->ip_src));
129 	sum += uint16_checksum((uint8_t *)&ip->ip_dst, sizeof(ip->ip_dst));
130 	sum += ip->ip_p + n;
131 	sum = (sum & 0xFFFF) + (sum >> 16);
132 	sum = (sum & 0xFFFF) + (sum >> 16);
133 	sum2 = htons(sum);
134 	sum2 = ~sum2;
135 	if (sum2 == 0) {
136 		return 0xFFFF;
137 	}
138 	return sum2;
139 }
140 
/*
 * Upper-layer checksum for IPv6
 *
 * data: start of the upper-layer header (TCP or ICMPv6 here)
 * n:    number of bytes to checksum
 * ip6:  IPv6 header supplying the pseudo-header fields (source
 *       address, destination address and next header)
 *
 * Returns the complemented checksum after conversion with htons(), or
 * 0xFFFF if the complemented value is 0.
 */
static uint16_t ip6_checksum(uint8_t *data, size_t n, struct ip6_hdr *ip6)
{
	/* The pseudo-header carries the length as a 32-bit quantity */
	uint32_t plen = (uint32_t)n;
	uint32_t acc;
	uint16_t csum;

	acc  = uint16_checksum((uint8_t *)&ip6->ip6_src, 16);
	acc += uint16_checksum((uint8_t *)&ip6->ip6_dst, 16);

	/*
	 * Remaining pseudo-header words: the two 16-bit halves of the
	 * length and the next header value.  ip6_nxt is only 8 bits, so
	 * it is a complete 16-bit word on its own.
	 */
	acc += (plen >> 16) + (plen & 0xFFFF);
	acc += ip6->ip6_nxt;

	acc += uint16_checksum(data, n);

	/* Fold twice: the first fold can itself produce a carry */
	acc = (acc >> 16) + (acc & 0xFFFF);
	acc = (acc >> 16) + (acc & 0xFFFF);

	csum = ~htons(acc);

	return (csum == 0) ? 0xFFFF : csum;
}
169 
170 /*
171  * Send gratuitous ARP request/reply or IPv6 neighbor advertisement
172  */
173 
174 #ifdef HAVE_PACKETSOCKET
175 
176 /*
177  * Create IPv4 ARP requests/replies or IPv6 neighbour advertisement
178  * packets
179  */
180 
/*
 * Size of an ethernet frame carrying an ARP packet.  Parenthesised so
 * that the macro expands to a single operand in any expression.
 */
#define ARP_STRUCT_SIZE (sizeof(struct ether_header) + \
			 sizeof(struct ether_arp))

/*
 * Size of an ethernet frame carrying an IPv6 neighbour advertisement
 * with one option holding an ethernet address
 */
#define IP6_NA_STRUCT_SIZE (sizeof(struct ether_header) + \
			    sizeof(struct ip6_hdr) + \
			    sizeof(struct nd_neighbor_advert) + \
			    sizeof(struct nd_opt_hdr) + \
			    sizeof(struct ether_addr))

/*
 * Frames are built in buffers of at least 64 bytes (presumably the
 * minimum ethernet frame size - TODO confirm)
 */
#define ARP_BUFFER_SIZE MAX(ARP_STRUCT_SIZE, 64)

#define IP6_NA_BUFFER_SIZE MAX(IP6_NA_STRUCT_SIZE, 64)
193 
194 static int arp_build(uint8_t *buffer,
195 		     size_t buflen,
196 		     const struct sockaddr_in *addr,
197 		     const struct ether_addr *hwaddr,
198 		     bool reply,
199 		     struct ether_addr **ether_dhost,
200 		     size_t *len)
201 {
202 	size_t l = ARP_BUFFER_SIZE;
203 	struct ether_header *eh;
204 	struct ether_arp *ea;
205 	struct arphdr *ah;
206 
207 	if (addr->sin_family != AF_INET) {
208 		return EINVAL;
209 	}
210 
211 	if (buflen < l) {
212 		return EMSGSIZE;
213 	}
214 
215 	memset(buffer, 0 , l);
216 
217 	eh = (struct ether_header *)buffer;
218 	memset(eh->ether_dhost, 0xff, ETH_ALEN);
219 	memcpy(eh->ether_shost, hwaddr, ETH_ALEN);
220 	eh->ether_type = htons(ETHERTYPE_ARP);
221 
222 	ea = (struct ether_arp *)(buffer + sizeof(struct ether_header));
223 	ah = &ea->ea_hdr;
224 	ah->ar_hrd = htons(ARPHRD_ETHER);
225 	ah->ar_pro = htons(ETH_P_IP);
226 	ah->ar_hln = ETH_ALEN;
227 	ah->ar_pln = sizeof(ea->arp_spa);
228 
229 	if (! reply) {
230 		ah->ar_op  = htons(ARPOP_REQUEST);
231 		memcpy(ea->arp_sha, hwaddr, ETH_ALEN);
232 		memcpy(ea->arp_spa, &addr->sin_addr, sizeof(ea->arp_spa));
233 		memset(ea->arp_tha, 0, ETH_ALEN);
234 		memcpy(ea->arp_tpa, &addr->sin_addr, sizeof(ea->arp_tpa));
235 	} else {
236 		ah->ar_op  = htons(ARPOP_REPLY);
237 		memcpy(ea->arp_sha, hwaddr, ETH_ALEN);
238 		memcpy(ea->arp_spa, &addr->sin_addr, sizeof(ea->arp_spa));
239 		memcpy(ea->arp_tha, hwaddr, ETH_ALEN);
240 		memcpy(ea->arp_tpa, &addr->sin_addr, sizeof(ea->arp_tpa));
241 	}
242 
243 	*ether_dhost = (struct ether_addr *)eh->ether_dhost;
244 	*len = l;
245 	return 0;
246 }
247 
248 static int ip6_na_build(uint8_t *buffer,
249 			size_t buflen,
250 			const struct sockaddr_in6 *addr,
251 			const struct ether_addr *hwaddr,
252 			struct ether_addr **ether_dhost,
253 			size_t *len)
254 {
255 	size_t l = IP6_NA_BUFFER_SIZE;
256 	struct ether_header *eh;
257 	struct ip6_hdr *ip6;
258 	struct nd_neighbor_advert *nd_na;
259 	struct nd_opt_hdr *nd_oh;
260 	struct ether_addr *ea;
261 	int ret;
262 
263 	if (addr->sin6_family != AF_INET6) {
264 		return EINVAL;
265 	}
266 
267 	if (buflen < l) {
268 		return EMSGSIZE;
269 	}
270 
271 	memset(buffer, 0 , l);
272 
273 	eh = (struct ether_header *)buffer;
274 	/*
275 	 * Ethernet multicast: 33:33:00:00:00:01 (see RFC2464,
276 	 * section 7) - note memset 0 above!
277 	 */
278 	eh->ether_dhost[0] = 0x33;
279 	eh->ether_dhost[1] = 0x33;
280 	eh->ether_dhost[5] = 0x01;
281 	memcpy(eh->ether_shost, hwaddr, ETH_ALEN);
282 	eh->ether_type = htons(ETHERTYPE_IP6);
283 
284 	ip6 = (struct ip6_hdr *)(buffer + sizeof(struct ether_header));
285 	ip6->ip6_vfc  = 6 << 4;
286 	ip6->ip6_plen = htons(sizeof(struct nd_neighbor_advert) +
287 			      sizeof(struct nd_opt_hdr) +
288 			      ETH_ALEN);
289 	ip6->ip6_nxt  = IPPROTO_ICMPV6;
290 	ip6->ip6_hlim = 255;
291 	ip6->ip6_src  = addr->sin6_addr;
292 	/* all-nodes multicast */
293 
294 	ret = inet_pton(AF_INET6, "ff02::1", &ip6->ip6_dst);
295 	if (ret != 1) {
296 		return EIO;
297 	}
298 
299 	nd_na = (struct nd_neighbor_advert *)(buffer +
300 					      sizeof(struct ether_header) +
301 					      sizeof(struct ip6_hdr));
302 	nd_na->nd_na_type = ND_NEIGHBOR_ADVERT;
303 	nd_na->nd_na_code = 0;
304 	nd_na->nd_na_flags_reserved = ND_NA_FLAG_OVERRIDE;
305 	nd_na->nd_na_target = addr->sin6_addr;
306 
307 	/* Option: Target link-layer address */
308 	nd_oh = (struct nd_opt_hdr *)(buffer +
309 				      sizeof(struct ether_header) +
310 				      sizeof(struct ip6_hdr) +
311 				      sizeof(struct nd_neighbor_advert));
312 	nd_oh->nd_opt_type = ND_OPT_TARGET_LINKADDR;
313 	nd_oh->nd_opt_len = 1;  /* multiple of 8 octets */
314 
315 	ea = (struct ether_addr *)(buffer +
316 				   sizeof(struct ether_header) +
317 				   sizeof(struct ip6_hdr) +
318 				   sizeof(struct nd_neighbor_advert) +
319 				   sizeof(struct nd_opt_hdr));
320 	memcpy(ea, hwaddr, ETH_ALEN);
321 
322 	nd_na->nd_na_cksum = ip6_checksum((uint8_t *)nd_na,
323 					  ntohs(ip6->ip6_plen),
324 					  ip6);
325 
326 	*ether_dhost = (struct ether_addr *)eh->ether_dhost;
327 	*len = l;
328 	return 0;
329 }
330 
331 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
332 {
333 	int s;
334 	struct sockaddr_ll sall = {0};
335 	struct ifreq if_hwaddr = {
336 		.ifr_ifru = {
337 			.ifru_flags = 0
338 		},
339 	};
340 	uint8_t buffer[MAX(ARP_BUFFER_SIZE, IP6_NA_BUFFER_SIZE)];
341 	struct ifreq ifr = {
342 		.ifr_ifru = {
343 			.ifru_flags = 0
344 		},
345 	};
346 	struct ether_addr *hwaddr = NULL;
347 	struct ether_addr *ether_dhost = NULL;
348 	size_t len = 0;
349 	int ret = 0;
350 
351 	s = socket(AF_PACKET, SOCK_RAW, 0);
352 	if (s == -1) {
353 		ret = errno;
354 		DBG_ERR("Failed to open raw socket\n");
355 		return ret;
356 	}
357 	DBG_DEBUG("Created SOCKET FD:%d for sending arp\n", s);
358 
359 	/* Find interface */
360 	strlcpy(ifr.ifr_name, iface, sizeof(ifr.ifr_name));
361 	if (ioctl(s, SIOCGIFINDEX, &ifr) < 0) {
362 		ret = errno;
363 		DBG_ERR("Interface '%s' not found\n", iface);
364 		goto fail;
365 	}
366 
367 	/* Get MAC address */
368 	strlcpy(if_hwaddr.ifr_name, iface, sizeof(if_hwaddr.ifr_name));
369 	ret = ioctl(s, SIOCGIFHWADDR, &if_hwaddr);
370 	if ( ret < 0 ) {
371 		ret = errno;
372 		DBG_ERR("ioctl failed\n");
373 		goto fail;
374 	}
375 	if (ARPHRD_LOOPBACK == if_hwaddr.ifr_hwaddr.sa_family) {
376 		ret = 0;
377 		D_DEBUG("Ignoring loopback arp request\n");
378 		goto fail;
379 	}
380 	if (if_hwaddr.ifr_hwaddr.sa_family != ARPHRD_ETHER) {
381 		ret = EINVAL;
382 		DBG_ERR("Not an ethernet address family (0x%x)\n",
383 			if_hwaddr.ifr_hwaddr.sa_family);
384 		goto fail;;
385 	}
386 
387 	/* Set up most of destination address structure */
388 	sall.sll_family = AF_PACKET;
389 	sall.sll_halen = sizeof(struct ether_addr);
390 	sall.sll_protocol = htons(ETH_P_ALL);
391 	sall.sll_ifindex = ifr.ifr_ifindex;
392 
393 	/* For clarity */
394 	hwaddr = (struct ether_addr *)if_hwaddr.ifr_hwaddr.sa_data;
395 
396 	switch (addr->ip.sin_family) {
397 	case AF_INET:
398 		/* Send gratuitous ARP */
399 		ret = arp_build(buffer,
400 				sizeof(buffer),
401 				&addr->ip,
402 				hwaddr,
403 				false,
404 				&ether_dhost,
405 				&len);
406 		if (ret != 0) {
407 			DBG_ERR("Failed to build ARP request\n");
408 			goto fail;
409 		}
410 
411 		memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
412 
413 		ret = sendto(s,
414 			     buffer,
415 			     len,
416 			     0,
417 			     (struct sockaddr *)&sall,
418 			     sizeof(sall));
419 		if (ret < 0 ) {
420 			ret = errno;
421 			DBG_ERR("Failed sendto\n");
422 			goto fail;
423 		}
424 
425 		/* Send unsolicited ARP reply */
426 		ret = arp_build(buffer,
427 				sizeof(buffer),
428 				&addr->ip,
429 				hwaddr,
430 				true,
431 				&ether_dhost,
432 				&len);
433 		if (ret != 0) {
434 			DBG_ERR("Failed to build ARP reply\n");
435 			goto fail;
436 		}
437 
438 		memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
439 
440 		ret = sendto(s,
441 			     buffer,
442 			     len,
443 			     0,
444 			     (struct sockaddr *)&sall,
445 			     sizeof(sall));
446 		if (ret < 0 ) {
447 			ret = errno;
448 			DBG_ERR("Failed sendto\n");
449 			goto fail;
450 		}
451 
452 		close(s);
453 		break;
454 
455 	case AF_INET6:
456 		ret = ip6_na_build(buffer,
457 				   sizeof(buffer),
458 				   &addr->ip6,
459 				   hwaddr,
460 				   &ether_dhost,
461 				   &len);
462 		if (ret != 0) {
463 			DBG_ERR("Failed to build IPv6 neighbor advertisement\n");
464 			goto fail;
465 		}
466 
467 		memcpy(&sall.sll_addr[0], ether_dhost, sall.sll_halen);
468 
469 		ret = sendto(s,
470 			     buffer,
471 			     len,
472 			     0,
473 			     (struct sockaddr *)&sall,
474 			     sizeof(sall));
475 		if (ret < 0 ) {
476 			ret = errno;
477 			DBG_ERR("Failed sendto\n");
478 			goto fail;
479 		}
480 
481 		close(s);
482 		break;
483 
484 	default:
485 		ret = EINVAL;
486 		DBG_ERR("Not an ipv4/ipv6 address (family is %u)\n",
487 			addr->ip.sin_family);
488 		goto fail;
489 	}
490 
491 	return 0;
492 
493 fail:
494 	close(s);
495 	return ret;
496 }
497 
498 #else /* HAVE_PACKETSOCKET */
499 
500 int ctdb_sys_send_arp(const ctdb_sock_addr *addr, const char *iface)
501 {
502 	/* Not implemented */
503 	return ENOSYS;
504 }
505 
506 #endif /* HAVE_PACKETSOCKET */
507 
508 
/*
 * Size of an IPv4 header followed by a TCP header, both without
 * options.  Parenthesised so that the macro expands to a single
 * operand in any expression.
 */
#define IP4_TCP_BUFFER_SIZE (sizeof(struct ip) + \
			     sizeof(struct tcphdr))

/* Size of an IPv6 header followed by a TCP header without options */
#define IP6_TCP_BUFFER_SIZE (sizeof(struct ip6_hdr) + \
			     sizeof(struct tcphdr))
514 
515 static int tcp4_build(uint8_t *buf,
516 		      size_t buflen,
517 		      const struct sockaddr_in *src,
518 		      const struct sockaddr_in *dst,
519 		      uint32_t seq,
520 		      uint32_t ack,
521 		      int rst,
522 		      size_t *len)
523 {
524 	size_t l = IP4_TCP_BUFFER_SIZE;
525 	struct {
526 		struct ip ip;
527 		struct tcphdr tcp;
528 	} *ip4pkt;
529 
530 	if (l != sizeof(*ip4pkt)) {
531 		return EMSGSIZE;
532 	}
533 
534 	if (buflen < l) {
535 		return EMSGSIZE;
536 	}
537 
538 	ip4pkt = (void *)buf;
539 	memset(ip4pkt, 0, l);
540 
541 	ip4pkt->ip.ip_v     = 4;
542 	ip4pkt->ip.ip_hl    = sizeof(ip4pkt->ip)/sizeof(uint32_t);
543 	ip4pkt->ip.ip_len   = htons(sizeof(ip4pkt));
544 	ip4pkt->ip.ip_ttl   = 255;
545 	ip4pkt->ip.ip_p     = IPPROTO_TCP;
546 	ip4pkt->ip.ip_src.s_addr = src->sin_addr.s_addr;
547 	ip4pkt->ip.ip_dst.s_addr = dst->sin_addr.s_addr;
548 	ip4pkt->ip.ip_sum   = 0;
549 
550 	ip4pkt->tcp.th_sport = src->sin_port;
551 	ip4pkt->tcp.th_dport = dst->sin_port;
552 	ip4pkt->tcp.th_seq   = seq;
553 	ip4pkt->tcp.th_ack   = ack;
554 	ip4pkt->tcp.th_flags = 0;
555 	ip4pkt->tcp.th_flags |= TH_ACK;
556 	if (rst) {
557 		ip4pkt->tcp.th_flags |= TH_RST;
558 	}
559 	ip4pkt->tcp.th_off   = sizeof(ip4pkt->tcp)/sizeof(uint32_t);
560 	/* this makes it easier to spot in a sniffer */
561 	ip4pkt->tcp.th_win   = htons(1234);
562 	ip4pkt->tcp.th_sum   = ip_checksum((uint8_t *)&ip4pkt->tcp,
563 					   sizeof(ip4pkt->tcp),
564 					   &ip4pkt->ip);
565 
566 	*len = l;
567 	return 0;
568 }
569 
570 static int tcp6_build(uint8_t *buf,
571 		      size_t buflen,
572 		      const struct sockaddr_in6 *src,
573 		      const struct sockaddr_in6 *dst,
574 		      uint32_t seq,
575 		      uint32_t ack,
576 		      int rst,
577 		      size_t *len)
578 {
579 	size_t l = IP6_TCP_BUFFER_SIZE;
580 	struct {
581 		struct ip6_hdr ip6;
582 		struct tcphdr tcp;
583 	} *ip6pkt;
584 
585 	if (l != sizeof(*ip6pkt)) {
586 		return EMSGSIZE;
587 	}
588 
589 	if (buflen < l) {
590 		return EMSGSIZE;
591 	}
592 
593 	ip6pkt = (void *)buf;
594 	memset(ip6pkt, 0, l);
595 
596 	ip6pkt->ip6.ip6_vfc  = 6 << 4;
597 	ip6pkt->ip6.ip6_plen = htons(sizeof(struct tcphdr));
598 	ip6pkt->ip6.ip6_nxt  = IPPROTO_TCP;
599 	ip6pkt->ip6.ip6_hlim = 64;
600 	ip6pkt->ip6.ip6_src  = src->sin6_addr;
601 	ip6pkt->ip6.ip6_dst  = dst->sin6_addr;
602 
603 	ip6pkt->tcp.th_sport = src->sin6_port;
604 	ip6pkt->tcp.th_dport = dst->sin6_port;
605 	ip6pkt->tcp.th_seq   = seq;
606 	ip6pkt->tcp.th_ack   = ack;
607 	ip6pkt->tcp.th_flags = 0;
608 	ip6pkt->tcp.th_flags |= TH_ACK;
609 	if (rst) {
610 		ip6pkt->tcp.th_flags |= TH_RST;
611 	}
612 	ip6pkt->tcp.th_off    = sizeof(ip6pkt->tcp)/sizeof(uint32_t);
613 	/* this makes it easier to spot in a sniffer */
614 	ip6pkt->tcp.th_win   = htons(1234);
615 	ip6pkt->tcp.th_sum   = ip6_checksum((uint8_t *)&ip6pkt->tcp,
616 					    sizeof(ip6pkt->tcp),
617 					    &ip6pkt->ip6);
618 
619 	*len = l;
620 	return 0;
621 }
622 
623 /*
624  * Send tcp segment from the specified IP/port to the specified
625  * destination IP/port.
626  *
627  * This is used to trigger the receiving host into sending its own ACK,
628  * which should trigger early detection of TCP reset by the client
629  * after IP takeover
630  *
631  * This can also be used to send RST segments (if rst is true) and also
632  * if correct seq and ack numbers are provided.
633  */
634 int ctdb_sys_send_tcp(const ctdb_sock_addr *dest,
635 		      const ctdb_sock_addr *src,
636 		      uint32_t seq,
637 		      uint32_t ack,
638 		      int rst)
639 {
640 	uint8_t buf[MAX(IP4_TCP_BUFFER_SIZE, IP6_TCP_BUFFER_SIZE)];
641 	size_t len = 0;
642 	int ret;
643 	int s;
644 	uint32_t one = 1;
645 	struct sockaddr_in6 tmpdest = { 0 };
646 	int saved_errno;
647 
648 	switch (src->ip.sin_family) {
649 	case AF_INET:
650 		ret = tcp4_build(buf,
651 				 sizeof(buf),
652 				 &src->ip,
653 				 &dest->ip,
654 				 seq,
655 				 ack,
656 				 rst,
657 				 &len);
658 		if (ret != 0) {
659 			DBG_ERR("Failed to build TCP packet (%d)\n", ret);
660 			return ret;
661 		}
662 
663 		/* open a raw socket to send this segment from */
664 		s = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);
665 		if (s == -1) {
666 			DBG_ERR("Failed to open raw socket (%s)\n",
667 				strerror(errno));
668 			return -1;
669 		}
670 
671 		ret = setsockopt(s, IPPROTO_IP, IP_HDRINCL, &one, sizeof(one));
672 		if (ret != 0) {
673 			DBG_ERR("Failed to setup IP headers (%s)\n",
674 				strerror(errno));
675 			close(s);
676 			return -1;
677 		}
678 
679 		ret = sendto(s,
680 			     buf,
681 			     len,
682 			     0,
683 			     (const struct sockaddr *)&dest->ip,
684 			     sizeof(dest->ip));
685 		saved_errno = errno;
686 		close(s);
687 		if (ret == -1) {
688 			D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
689 			return -1;
690 		}
691 		if ((size_t)ret != len) {
692 			DBG_ERR("Failed sendto - didn't send full packet\n");
693 			return -1;
694 		}
695 		break;
696 
697 	case AF_INET6:
698 		ret = tcp6_build(buf,
699 				 sizeof(buf),
700 				 &src->ip6,
701 				 &dest->ip6,
702 				 seq,
703 				 ack,
704 				 rst,
705 				 &len);
706 		if (ret != 0) {
707 			DBG_ERR("Failed to build TCP packet (%d)\n", ret);
708 			return ret;
709 		}
710 
711 		s = socket(AF_INET6, SOCK_RAW, IPPROTO_RAW);
712 		if (s == -1) {
713 			DBG_ERR("Failed to open sending socket\n");
714 			return -1;
715 
716 		}
717 		/*
718 		 * sendto() on an IPv6 raw socket requires the port to
719 		 * be either 0 or a protocol value
720 		 */
721 		tmpdest = dest->ip6;
722 		tmpdest.sin6_port = 0;
723 
724 		ret = sendto(s,
725 			     buf,
726 			     len,
727 			     0,
728 			     (const struct sockaddr *)&tmpdest,
729 			     sizeof(tmpdest));
730 		saved_errno = errno;
731 		close(s);
732 		if (ret == -1) {
733 			D_ERR("Failed sendto (%s)\n", strerror(saved_errno));
734 			return -1;
735 		}
736 		if ((size_t)ret != len) {
737 			DBG_ERR("Failed sendto - didn't send full packet\n");
738 			return -1;
739 		}
740 		break;
741 
742 	default:
743 		DBG_ERR("Not an ipv4/v6 address\n");
744 		return -1;
745 	}
746 
747 	return 0;
748 }
749 
750 /*
751  * Packet capture
752  *
753  * If AF_PACKET is available then use a raw socket otherwise use pcap.
754  * wscript has checked to make sure that pcap is available if needed.
755  */
756 
757 static int tcp4_extract(const uint8_t *ip_pkt,
758 			size_t pktlen,
759 			struct sockaddr_in *src,
760 			struct sockaddr_in *dst,
761 			uint32_t *ack_seq,
762 			uint32_t *seq,
763 			int *rst,
764 			uint16_t *window)
765 {
766 	const struct ip *ip;
767 	const struct tcphdr *tcp;
768 
769 	if (pktlen < sizeof(struct ip)) {
770 		return EMSGSIZE;
771 	}
772 
773 	ip = (const struct ip *)ip_pkt;
774 
775 	/* IPv4 only */
776 	if (ip->ip_v != 4) {
777 		return ENOMSG;
778 	}
779 	/* Don't look at fragments */
780 	if ((ntohs(ip->ip_off)&0x1fff) != 0) {
781 		return ENOMSG;
782 	}
783 	/* TCP only */
784 	if (ip->ip_p != IPPROTO_TCP) {
785 		return ENOMSG;
786 	}
787 
788 	/* Ensure there is enough of the packet to gather required fields */
789 	if (pktlen <
790 	    (ip->ip_hl * sizeof(uint32_t)) + offsetof(struct tcphdr, th_sum)) {
791 		return EMSGSIZE;
792 	}
793 
794 	tcp = (const struct tcphdr *)(ip_pkt + (ip->ip_hl * sizeof(uint32_t)));
795 
796 	src->sin_family      = AF_INET;
797 	src->sin_addr.s_addr = ip->ip_src.s_addr;
798 	src->sin_port        = tcp->th_sport;
799 
800 	dst->sin_family      = AF_INET;
801 	dst->sin_addr.s_addr = ip->ip_dst.s_addr;
802 	dst->sin_port        = tcp->th_dport;
803 
804 	*ack_seq             = tcp->th_ack;
805 	*seq                 = tcp->th_seq;
806 	if (window != NULL) {
807 		*window = tcp->th_win;
808 	}
809 	if (rst != NULL) {
810 		*rst = tcp->th_flags & TH_RST;
811 	}
812 
813 	return 0;
814 }
815 
816 static int tcp6_extract(const uint8_t *ip_pkt,
817 			size_t pktlen,
818 			struct sockaddr_in6 *src,
819 			struct sockaddr_in6 *dst,
820 			uint32_t *ack_seq,
821 			uint32_t *seq,
822 			int *rst,
823 			uint16_t *window)
824 {
825 	const struct ip6_hdr *ip6;
826 	const struct tcphdr *tcp;
827 
828 	/* Ensure there is enough of the packet to gather required fields */
829 	if (pktlen < sizeof(struct ip6_hdr) + offsetof(struct tcphdr, th_sum)) {
830 		return EMSGSIZE;
831 	}
832 
833 	ip6 = (const struct ip6_hdr *)ip_pkt;
834 
835 	/* IPv6 only */
836 	if ((ip6->ip6_vfc >> 4) != 6){
837 		return ENOMSG;
838 	}
839 
840 	/* TCP only */
841 	if (ip6->ip6_nxt != IPPROTO_TCP) {
842 		return ENOMSG;
843 	}
844 
845 	tcp = (const struct tcphdr *)(ip_pkt + sizeof(struct ip6_hdr));
846 
847 	src->sin6_family = AF_INET6;
848 	src->sin6_port   = tcp->th_sport;
849 	src->sin6_addr   = ip6->ip6_src;
850 
851 	dst->sin6_family = AF_INET6;
852 	dst->sin6_port   = tcp->th_dport;
853 	dst->sin6_addr   = ip6->ip6_dst;
854 
855 	*ack_seq             = tcp->th_ack;
856 	*seq                 = tcp->th_seq;
857 	if (window != NULL) {
858 		*window = tcp->th_win;
859 	}
860 	if (rst != NULL) {
861 		*rst = tcp->th_flags & TH_RST;
862 	}
863 
864 	return 0;
865 }
866 
867 
868 #ifdef HAVE_AF_PACKET
869 
/*
 * Open a non-blocking, close-on-exec raw packet socket that captures
 * all traffic
 *
 * iface and private_data are not used by this variant.
 *
 * Returns the socket on success or -1 on failure.
 */
int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
{
	int fd;

	/* Open a socket to capture all traffic */
	fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd == -1) {
		DBG_ERR("Failed to open raw socket\n");
		return -1;
	}

	DBG_DEBUG("Created RAW SOCKET FD:%d for tcp tickle\n", fd);

	if (set_blocking(fd, false) != 0) {
		DBG_ERR("Failed to set socket non-blocking (%s)\n",
			strerror(errno));
		close(fd);
		return -1;
	}

	set_close_on_exec(fd);

	return fd;
}
898 
/*
 * Additional cleanup required when closing a capture socket: there is
 * none for a raw socket, so this always returns 0.
 * Note that the socket itself is closed automatically in the caller.
 */
int ctdb_sys_close_capture_socket(void *private_data)
{
	(void)private_data;

	return 0;
}
908 
909 
910 /*
911  * called when the raw socket becomes readable
912  */
913 int ctdb_sys_read_tcp_packet(int s, void *private_data,
914 			     ctdb_sock_addr *src,
915 			     ctdb_sock_addr *dst,
916 			     uint32_t *ack_seq,
917 			     uint32_t *seq,
918 			     int *rst,
919 			     uint16_t *window)
920 {
921 	ssize_t nread;
922 	uint8_t pkt[100]; /* Large enough for simple ACK/RST packets */
923 	struct ether_header *eth;
924 	int ret;
925 
926 	nread = recv(s, pkt, sizeof(pkt), MSG_TRUNC);
927 	if (nread == -1) {
928 		return errno;
929 	}
930 	if ((size_t)nread < sizeof(*eth)) {
931 		return EMSGSIZE;
932 	}
933 
934 	ZERO_STRUCTP(src);
935 	ZERO_STRUCTP(dst);
936 
937 	/* Ethernet */
938 	eth = (struct ether_header *)pkt;
939 
940 	/* we want either IPv4 or IPv6 */
941 	if (ntohs(eth->ether_type) == ETHERTYPE_IP) {
942 		ret = tcp4_extract(pkt + sizeof(struct ether_header),
943 				   (size_t)nread - sizeof(struct ether_header),
944 				   &src->ip,
945 				   &dst->ip,
946 				   ack_seq,
947 				   seq,
948 				   rst,
949 				   window);
950 		return ret;
951 
952 	} else if (ntohs(eth->ether_type) == ETHERTYPE_IP6) {
953 		ret = tcp6_extract(pkt + sizeof(struct ether_header),
954 				   (size_t)nread - sizeof(struct ether_header),
955 				   &src->ip6,
956 				   &dst->ip6,
957 				   ack_seq,
958 				   seq,
959 				   rst,
960 				   window);
961 		return ret;
962 	}
963 
964 	return ENOMSG;
965 }
966 
967 #else /* HAVE_AF_PACKET */
968 
969 #include <pcap.h>
970 
971 int ctdb_sys_open_capture_socket(const char *iface, void **private_data)
972 {
973 	pcap_t *pt;
974 
975 	pt=pcap_open_live(iface, 100, 0, 0, NULL);
976 	if (pt == NULL) {
977 		DBG_ERR("Failed to open capture device %s\n", iface);
978 		return -1;
979 	}
980 	*((pcap_t **)private_data) = pt;
981 
982 	return pcap_fileno(pt);
983 }
984 
985 int ctdb_sys_close_capture_socket(void *private_data)
986 {
987 	pcap_t *pt = (pcap_t *)private_data;
988 	pcap_close(pt);
989 	return 0;
990 }
991 
992 int ctdb_sys_read_tcp_packet(int s,
993 			     void *private_data,
994 			     ctdb_sock_addr *src,
995 			     ctdb_sock_addr *dst,
996 			     uint32_t *ack_seq,
997 			     uint32_t *seq,
998 			     int *rst,
999 			     uint16_t *window)
1000 {
1001 	int ret;
1002 	struct ether_header *eth;
1003 	struct pcap_pkthdr pkthdr;
1004 	const u_char *buffer;
1005 	pcap_t *pt = (pcap_t *)private_data;
1006 
1007 	buffer=pcap_next(pt, &pkthdr);
1008 	if (buffer==NULL) {
1009 		return ENOMSG;
1010 	}
1011 
1012 	ZERO_STRUCTP(src);
1013 	ZERO_STRUCTP(dst);
1014 
1015 	/* Ethernet */
1016 	eth = (struct ether_header *)buffer;
1017 
1018 	/* we want either IPv4 or IPv6 */
1019 	if (eth->ether_type == htons(ETHERTYPE_IP)) {
1020 		ret = tcp4_extract(buffer + sizeof(struct ether_header),
1021 				   (size_t)(pkthdr.caplen -
1022 					    sizeof(struct ether_header)),
1023 				   &src->ip,
1024 				   &dst->ip,
1025 				   ack_seq,
1026 				   seq,
1027 				   rst,
1028 				   window);
1029 		return ret;
1030 
1031 	} else if (eth->ether_type == htons(ETHERTYPE_IP6)) {
1032 		ret = tcp6_extract(buffer + sizeof(struct ether_header),
1033 				   (size_t)(pkthdr.caplen -
1034 					    sizeof(struct ether_header)),
1035 				   &src->ip6,
1036 				   &dst->ip6,
1037 				   ack_seq,
1038 				   seq,
1039 				   rst,
1040 				   window);
1041 		return ret;
1042 	}
1043 
1044 	return ENOMSG;
1045 }
1046 
1047 #endif /* HAVE_AF_PACKET */
1048