1 /*
2  * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3  *
4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
5  *
6  * This software is open source.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  *
15  * Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  *
19  * Neither the name of the NLNET LABS nor the names of its contributors may
20  * be used to endorse or promote products derived from this software without
21  * specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 /**
37  * \file
38  *
39  * This file has functions to get queries from clients.
40  */
41 #include "config.h"
42 #ifdef HAVE_SYS_TYPES_H
43 #  include <sys/types.h>
44 #endif
45 #include <sys/time.h>
46 #include <limits.h>
47 #ifdef USE_TCP_FASTOPEN
48 #include <netinet/tcp.h>
49 #endif
50 #include <ctype.h>
51 #include "services/listen_dnsport.h"
52 #include "services/outside_network.h"
53 #include "util/netevent.h"
54 #include "util/log.h"
55 #include "util/config_file.h"
56 #include "util/net_help.h"
57 #include "sldns/sbuffer.h"
58 #include "sldns/parseutil.h"
59 #include "services/mesh.h"
60 #include "util/fptr_wlist.h"
61 #include "util/locks.h"
62 
63 #ifdef HAVE_NETDB_H
64 #include <netdb.h>
65 #endif
66 #include <fcntl.h>
67 
68 #ifdef HAVE_SYS_UN_H
69 #include <sys/un.h>
70 #endif
71 
72 #ifdef HAVE_SYSTEMD
73 #include <systemd/sd-daemon.h>
74 #endif
75 
76 #ifdef HAVE_IFADDRS_H
77 #include <ifaddrs.h>
78 #endif
79 #ifdef HAVE_NET_IF_H
80 #include <net/if.h>
81 #endif
82 #ifdef HAVE_LINUX_NET_TSTAMP_H
83 #include <linux/net_tstamp.h>
84 #endif
85 /** number of queued TCP connections for listen() */
86 #define TCP_BACKLOG 256
87 
88 #ifndef THREADS_DISABLED
89 /** lock on the counter of stream buffer memory */
90 static lock_basic_type stream_wait_count_lock;
91 /** lock on the counter of HTTP2 query buffer memory */
92 static lock_basic_type http2_query_buffer_count_lock;
93 /** lock on the counter of HTTP2 response buffer memory */
94 static lock_basic_type http2_response_buffer_count_lock;
95 #endif
96 /** size (in bytes) of stream wait buffers */
97 static size_t stream_wait_count = 0;
98 /** is the lock initialised for stream wait buffers */
99 static int stream_wait_lock_inited = 0;
100 /** size (in bytes) of HTTP2 query buffers */
101 static size_t http2_query_buffer_count = 0;
102 /** is the lock initialised for HTTP2 query buffers */
103 static int http2_query_buffer_lock_inited = 0;
104 /** size (in bytes) of HTTP2 response buffers */
105 static size_t http2_response_buffer_count = 0;
106 /** is the lock initialised for HTTP2 response buffers */
107 static int http2_response_buffer_lock_inited = 0;
108 
109 /**
110  * Debug print of the getaddrinfo returned address.
111  * @param addr: the address returned.
112  */
113 static void
114 verbose_print_addr(struct addrinfo *addr)
115 {
116 	if(verbosity >= VERB_ALGO) {
117 		char buf[100];
118 		void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
119 #ifdef INET6
120 		if(addr->ai_family == AF_INET6)
121 			sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
122 				sin6_addr;
123 #endif /* INET6 */
124 		if(inet_ntop(addr->ai_family, sinaddr, buf,
125 			(socklen_t)sizeof(buf)) == 0) {
126 			(void)strlcpy(buf, "(null)", sizeof(buf));
127 		}
128 		buf[sizeof(buf)-1] = 0;
129 		verbose(VERB_ALGO, "creating %s%s socket %s %d",
130 			addr->ai_socktype==SOCK_DGRAM?"udp":
131 			addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
132 			addr->ai_family==AF_INET?"4":
133 			addr->ai_family==AF_INET6?"6":
134 			"_otherfam", buf,
135 			ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port));
136 	}
137 }
138 
139 void
140 verbose_print_unbound_socket(struct unbound_socket* ub_sock)
141 {
142 	if(verbosity >= VERB_ALGO) {
143 		log_info("listing of unbound_socket structure:");
144 		verbose_print_addr(ub_sock->addr);
145 		log_info("s is: %d, fam is: %s, acl: %s", ub_sock->s,
146 			ub_sock->fam == AF_INET?"AF_INET":"AF_INET6",
147 			ub_sock->acl?"yes":"no");
148 	}
149 }
150 
#ifdef HAVE_SYSTEMD
/**
 * Find a socket that was passed in by systemd socket activation.
 * The descriptors reported by sd_listen_fds() are scanned in order and the
 * first one for which sd_is_socket() confirms family, type and listening
 * state is returned. The address itself is not compared.
 * @param family: address family passed to sd_is_socket().
 * @param socktype: socket type passed to sd_is_socket().
 * @param listen: listening state passed to sd_is_socket(); callers in this
 *	file pass 1 for stream sockets and -1 for UDP.
 * @param addr: address, only used in the "no such socket" error message.
 *	May be NULL, then path is printed instead.
 * @param addrlen: length of addr.
 * @param path: local socket path, only used in the error message when
 *	addr is NULL.
 * @return the file descriptor, or -1 if none was found or on error.
 */
static int
systemd_get_activated(int family, int socktype, int listen,
		      struct sockaddr *addr, socklen_t addrlen,
		      const char *path)
{
	int i = 0;
	int r = 0;
	int s = -1;
	const char* listen_pid, *listen_fds;

	/* We should use "listen" option only for stream protocols. For UDP it should be -1 */

	if((r = sd_booted()) < 1) {
		if(r == 0)
			log_warn("systemd is not running");
		else
			log_err("systemd sd_booted(): %s", strerror(-r));
		return -1;
	}

	listen_pid = getenv("LISTEN_PID");
	listen_fds = getenv("LISTEN_FDS");

	if (!listen_pid) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
		return -1;
	}

	if (!listen_fds) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
		return -1;
	}

	if((r = sd_listen_fds(0)) < 1) {
		if(r == 0)
			log_warn("systemd: did not return socket, check unit configuration");
		else
			log_err("systemd sd_listen_fds(): %s", strerror(-r));
		return -1;
	}

	for(i = 0; i < r; i++) {
		int fd = SD_LISTEN_FDS_START + i;
		int match = sd_is_socket(fd, family, socktype, listen);
		if(match < 0) {
			/* negative errno-style code: the descriptor could not
			 * be inspected, so it must not be taken as a match */
			log_err("systemd sd_is_socket(): %s",
				strerror(-match));
			continue;
		}
		if(match > 0) {
			s = fd;
			break;
		}
	}
	if (s == -1) {
		if (addr)
			log_err_addr("systemd sd_listen_fds()",
				     "no such socket",
				     (struct sockaddr_storage *)addr, addrlen);
		else
			log_err("systemd sd_listen_fds(): %s",
				path?path:"(null)");
	}
	return s;
}
#endif
210 
211 int
212 create_udp_sock(int family, int socktype, struct sockaddr* addr,
213         socklen_t addrlen, int v6only, int* inuse, int* noproto,
214 	int rcv, int snd, int listen, int* reuseport, int transparent,
215 	int freebind, int use_systemd, int dscp)
216 {
217 	int s;
218 	char* err;
219 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
220 	int on=1;
221 #endif
222 #ifdef IPV6_MTU
223 	int mtu = IPV6_MIN_MTU;
224 #endif
225 #if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
226 	(void)rcv;
227 #endif
228 #if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
229 	(void)snd;
230 #endif
231 #ifndef IPV6_V6ONLY
232 	(void)v6only;
233 #endif
234 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
235 	(void)transparent;
236 #endif
237 #if !defined(IP_FREEBIND)
238 	(void)freebind;
239 #endif
240 #ifdef HAVE_SYSTEMD
241 	int got_fd_from_systemd = 0;
242 
243 	if (!use_systemd
244 	    || (use_systemd
245 		&& (s = systemd_get_activated(family, socktype, -1, addr,
246 					      addrlen, NULL)) == -1)) {
247 #else
248 	(void)use_systemd;
249 #endif
250 	if((s = socket(family, socktype, 0)) == -1) {
251 		*inuse = 0;
252 #ifndef USE_WINSOCK
253 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
254 			*noproto = 1;
255 			return -1;
256 		}
257 #else
258 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
259 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
260 			*noproto = 1;
261 			return -1;
262 		}
263 #endif
264 		log_err("can't create socket: %s", sock_strerror(errno));
265 		*noproto = 0;
266 		return -1;
267 	}
268 #ifdef HAVE_SYSTEMD
269 	} else {
270 		got_fd_from_systemd = 1;
271 	}
272 #endif
273 	if(listen) {
274 #ifdef SO_REUSEADDR
275 		if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
276 			(socklen_t)sizeof(on)) < 0) {
277 			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
278 				sock_strerror(errno));
279 #ifndef USE_WINSOCK
280 			if(errno != ENOSYS) {
281 				close(s);
282 				*noproto = 0;
283 				*inuse = 0;
284 				return -1;
285 			}
286 #else
287 			closesocket(s);
288 			*noproto = 0;
289 			*inuse = 0;
290 			return -1;
291 #endif
292 		}
293 #endif /* SO_REUSEADDR */
294 #ifdef SO_REUSEPORT
295 #  ifdef SO_REUSEPORT_LB
296 		/* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
297 		 * like SO_REUSEPORT on Linux.  This is what the users want
298 		 * with the config option in unbound.conf; if we actually
299 		 * need local address and port reuse they'll also need to
300 		 * have SO_REUSEPORT set for them, assume it was _LB they want.
301 		 */
302 		if (reuseport && *reuseport &&
303 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
304 			(socklen_t)sizeof(on)) < 0) {
305 #ifdef ENOPROTOOPT
306 			if(errno != ENOPROTOOPT || verbosity >= 3)
307 				log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
308 					strerror(errno));
309 #endif
310 			/* this option is not essential, we can continue */
311 			*reuseport = 0;
312 		}
313 #  else /* no SO_REUSEPORT_LB */
314 
315 		/* try to set SO_REUSEPORT so that incoming
316 		 * queries are distributed evenly among the receiving threads.
317 		 * Each thread must have its own socket bound to the same port,
318 		 * with SO_REUSEPORT set on each socket.
319 		 */
320 		if (reuseport && *reuseport &&
321 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
322 			(socklen_t)sizeof(on)) < 0) {
323 #ifdef ENOPROTOOPT
324 			if(errno != ENOPROTOOPT || verbosity >= 3)
325 				log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
326 					strerror(errno));
327 #endif
328 			/* this option is not essential, we can continue */
329 			*reuseport = 0;
330 		}
331 #  endif /* SO_REUSEPORT_LB */
332 #else
333 		(void)reuseport;
334 #endif /* defined(SO_REUSEPORT) */
335 #ifdef IP_TRANSPARENT
336 		if (transparent &&
337 		    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
338 		    (socklen_t)sizeof(on)) < 0) {
339 			log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
340 			strerror(errno));
341 		}
342 #elif defined(IP_BINDANY)
343 		if (transparent &&
344 		    setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
345 		    (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
346 		    (void*)&on, (socklen_t)sizeof(on)) < 0) {
347 			log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
348 			(family==AF_INET6?"V6":""), strerror(errno));
349 		}
350 #elif defined(SO_BINDANY)
351 		if (transparent &&
352 		    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
353 		    (socklen_t)sizeof(on)) < 0) {
354 			log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
355 			strerror(errno));
356 		}
357 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
358 	}
359 #ifdef IP_FREEBIND
360 	if(freebind &&
361 	    setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
362 	    (socklen_t)sizeof(on)) < 0) {
363 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
364 		strerror(errno));
365 	}
366 #endif /* IP_FREEBIND */
367 	if(rcv) {
368 #ifdef SO_RCVBUF
369 		int got;
370 		socklen_t slen = (socklen_t)sizeof(got);
371 #  ifdef SO_RCVBUFFORCE
372 		/* Linux specific: try to use root permission to override
373 		 * system limits on rcvbuf. The limit is stored in
374 		 * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
375 		if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
376 			(socklen_t)sizeof(rcv)) < 0) {
377 			if(errno != EPERM) {
378 				log_err("setsockopt(..., SO_RCVBUFFORCE, "
379 					"...) failed: %s", sock_strerror(errno));
380 				sock_close(s);
381 				*noproto = 0;
382 				*inuse = 0;
383 				return -1;
384 			}
385 #  endif /* SO_RCVBUFFORCE */
386 			if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
387 				(socklen_t)sizeof(rcv)) < 0) {
388 				log_err("setsockopt(..., SO_RCVBUF, "
389 					"...) failed: %s", sock_strerror(errno));
390 				sock_close(s);
391 				*noproto = 0;
392 				*inuse = 0;
393 				return -1;
394 			}
395 			/* check if we got the right thing or if system
396 			 * reduced to some system max.  Warn if so */
397 			if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got,
398 				&slen) >= 0 && got < rcv/2) {
399 				log_warn("so-rcvbuf %u was not granted. "
400 					"Got %u. To fix: start with "
401 					"root permissions(linux) or sysctl "
402 					"bigger net.core.rmem_max(linux) or "
403 					"kern.ipc.maxsockbuf(bsd) values.",
404 					(unsigned)rcv, (unsigned)got);
405 			}
406 #  ifdef SO_RCVBUFFORCE
407 		}
408 #  endif
409 #endif /* SO_RCVBUF */
410 	}
411 	/* first do RCVBUF as the receive buffer is more important */
412 	if(snd) {
413 #ifdef SO_SNDBUF
414 		int got;
415 		socklen_t slen = (socklen_t)sizeof(got);
416 #  ifdef SO_SNDBUFFORCE
417 		/* Linux specific: try to use root permission to override
418 		 * system limits on sndbuf. The limit is stored in
419 		 * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
420 		if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
421 			(socklen_t)sizeof(snd)) < 0) {
422 			if(errno != EPERM) {
423 				log_err("setsockopt(..., SO_SNDBUFFORCE, "
424 					"...) failed: %s", sock_strerror(errno));
425 				sock_close(s);
426 				*noproto = 0;
427 				*inuse = 0;
428 				return -1;
429 			}
430 #  endif /* SO_SNDBUFFORCE */
431 			if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
432 				(socklen_t)sizeof(snd)) < 0) {
433 				log_err("setsockopt(..., SO_SNDBUF, "
434 					"...) failed: %s", sock_strerror(errno));
435 				sock_close(s);
436 				*noproto = 0;
437 				*inuse = 0;
438 				return -1;
439 			}
440 			/* check if we got the right thing or if system
441 			 * reduced to some system max.  Warn if so */
442 			if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got,
443 				&slen) >= 0 && got < snd/2) {
444 				log_warn("so-sndbuf %u was not granted. "
445 					"Got %u. To fix: start with "
446 					"root permissions(linux) or sysctl "
447 					"bigger net.core.wmem_max(linux) or "
448 					"kern.ipc.maxsockbuf(bsd) values.",
449 					(unsigned)snd, (unsigned)got);
450 			}
451 #  ifdef SO_SNDBUFFORCE
452 		}
453 #  endif
454 #endif /* SO_SNDBUF */
455 	}
456 	err = set_ip_dscp(s, family, dscp);
457 	if(err != NULL)
458 		log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
459 	if(family == AF_INET6) {
460 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
461 		int omit6_set = 0;
462 		int action;
463 # endif
464 # if defined(IPV6_V6ONLY)
465 		if(v6only
466 #   ifdef HAVE_SYSTEMD
467 			/* Systemd wants to control if the socket is v6 only
468 			 * or both, with BindIPv6Only=default, ipv6-only or
469 			 * both in systemd.socket, so it is not set here. */
470 			&& !got_fd_from_systemd
471 #   endif
472 			) {
473 			int val=(v6only==2)?0:1;
474 			if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
475 				(void*)&val, (socklen_t)sizeof(val)) < 0) {
476 				log_err("setsockopt(..., IPV6_V6ONLY"
477 					", ...) failed: %s", sock_strerror(errno));
478 				sock_close(s);
479 				*noproto = 0;
480 				*inuse = 0;
481 				return -1;
482 			}
483 		}
484 # endif
485 # if defined(IPV6_USE_MIN_MTU)
486 		/*
487 		 * There is no fragmentation of IPv6 datagrams
488 		 * during forwarding in the network. Therefore
489 		 * we do not send UDP datagrams larger than
490 		 * the minimum IPv6 MTU of 1280 octets. The
491 		 * EDNS0 message length can be larger if the
492 		 * network stack supports IPV6_USE_MIN_MTU.
493 		 */
494 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
495 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
496 			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
497 				"...) failed: %s", sock_strerror(errno));
498 			sock_close(s);
499 			*noproto = 0;
500 			*inuse = 0;
501 			return -1;
502 		}
503 # elif defined(IPV6_MTU)
504 #   ifndef USE_WINSOCK
505 		/*
506 		 * On Linux, to send no larger than 1280, the PMTUD is
507 		 * disabled by default for datagrams anyway, so we set
508 		 * the MTU to use.
509 		 */
510 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
511 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
512 			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
513 				sock_strerror(errno));
514 			sock_close(s);
515 			*noproto = 0;
516 			*inuse = 0;
517 			return -1;
518 		}
519 #   elif defined(IPV6_USER_MTU)
520 		/* As later versions of the mingw crosscompiler define
521 		 * IPV6_MTU, do the same for windows but use IPV6_USER_MTU
522 		 * instead which is writable; IPV6_MTU is readonly there. */
523 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USER_MTU,
524 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
525 			if (WSAGetLastError() != WSAENOPROTOOPT) {
526 				log_err("setsockopt(..., IPV6_USER_MTU, ...) failed: %s",
527 					wsa_strerror(WSAGetLastError()));
528 				sock_close(s);
529 				*noproto = 0;
530 				*inuse = 0;
531 				return -1;
532 			}
533 		}
534 #   endif /* USE_WINSOCK */
535 # endif /* IPv6 MTU */
536 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
537 #  if defined(IP_PMTUDISC_OMIT)
538 		action = IP_PMTUDISC_OMIT;
539 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
540 			&action, (socklen_t)sizeof(action)) < 0) {
541 
542 			if (errno != EINVAL) {
543 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
544 					strerror(errno));
545 				sock_close(s);
546 				*noproto = 0;
547 				*inuse = 0;
548 				return -1;
549 			}
550 		}
551 		else
552 		{
553 		    omit6_set = 1;
554 		}
555 #  endif
556 		if (omit6_set == 0) {
557 			action = IP_PMTUDISC_DONT;
558 			if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
559 				&action, (socklen_t)sizeof(action)) < 0) {
560 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
561 					strerror(errno));
562 				sock_close(s);
563 				*noproto = 0;
564 				*inuse = 0;
565 				return -1;
566 			}
567 		}
568 # endif /* IPV6_MTU_DISCOVER */
569 	} else if(family == AF_INET) {
570 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
571 /* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
572  * PMTU information is not accepted, but fragmentation is allowed
573  * if and only if the packet size exceeds the outgoing interface MTU
574  * (and also uses the interface mtu to determine the size of the packets).
575  * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
576  * FreeBSD already has same semantics without setting the option. */
577 		int omit_set = 0;
578 		int action;
579 #   if defined(IP_PMTUDISC_OMIT)
580 		action = IP_PMTUDISC_OMIT;
581 		if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
582 			&action, (socklen_t)sizeof(action)) < 0) {
583 
584 			if (errno != EINVAL) {
585 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
586 					strerror(errno));
587 				sock_close(s);
588 				*noproto = 0;
589 				*inuse = 0;
590 				return -1;
591 			}
592 		}
593 		else
594 		{
595 		    omit_set = 1;
596 		}
597 #   endif
598 		if (omit_set == 0) {
599    			action = IP_PMTUDISC_DONT;
600 			if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
601 				&action, (socklen_t)sizeof(action)) < 0) {
602 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
603 					strerror(errno));
604 				sock_close(s);
605 				*noproto = 0;
606 				*inuse = 0;
607 				return -1;
608 			}
609 		}
610 #  elif defined(IP_DONTFRAG) && !defined(__APPLE__)
611 		/* the IP_DONTFRAG option if defined in the 11.0 OSX headers,
612 		 * but does not work on that version, so we exclude it */
613 		int off = 0;
614 		if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG,
615 			&off, (socklen_t)sizeof(off)) < 0) {
616 			log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
617 				strerror(errno));
618 			sock_close(s);
619 			*noproto = 0;
620 			*inuse = 0;
621 			return -1;
622 		}
623 #  endif /* IPv4 MTU */
624 	}
625 	if(
626 #ifdef HAVE_SYSTEMD
627 		!got_fd_from_systemd &&
628 #endif
629 		bind(s, (struct sockaddr*)addr, addrlen) != 0) {
630 		*noproto = 0;
631 		*inuse = 0;
632 #ifndef USE_WINSOCK
633 #ifdef EADDRINUSE
634 		*inuse = (errno == EADDRINUSE);
635 		/* detect freebsd jail with no ipv6 permission */
636 		if(family==AF_INET6 && errno==EINVAL)
637 			*noproto = 1;
638 		else if(errno != EADDRINUSE &&
639 			!(errno == EACCES && verbosity < 4 && !listen)
640 #ifdef EADDRNOTAVAIL
641 			&& !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
642 #endif
643 			) {
644 			log_err_addr("can't bind socket", strerror(errno),
645 				(struct sockaddr_storage*)addr, addrlen);
646 		}
647 #endif /* EADDRINUSE */
648 #else /* USE_WINSOCK */
649 		if(WSAGetLastError() != WSAEADDRINUSE &&
650 			WSAGetLastError() != WSAEADDRNOTAVAIL &&
651 			!(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
652 			log_err_addr("can't bind socket",
653 				wsa_strerror(WSAGetLastError()),
654 				(struct sockaddr_storage*)addr, addrlen);
655 		}
656 #endif /* USE_WINSOCK */
657 		sock_close(s);
658 		return -1;
659 	}
660 	if(!fd_set_nonblock(s)) {
661 		*noproto = 0;
662 		*inuse = 0;
663 		sock_close(s);
664 		return -1;
665 	}
666 	return s;
667 }
668 
669 int
670 create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
671 	int* reuseport, int transparent, int mss, int nodelay, int freebind,
672 	int use_systemd, int dscp)
673 {
674 	int s;
675 	char* err;
676 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined(SO_BINDANY)
677 	int on = 1;
678 #endif
679 #ifdef HAVE_SYSTEMD
680 	int got_fd_from_systemd = 0;
681 #endif
682 #ifdef USE_TCP_FASTOPEN
683 	int qlen;
684 #endif
685 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
686 	(void)transparent;
687 #endif
688 #if !defined(IP_FREEBIND)
689 	(void)freebind;
690 #endif
691 	verbose_print_addr(addr);
692 	*noproto = 0;
693 #ifdef HAVE_SYSTEMD
694 	if (!use_systemd ||
695 	    (use_systemd
696 	     && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
697 					   addr->ai_addr, addr->ai_addrlen,
698 					   NULL)) == -1)) {
699 #else
700 	(void)use_systemd;
701 #endif
702 	if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
703 #ifndef USE_WINSOCK
704 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
705 			*noproto = 1;
706 			return -1;
707 		}
708 #else
709 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
710 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
711 			*noproto = 1;
712 			return -1;
713 		}
714 #endif
715 		log_err("can't create socket: %s", sock_strerror(errno));
716 		return -1;
717 	}
718 	if(nodelay) {
719 #if defined(IPPROTO_TCP) && defined(TCP_NODELAY)
720 		if(setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void*)&on,
721 			(socklen_t)sizeof(on)) < 0) {
722 			#ifndef USE_WINSOCK
723 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
724 				strerror(errno));
725 			#else
726 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
727 				wsa_strerror(WSAGetLastError()));
728 			#endif
729 		}
730 #else
731 		log_warn(" setsockopt(TCP_NODELAY) unsupported");
732 #endif /* defined(IPPROTO_TCP) && defined(TCP_NODELAY) */
733 	}
734 	if (mss > 0) {
735 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
736 		if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
737 			(socklen_t)sizeof(mss)) < 0) {
738 			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
739 				sock_strerror(errno));
740 		} else {
741 			verbose(VERB_ALGO,
742 				" tcp socket mss set to %d", mss);
743 		}
744 #else
745 		log_warn(" setsockopt(TCP_MAXSEG) unsupported");
746 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
747 	}
748 #ifdef HAVE_SYSTEMD
749 	} else {
750 		got_fd_from_systemd = 1;
751     }
752 #endif
753 #ifdef SO_REUSEADDR
754 	if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
755 		(socklen_t)sizeof(on)) < 0) {
756 		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
757 			sock_strerror(errno));
758 		sock_close(s);
759 		return -1;
760 	}
761 #endif /* SO_REUSEADDR */
762 #ifdef IP_FREEBIND
763 	if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
764 	    (socklen_t)sizeof(on)) < 0) {
765 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
766 		strerror(errno));
767 	}
768 #endif /* IP_FREEBIND */
769 #ifdef SO_REUSEPORT
770 	/* try to set SO_REUSEPORT so that incoming
771 	 * connections are distributed evenly among the receiving threads.
772 	 * Each thread must have its own socket bound to the same port,
773 	 * with SO_REUSEPORT set on each socket.
774 	 */
775 	if (reuseport && *reuseport &&
776 		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
777 		(socklen_t)sizeof(on)) < 0) {
778 #ifdef ENOPROTOOPT
779 		if(errno != ENOPROTOOPT || verbosity >= 3)
780 			log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
781 				strerror(errno));
782 #endif
783 		/* this option is not essential, we can continue */
784 		*reuseport = 0;
785 	}
786 #else
787 	(void)reuseport;
788 #endif /* defined(SO_REUSEPORT) */
789 #if defined(IPV6_V6ONLY)
790 	if(addr->ai_family == AF_INET6 && v6only
791 #  ifdef HAVE_SYSTEMD
792 		/* Systemd wants to control if the socket is v6 only
793 		 * or both, with BindIPv6Only=default, ipv6-only or
794 		 * both in systemd.socket, so it is not set here. */
795 		&& !got_fd_from_systemd
796 #  endif
797 		) {
798 		if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
799 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
800 			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
801 				sock_strerror(errno));
802 			sock_close(s);
803 			return -1;
804 		}
805 	}
806 #else
807 	(void)v6only;
808 #endif /* IPV6_V6ONLY */
809 #ifdef IP_TRANSPARENT
810 	if (transparent &&
811 	    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
812 	    (socklen_t)sizeof(on)) < 0) {
813 		log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
814 			strerror(errno));
815 	}
816 #elif defined(IP_BINDANY)
817 	if (transparent &&
818 	    setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
819 	    (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
820 	    (void*)&on, (socklen_t)sizeof(on)) < 0) {
821 		log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
822 		(addr->ai_family==AF_INET6?"V6":""), strerror(errno));
823 	}
824 #elif defined(SO_BINDANY)
825 	if (transparent &&
826 	    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
827 	    sizeof(on)) < 0) {
828 		log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
829 		strerror(errno));
830 	}
831 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
832 	err = set_ip_dscp(s, addr->ai_family, dscp);
833 	if(err != NULL)
834 		log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
835 	if(
836 #ifdef HAVE_SYSTEMD
837 		!got_fd_from_systemd &&
838 #endif
839         bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
840 #ifndef USE_WINSOCK
841 		/* detect freebsd jail with no ipv6 permission */
842 		if(addr->ai_family==AF_INET6 && errno==EINVAL)
843 			*noproto = 1;
844 		else {
845 			log_err_addr("can't bind socket", strerror(errno),
846 				(struct sockaddr_storage*)addr->ai_addr,
847 				addr->ai_addrlen);
848 		}
849 #else
850 		log_err_addr("can't bind socket",
851 			wsa_strerror(WSAGetLastError()),
852 			(struct sockaddr_storage*)addr->ai_addr,
853 			addr->ai_addrlen);
854 #endif
855 		sock_close(s);
856 		return -1;
857 	}
858 	if(!fd_set_nonblock(s)) {
859 		sock_close(s);
860 		return -1;
861 	}
862 	if(listen(s, TCP_BACKLOG) == -1) {
863 		log_err("can't listen: %s", sock_strerror(errno));
864 		sock_close(s);
865 		return -1;
866 	}
867 #ifdef USE_TCP_FASTOPEN
868 	/* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
869 	   against IP spoofing attacks as suggested in RFC7413 */
870 #ifdef __APPLE__
871 	/* OS X implementation only supports qlen of 1 via this call. Actual
872 	   value is configured by the net.inet.tcp.fastopen_backlog kernel parm. */
873 	qlen = 1;
874 #else
875 	/* 5 is recommended on linux */
876 	qlen = 5;
877 #endif
878 	if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen,
879 		  sizeof(qlen))) == -1 ) {
880 #ifdef ENOPROTOOPT
881 		/* squelch ENOPROTOOPT: freebsd server mode with kernel support
882 		   disabled, except when verbosity enabled for debugging */
883 		if(errno != ENOPROTOOPT || verbosity >= 3) {
884 #endif
885 		  if(errno == EPERM) {
886 		  	log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
887 		  } else {
888 		  	log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
889 		  }
890 #ifdef ENOPROTOOPT
891 		}
892 #endif
893 	}
894 #endif
895 	return s;
896 }
897 
898 char*
899 set_ip_dscp(int socket, int addrfamily, int dscp)
900 {
901 	int ds;
902 
903 	if(dscp == 0)
904 		return NULL;
905 	ds = dscp << 2;
906 	switch(addrfamily) {
907 	case AF_INET6:
908 	#ifdef IPV6_TCLASS
909 		if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS, (void*)&ds,
910 			sizeof(ds)) < 0)
911 			return sock_strerror(errno);
912 		break;
913 	#else
914 		return "IPV6_TCLASS not defined on this system";
915 	#endif
916 	default:
917 		if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&ds, sizeof(ds)) < 0)
918 			return sock_strerror(errno);
919 		break;
920 	}
921 	return NULL;
922 }
923 
/**
 * Create a nonblocking, listening local (unix domain) stream socket.
 * With systemd support compiled in and use_systemd set, a descriptor from
 * systemd_get_activated() is returned if there is one. Otherwise an
 * existing file at path is removed and a new socket is bound there.
 * @param path: filesystem path of the socket. A path that does not fit in
 *	sockaddr_un.sun_path is rejected instead of being truncated.
 * @param noproto: set to 1 if local sockets are not supported in this
 *	build; not modified otherwise.
 * @param use_systemd: if nonzero, try to get the socket from systemd.
 * @return the socket descriptor, or -1 on failure.
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#endif
#ifdef HAVE_SYSTEMD
	int ret;

	if (use_systemd && (ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM, 1, NULL, 0, path)) != -1)
		return ret;
#else
	(void)use_systemd;
#endif
#ifdef HAVE_SYS_UN_H
	(void)noproto; /*unused*/

	verbose(VERB_ALGO, "creating unix socket %s", path);
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD */
	if (strlcpy(usock.sun_path, path, sizeof(usock.sun_path))
		>= sizeof(usock.sun_path)) {
		/* a truncated name would make bind() use another path than
		 * the one that is unlinked below */
		log_err("Cannot create local socket %s (path too long)",
			path);
		return -1;
	}

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	return s;

err:
	sock_close(s);
	return -1;
#else
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif
}
998 
999 
/**
 * Look up ifname and port with getaddrinfo and create a socket for the
 * first result: a UDP socket when stype is SOCK_DGRAM, otherwise a TCP
 * accept socket.
 * Once the lookup succeeded, the getaddrinfo result is stored in
 * ub_sock->addr even if creating the socket failed, so the caller has to
 * release it.
 * @param stype: socket type, also written into hints->ai_socktype.
 * @param ifname: address to look up (NULL is logged as "default").
 * @param port: port number as string.
 * @param hints: getaddrinfo hints; ai_family is read to detect IPv6.
 * @param v6only, rcv, snd, reuseport, transparent, tcp_mss, nodelay,
 *	freebind, use_systemd, dscp: passed on to the socket create routines.
 * @param noip6: set to 1 if the failure was a missing IPv6 protocol,
 *	0 otherwise.
 * @param ub_sock: filled with the address, fd and address family.
 * @return the socket fd or -1 on failure.
 */
static int
make_sock(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock)
{
	struct addrinfo *ai = NULL;
	int gai, fd, inuse, noproto;

	hints->ai_socktype = stype;
	*noip6 = 0;

	gai = getaddrinfo(ifname, port, hints, &ai);
	if(gai != 0 || !ai) {
		/* text of the system error, only for EAI_SYSTEM */
		const char* syserr = "";
#ifdef USE_WINSOCK
		if(gai == EAI_NONAME && hints->ai_family == AF_INET6){
			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
			return -1;
		}
#endif
#ifdef EAI_SYSTEM
		if(gai == EAI_SYSTEM)
			syserr = strerror(errno);
#endif
		log_err("node %s:%s getaddrinfo: %s %s",
			ifname?ifname:"default", port, gai_strerror(gai),
			syserr);
		return -1;
	}

	if(stype != SOCK_DGRAM) {
		fd = create_tcp_accept_sock(ai, v6only, &noproto, reuseport,
			transparent, tcp_mss, nodelay, freebind, use_systemd,
			dscp);
		if(fd == -1 && noproto && hints->ai_family == AF_INET6)
			*noip6 = 1;
	} else {
		verbose_print_addr(ai);
		fd = create_udp_sock(ai->ai_family, ai->ai_socktype,
			(struct sockaddr*)ai->ai_addr, ai->ai_addrlen,
			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
			reuseport, transparent, freebind, use_systemd, dscp);
		if(fd == -1) {
			/* inuse and noproto are only looked at on failure */
			if(inuse)
				log_err("bind: address already in use");
			else if(noproto && hints->ai_family == AF_INET6)
				*noip6 = 1;
		}
	}

	/* hand the results over, also when fd is -1 */
	ub_sock->addr = ai;
	ub_sock->s = fd;
	ub_sock->fam = hints->ai_family;
	ub_sock->acl = NULL;

	return fd;
}
1057 
/**
 * Make a socket, after checking ifname for a port override.
 * An ifname of the form "address@port" is split at the first '@'; the
 * part after it replaces the port argument.  Without '@' the arguments
 * are handed to make_sock unchanged.
 * On a too long address or port part an error is logged, *noip6 is set
 * to 0 and -1 is returned.
 */
static int
make_sock_port(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock)
{
	char portstr[16];
	char addrstr[128];
	size_t iflen;
	const char* at = strchr(ifname, '@');

	if(!at) {
		/* no override present */
		return make_sock(stype, ifname, port, hints, v6only, noip6,
			rcv, snd, reuseport, transparent, tcp_mss, nodelay,
			freebind, use_systemd, dscp, ub_sock);
	}

	/* override port with ifspec@port */
	iflen = (size_t)(at - ifname);
	if(iflen >= sizeof(addrstr)) {
		log_err("ifname too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	if(strlen(at+1) >= sizeof(portstr)) {
		log_err("portnumber too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	(void)strlcpy(addrstr, ifname, sizeof(addrstr));
	addrstr[iflen] = 0; /* cut off at the '@' */
	/* the port part fits (checked above), so this copy is complete
	 * and zero terminated */
	(void)strlcpy(portstr, at+1, sizeof(portstr));

	return make_sock(stype, addrstr, portstr, hints, v6only, noip6, rcv,
		snd, reuseport, transparent, tcp_mss, nodelay, freebind,
		use_systemd, dscp, ub_sock);
}
1092 
1093 /**
1094  * Add port to open ports list.
1095  * @param list: list head. changed.
1096  * @param s: fd.
1097  * @param ftype: if fd is UDP.
1098  * @param pp2_enabled: if PROXYv2 is enabled for this port.
1099  * @param ub_sock: socket with address.
1100  * @return false on failure. list in unchanged then.
1101  */
1102 static int
1103 port_insert(struct listen_port** list, int s, enum listen_type ftype,
1104 	int pp2_enabled, struct unbound_socket* ub_sock)
1105 {
1106 	struct listen_port* item = (struct listen_port*)malloc(
1107 		sizeof(struct listen_port));
1108 	if(!item)
1109 		return 0;
1110 	item->next = *list;
1111 	item->fd = s;
1112 	item->ftype = ftype;
1113 	item->pp2_enabled = pp2_enabled;
1114 	item->socket = ub_sock;
1115 	*list = item;
1116 	return 1;
1117 }
1118 
/**
 * Set fd to receive software timestamps.
 * @param s: the socket fd.
 * @return 1 if the socket option was set, 0 if setsockopt failed or if
 *	the platform has no linux/net_tstamp.h (an error is logged).
 */
static int
set_recvtimestamp(int s)
{
#ifdef HAVE_LINUX_NET_TSTAMP_H
	int flags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;

	if(setsockopt(s, SOL_SOCKET, SO_TIMESTAMPNS, (void*)&flags,
		(socklen_t)sizeof(flags)) >= 0)
		return 1;
	log_err("setsockopt(..., SO_TIMESTAMPNS, ...) failed: %s",
		strerror(errno));
	return 0;
#else
	log_err("packets timestamping is not supported on this platform");
	(void)s;
	return 0;
#endif
}
1137 
/**
 * Set fd to receive source address packet info.
 * Which socket option is used depends on what the platform defines:
 * IPV6_RECVPKTINFO or else IPV6_PKTINFO for AF_INET6, and IP_PKTINFO or
 * else IP_RECVDSTADDR for AF_INET.  Other families are left alone.
 * @param s: the socket fd.
 * @param family: address family of the socket.
 * @return 0 if the option could not be set or is not available for the
 *	family (an error is logged), otherwise 1.
 */
static int
set_recvpktinfo(int s, int family)
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int enable = 1;
#else
	(void)s;
#endif
	switch(family) {
	case AF_INET6:
#if defined(IPV6_RECVPKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IPV6_RECVPKTINFO and IPV6_PKTINFO options, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#endif /* IPV6_RECVPKTINFO */

	case AF_INET:
#if defined(IP_PKTINFO)
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&enable, (socklen_t)sizeof(enable)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
		break;
#else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#endif /* IP_PKTINFO */

	default:
		/* nothing to set for other families */
		break;
	}
	return 1;
}
1192 
1193 /** see if interface is ssl, its port number == the ssl port number */
1194 static int
1195 if_is_ssl(const char* ifname, const char* port, int ssl_port,
1196 	struct config_strlist* tls_additional_port)
1197 {
1198 	struct config_strlist* s;
1199 	char* p = strchr(ifname, '@');
1200 	if(!p && atoi(port) == ssl_port)
1201 		return 1;
1202 	if(p && atoi(p+1) == ssl_port)
1203 		return 1;
1204 	for(s = tls_additional_port; s; s = s->next) {
1205 		if(p && atoi(p+1) == atoi(s->str))
1206 			return 1;
1207 		if(!p && atoi(port) == atoi(s->str))
1208 			return 1;
1209 	}
1210 	return 0;
1211 }
1212 
1213 /**
1214  * Helper for ports_open. Creates one interface (or NULL for default).
1215  * @param ifname: The interface ip address.
1216  * @param do_auto: use automatic interface detection.
1217  * 	If enabled, then ifname must be the wildcard name.
1218  * @param do_udp: if udp should be used.
1219  * @param do_tcp: if tcp should be used.
1220  * @param hints: for getaddrinfo. family and flags have to be set by caller.
1221  * @param port: Port number to use (as string).
1222  * @param list: list of open ports, appended to, changed to point to list head.
1223  * @param rcv: receive buffer size for UDP
1224  * @param snd: send buffer size for UDP
1225  * @param ssl_port: ssl service port number
1226  * @param tls_additional_port: list of additional ssl service port numbers.
1227  * @param https_port: DoH service port number
1228  * @param proxy_protocol_port: list of PROXYv2 port numbers.
1229  * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1230  * 	set to false on exit if reuseport failed due to no kernel support.
1231  * @param transparent: set IP_TRANSPARENT socket option.
1232  * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1233  * @param freebind: set IP_FREEBIND socket option.
1234  * @param http2_nodelay: set TCP_NODELAY on HTTP/2 connection
1235  * @param use_systemd: if true, fetch sockets from systemd.
1236  * @param dnscrypt_port: dnscrypt service port number
1237  * @param dscp: DSCP to use.
1238  * @param sock_queue_timeout: the sock_queue_timeout from config. Seconds to
1239  * 	wait to discard if UDP packets have waited for long in the socket
1240  * 	buffer.
1241  * @return: returns false on error.
1242  */
1243 static int
1244 ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp,
1245 	struct addrinfo *hints, const char* port, struct listen_port** list,
1246 	size_t rcv, size_t snd, int ssl_port,
1247 	struct config_strlist* tls_additional_port, int https_port,
1248 	struct config_strlist* proxy_protocol_port,
1249 	int* reuseport, int transparent, int tcp_mss, int freebind,
1250 	int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp,
1251 	int sock_queue_timeout)
1252 {
1253 	int s, noip6=0;
1254 	int is_https = if_is_https(ifname, port, https_port);
1255 	int is_dnscrypt = if_is_dnscrypt(ifname, port, dnscrypt_port);
1256 	int is_pp2 = if_is_pp2(ifname, port, proxy_protocol_port);
1257 	int nodelay = is_https && http2_nodelay;
1258 	struct unbound_socket* ub_sock;
1259 
1260 	if(!do_udp && !do_tcp)
1261 		return 0;
1262 
1263 	if(is_pp2) {
1264 		if(is_dnscrypt) {
1265 			fatal_exit("PROXYv2 and DNSCrypt combination not "
1266 				"supported!");
1267 		} else if(is_https) {
1268 			fatal_exit("PROXYv2 and DoH combination not "
1269 				"supported!");
1270 		}
1271 	}
1272 
1273 	if(do_auto) {
1274 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1275 		if(!ub_sock)
1276 			return 0;
1277 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1278 			&noip6, rcv, snd, reuseport, transparent,
1279 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1280 			if(ub_sock->addr)
1281 				freeaddrinfo(ub_sock->addr);
1282 			free(ub_sock);
1283 			if(noip6) {
1284 				log_warn("IPv6 protocol not available");
1285 				return 1;
1286 			}
1287 			return 0;
1288 		}
1289 		/* getting source addr packet info is highly non-portable */
1290 		if(!set_recvpktinfo(s, hints->ai_family)) {
1291 			sock_close(s);
1292 			if(ub_sock->addr)
1293 				freeaddrinfo(ub_sock->addr);
1294 			free(ub_sock);
1295 			return 0;
1296 		}
1297 		if (sock_queue_timeout && !set_recvtimestamp(s)) {
1298 			log_warn("socket timestamping is not available");
1299 		}
1300 		if(!port_insert(list, s, is_dnscrypt
1301 			?listen_type_udpancil_dnscrypt:listen_type_udpancil,
1302 			is_pp2, ub_sock)) {
1303 			sock_close(s);
1304 			if(ub_sock->addr)
1305 				freeaddrinfo(ub_sock->addr);
1306 			free(ub_sock);
1307 			return 0;
1308 		}
1309 	} else if(do_udp) {
1310 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1311 		if(!ub_sock)
1312 			return 0;
1313 		/* regular udp socket */
1314 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1315 			&noip6, rcv, snd, reuseport, transparent,
1316 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1317 			if(ub_sock->addr)
1318 				freeaddrinfo(ub_sock->addr);
1319 			free(ub_sock);
1320 			if(noip6) {
1321 				log_warn("IPv6 protocol not available");
1322 				return 1;
1323 			}
1324 			return 0;
1325 		}
1326 		if (sock_queue_timeout && !set_recvtimestamp(s)) {
1327 			log_warn("socket timestamping is not available");
1328 		}
1329 		if(!port_insert(list, s, is_dnscrypt
1330 			?listen_type_udp_dnscrypt:listen_type_udp,
1331 			is_pp2, ub_sock)) {
1332 			sock_close(s);
1333 			if(ub_sock->addr)
1334 				freeaddrinfo(ub_sock->addr);
1335 			free(ub_sock);
1336 			return 0;
1337 		}
1338 	}
1339 	if(do_tcp) {
1340 		int is_ssl = if_is_ssl(ifname, port, ssl_port,
1341 			tls_additional_port);
1342 		enum listen_type port_type;
1343 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1344 		if(!ub_sock)
1345 			return 0;
1346 		if(is_ssl)
1347 			port_type = listen_type_ssl;
1348 		else if(is_https)
1349 			port_type = listen_type_http;
1350 		else if(is_dnscrypt)
1351 			port_type = listen_type_tcp_dnscrypt;
1352 		else
1353 			port_type = listen_type_tcp;
1354 		if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1,
1355 			&noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay,
1356 			freebind, use_systemd, dscp, ub_sock)) == -1) {
1357 			if(ub_sock->addr)
1358 				freeaddrinfo(ub_sock->addr);
1359 			free(ub_sock);
1360 			if(noip6) {
1361 				/*log_warn("IPv6 protocol not available");*/
1362 				return 1;
1363 			}
1364 			return 0;
1365 		}
1366 		if(is_ssl)
1367 			verbose(VERB_ALGO, "setup TCP for SSL service");
1368 		if(!port_insert(list, s, port_type, is_pp2, ub_sock)) {
1369 			sock_close(s);
1370 			if(ub_sock->addr)
1371 				freeaddrinfo(ub_sock->addr);
1372 			free(ub_sock);
1373 			return 0;
1374 		}
1375 	}
1376 	return 1;
1377 }
1378 
1379 /**
1380  * Add items to commpoint list in front.
1381  * @param c: commpoint to add.
1382  * @param front: listen struct.
1383  * @return: false on failure.
1384  */
1385 static int
1386 listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1387 {
1388 	struct listen_list* item = (struct listen_list*)malloc(
1389 		sizeof(struct listen_list));
1390 	if(!item)
1391 		return 0;
1392 	item->com = c;
1393 	item->next = front->cps;
1394 	front->cps = item;
1395 	return 1;
1396 }
1397 
1398 void listen_setup_locks(void)
1399 {
1400 	if(!stream_wait_lock_inited) {
1401 		lock_basic_init(&stream_wait_count_lock);
1402 		stream_wait_lock_inited = 1;
1403 	}
1404 	if(!http2_query_buffer_lock_inited) {
1405 		lock_basic_init(&http2_query_buffer_count_lock);
1406 		http2_query_buffer_lock_inited = 1;
1407 	}
1408 	if(!http2_response_buffer_lock_inited) {
1409 		lock_basic_init(&http2_response_buffer_count_lock);
1410 		http2_response_buffer_lock_inited = 1;
1411 	}
1412 }
1413 
1414 void listen_desetup_locks(void)
1415 {
1416 	if(stream_wait_lock_inited) {
1417 		stream_wait_lock_inited = 0;
1418 		lock_basic_destroy(&stream_wait_count_lock);
1419 	}
1420 	if(http2_query_buffer_lock_inited) {
1421 		http2_query_buffer_lock_inited = 0;
1422 		lock_basic_destroy(&http2_query_buffer_count_lock);
1423 	}
1424 	if(http2_response_buffer_lock_inited) {
1425 		http2_response_buffer_lock_inited = 0;
1426 		lock_basic_destroy(&http2_response_buffer_count_lock);
1427 	}
1428 }
1429 
1430 struct listen_dnsport*
1431 listen_create(struct comm_base* base, struct listen_port* ports,
1432 	size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
1433 	int harden_large_queries, uint32_t http_max_streams,
1434 	char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit,
1435 	void* sslctx, struct dt_env* dtenv, comm_point_callback_type* cb,
1436 	void *cb_arg)
1437 {
1438 	struct listen_dnsport* front = (struct listen_dnsport*)
1439 		malloc(sizeof(struct listen_dnsport));
1440 	if(!front)
1441 		return NULL;
1442 	front->cps = NULL;
1443 	front->udp_buff = sldns_buffer_new(bufsize);
1444 #ifdef USE_DNSCRYPT
1445 	front->dnscrypt_udp_buff = NULL;
1446 #endif
1447 	if(!front->udp_buff) {
1448 		free(front);
1449 		return NULL;
1450 	}
1451 
1452 	/* create comm points as needed */
1453 	while(ports) {
1454 		struct comm_point* cp = NULL;
1455 		if(ports->ftype == listen_type_udp ||
1456 		   ports->ftype == listen_type_udp_dnscrypt) {
1457 			cp = comm_point_create_udp(base, ports->fd,
1458 				front->udp_buff, ports->pp2_enabled, cb,
1459 				cb_arg, ports->socket);
1460 		} else if(ports->ftype == listen_type_tcp ||
1461 				ports->ftype == listen_type_tcp_dnscrypt) {
1462 			cp = comm_point_create_tcp(base, ports->fd,
1463 				tcp_accept_count, tcp_idle_timeout,
1464 				harden_large_queries, 0, NULL,
1465 				tcp_conn_limit, bufsize, front->udp_buff,
1466 				ports->ftype, ports->pp2_enabled, cb, cb_arg,
1467 				ports->socket);
1468 		} else if(ports->ftype == listen_type_ssl ||
1469 			ports->ftype == listen_type_http) {
1470 			cp = comm_point_create_tcp(base, ports->fd,
1471 				tcp_accept_count, tcp_idle_timeout,
1472 				harden_large_queries,
1473 				http_max_streams, http_endpoint,
1474 				tcp_conn_limit, bufsize, front->udp_buff,
1475 				ports->ftype, ports->pp2_enabled, cb, cb_arg,
1476 				ports->socket);
1477 			if(ports->ftype == listen_type_http) {
1478 				if(!sslctx && !http_notls) {
1479 					log_warn("HTTPS port configured, but "
1480 						"no TLS tls-service-key or "
1481 						"tls-service-pem set");
1482 				}
1483 #ifndef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
1484 				if(!http_notls) {
1485 					log_warn("Unbound is not compiled "
1486 						"with an OpenSSL version "
1487 						"supporting ALPN "
1488 						"(OpenSSL >= 1.0.2). This "
1489 						"is required to use "
1490 						"DNS-over-HTTPS");
1491 				}
1492 #endif
1493 #ifndef HAVE_NGHTTP2_NGHTTP2_H
1494 				log_warn("Unbound is not compiled with "
1495 					"nghttp2. This is required to use "
1496 					"DNS-over-HTTPS.");
1497 #endif
1498 			}
1499 		} else if(ports->ftype == listen_type_udpancil ||
1500 				  ports->ftype == listen_type_udpancil_dnscrypt) {
1501 			cp = comm_point_create_udp_ancil(base, ports->fd,
1502 				front->udp_buff, ports->pp2_enabled, cb,
1503 				cb_arg, ports->socket);
1504 		}
1505 		if(!cp) {
1506 			log_err("can't create commpoint");
1507 			listen_delete(front);
1508 			return NULL;
1509 		}
1510 		if((http_notls && ports->ftype == listen_type_http) ||
1511 			(ports->ftype == listen_type_tcp) ||
1512 			(ports->ftype == listen_type_udp) ||
1513 			(ports->ftype == listen_type_udpancil) ||
1514 			(ports->ftype == listen_type_tcp_dnscrypt) ||
1515 			(ports->ftype == listen_type_udp_dnscrypt) ||
1516 			(ports->ftype == listen_type_udpancil_dnscrypt))
1517 			cp->ssl = NULL;
1518 		else
1519 			cp->ssl = sslctx;
1520 		cp->dtenv = dtenv;
1521 		cp->do_not_close = 1;
1522 #ifdef USE_DNSCRYPT
1523 		if (ports->ftype == listen_type_udp_dnscrypt ||
1524 			ports->ftype == listen_type_tcp_dnscrypt ||
1525 			ports->ftype == listen_type_udpancil_dnscrypt) {
1526 			cp->dnscrypt = 1;
1527 			cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
1528 			if(!cp->dnscrypt_buffer) {
1529 				log_err("can't alloc dnscrypt_buffer");
1530 				comm_point_delete(cp);
1531 				listen_delete(front);
1532 				return NULL;
1533 			}
1534 			front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
1535 		}
1536 #endif
1537 		if(!listen_cp_insert(cp, front)) {
1538 			log_err("malloc failed");
1539 			comm_point_delete(cp);
1540 			listen_delete(front);
1541 			return NULL;
1542 		}
1543 		ports = ports->next;
1544 	}
1545 	if(!front->cps) {
1546 		log_err("Could not open sockets to accept queries.");
1547 		listen_delete(front);
1548 		return NULL;
1549 	}
1550 
1551 	return front;
1552 }
1553 
1554 void
1555 listen_list_delete(struct listen_list* list)
1556 {
1557 	struct listen_list *p = list, *pn;
1558 	while(p) {
1559 		pn = p->next;
1560 		comm_point_delete(p->com);
1561 		free(p);
1562 		p = pn;
1563 	}
1564 }
1565 
1566 void
1567 listen_delete(struct listen_dnsport* front)
1568 {
1569 	if(!front)
1570 		return;
1571 	listen_list_delete(front->cps);
1572 #ifdef USE_DNSCRYPT
1573 	if(front->dnscrypt_udp_buff &&
1574 		front->udp_buff != front->dnscrypt_udp_buff) {
1575 		sldns_buffer_free(front->dnscrypt_udp_buff);
1576 	}
1577 #endif
1578 	sldns_buffer_free(front->udp_buff);
1579 	free(front);
1580 }
1581 
#ifdef HAVE_GETIFADDRS
/**
 * Append a copy of str to a malloced array of malloced strings.
 * The size is only incremented when both the realloc and the strdup
 * succeeded.
 * @return 0 on out of memory (logged), 1 on success.
 */
static int
ip_addresses_append(char ***ip_addresses, int *ip_addresses_size,
	const char *str)
{
	char **grown = realloc(*ip_addresses,
		sizeof(char *) * (*ip_addresses_size + 1));
	if(!grown) {
		log_err("realloc failed: out of memory");
		return 0;
	}
	*ip_addresses = grown;
	grown[*ip_addresses_size] = strdup(str);
	if(!grown[*ip_addresses_size]) {
		log_err("strdup failed: out of memory");
		return 0;
	}
	(*ip_addresses_size)++;
	return 1;
}

/**
 * Resolve an interface name to the ip addresses on that interface.
 * search_ifa may be "name" or "name@port"; the part before the last '@'
 * is compared with the interface names in ifas.  For every IPv4 address
 * (and IPv6 address, if INET6 is defined) of a matching interface, the
 * address as text, followed by the "@port" part if there was one, is
 * appended to ip_addresses.  If nothing was appended, search_ifa itself
 * is appended unchanged.
 * @return 0 on failure (inet_ntop or out of memory, logged), else 1.
 */
static int
resolve_ifa_name(struct ifaddrs *ifas, const char *search_ifa, char ***ip_addresses, int *ip_addresses_size)
{
	struct ifaddrs *ifa;
	int size_at_start = *ip_addresses_size;
	/* the name and port parts do not change while looping */
	const char* at = strrchr(search_ifa, '@');
	const char* port_suffix = at ? at : "";
	size_t namelen = at ? (size_t)(at - search_ifa) : strlen(search_ifa);

	for(ifa = ifas; ifa != NULL; ifa = ifa->ifa_next) {
		sa_family_t family;
#ifdef INET6      /* |   address ip    | % |  ifa name  | @ |  port  | nul */
		char addr_buf[INET6_ADDRSTRLEN + 1 + IF_NAMESIZE + 1 + 16 + 1];
#else
		char addr_buf[INET_ADDRSTRLEN + 1 + 16 + 1];
#endif

		/* the interface name has to match exactly */
		if(strlen(ifa->ifa_name) != namelen ||
			strncmp(ifa->ifa_name, search_ifa, namelen) != 0)
			continue;
		if(ifa->ifa_addr == NULL)
			continue;

		family = ifa->ifa_addr->sa_family;
		if(family == AF_INET) {
			char a4[INET_ADDRSTRLEN + 1];
			struct sockaddr_in *in4 = (struct sockaddr_in *)
				ifa->ifa_addr;
			if(!inet_ntop(family, &in4->sin_addr, a4, sizeof(a4))) {
				log_err("inet_ntop failed");
				return 0;
			}
			snprintf(addr_buf, sizeof(addr_buf), "%s%s",
				a4, port_suffix);
		}
#ifdef INET6
		else if(family == AF_INET6) {
			struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)
				ifa->ifa_addr;
			char a6[INET6_ADDRSTRLEN + 1];
			char scope_name[IF_NAMESIZE + 1];
			scope_name[0] = 0;
			if(!inet_ntop(family, &in6->sin6_addr, a6, sizeof(a6))) {
				log_err("inet_ntop failed");
				return 0;
			}
			/* scope_name is still empty afterwards if the call
			 * did not write a name */
			(void)if_indextoname(in6->sin6_scope_id,
				(char *)scope_name);
			if(scope_name[0] != 0) {
				snprintf(addr_buf, sizeof(addr_buf),
					"%s%%%s%s", a6, scope_name,
					port_suffix);
			} else {
				snprintf(addr_buf, sizeof(addr_buf), "%s%s",
					a6, port_suffix);
			}
		}
#endif
		else {
			continue;
		}
		verbose(4, "interface %s has address %s", search_ifa, addr_buf);

		if(!ip_addresses_append(ip_addresses, ip_addresses_size,
			addr_buf))
			return 0;
	}

	/* nothing found for the name: keep the string as it was given */
	if(*ip_addresses_size == size_at_start) {
		if(!ip_addresses_append(ip_addresses, ip_addresses_size,
			search_ifa))
			return 0;
	}
	return 1;
}
#endif /* HAVE_GETIFADDRS */
1685 
1686 int resolve_interface_names(char** ifs, int num_ifs,
1687 	struct config_strlist* list, char*** resif, int* num_resif)
1688 {
1689 #ifdef HAVE_GETIFADDRS
1690 	struct ifaddrs *addrs = NULL;
1691 	if(num_ifs == 0 && list == NULL) {
1692 		*resif = NULL;
1693 		*num_resif = 0;
1694 		return 1;
1695 	}
1696 	if(getifaddrs(&addrs) == -1) {
1697 		log_err("failed to list interfaces: getifaddrs: %s",
1698 			strerror(errno));
1699 		freeifaddrs(addrs);
1700 		return 0;
1701 	}
1702 	if(ifs) {
1703 		int i;
1704 		for(i=0; i<num_ifs; i++) {
1705 			if(!resolve_ifa_name(addrs, ifs[i], resif, num_resif)) {
1706 				freeifaddrs(addrs);
1707 				config_del_strarray(*resif, *num_resif);
1708 				*resif = NULL;
1709 				*num_resif = 0;
1710 				return 0;
1711 			}
1712 		}
1713 	}
1714 	if(list) {
1715 		struct config_strlist* p;
1716 		for(p = list; p; p = p->next) {
1717 			if(!resolve_ifa_name(addrs, p->str, resif, num_resif)) {
1718 				freeifaddrs(addrs);
1719 				config_del_strarray(*resif, *num_resif);
1720 				*resif = NULL;
1721 				*num_resif = 0;
1722 				return 0;
1723 			}
1724 }
1725 	}
1726 	freeifaddrs(addrs);
1727 	return 1;
1728 #else
1729 	struct config_strlist* p;
1730 	if(num_ifs == 0 && list == NULL) {
1731 		*resif = NULL;
1732 		*num_resif = 0;
1733 		return 1;
1734 	}
1735 	*num_resif = num_ifs;
1736 	for(p = list; p; p = p->next) {
1737 		(*num_resif)++;
1738 	}
1739 	*resif = calloc(*num_resif, sizeof(**resif));
1740 	if(!*resif) {
1741 		log_err("out of memory");
1742 		return 0;
1743 	}
1744 	if(ifs) {
1745 		int i;
1746 		for(i=0; i<num_ifs; i++) {
1747 			(*resif)[i] = strdup(ifs[i]);
1748 			if(!((*resif)[i])) {
1749 				log_err("out of memory");
1750 				config_del_strarray(*resif, *num_resif);
1751 				*resif = NULL;
1752 				*num_resif = 0;
1753 				return 0;
1754 			}
1755 		}
1756 	}
1757 	if(list) {
1758 		int idx = num_ifs;
1759 		for(p = list; p; p = p->next) {
1760 			(*resif)[idx] = strdup(p->str);
1761 			if(!((*resif)[idx])) {
1762 				log_err("out of memory");
1763 				config_del_strarray(*resif, *num_resif);
1764 				*resif = NULL;
1765 				*num_resif = 0;
1766 				return 0;
1767 			}
1768 			idx++;
1769 		}
1770 	}
1771 	return 1;
1772 #endif /* HAVE_GETIFADDRS */
1773 }
1774 
1775 struct listen_port*
1776 listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs,
1777 	int* reuseport)
1778 {
1779 	struct listen_port* list = NULL;
1780 	struct addrinfo hints;
1781 	int i, do_ip4, do_ip6;
1782 	int do_tcp, do_auto;
1783 	char portbuf[32];
1784 	snprintf(portbuf, sizeof(portbuf), "%d", cfg->port);
1785 	do_ip4 = cfg->do_ip4;
1786 	do_ip6 = cfg->do_ip6;
1787 	do_tcp = cfg->do_tcp;
1788 	do_auto = cfg->if_automatic && cfg->do_udp;
1789 	if(cfg->incoming_num_tcp == 0)
1790 		do_tcp = 0;
1791 
1792 	/* getaddrinfo */
1793 	memset(&hints, 0, sizeof(hints));
1794 	hints.ai_flags = AI_PASSIVE;
1795 	/* no name lookups on our listening ports */
1796 	if(num_ifs > 0)
1797 		hints.ai_flags |= AI_NUMERICHOST;
1798 	hints.ai_family = AF_UNSPEC;
1799 #ifndef INET6
1800 	do_ip6 = 0;
1801 #endif
1802 	if(!do_ip4 && !do_ip6) {
1803 		return NULL;
1804 	}
1805 	/* create ip4 and ip6 ports so that return addresses are nice. */
1806 	if(do_auto || num_ifs == 0) {
1807 		if(do_auto && cfg->if_automatic_ports &&
1808 			cfg->if_automatic_ports[0]!=0) {
1809 			char* now = cfg->if_automatic_ports;
1810 			while(now && *now) {
1811 				char* after;
1812 				int extraport;
1813 				while(isspace((unsigned char)*now))
1814 					now++;
1815 				if(!*now)
1816 					break;
1817 				after = now;
1818 				extraport = (int)strtol(now, &after, 10);
1819 				if(extraport < 0 || extraport > 65535) {
1820 					log_err("interface-automatic-ports port number out of range, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1821 					listening_ports_free(list);
1822 					return NULL;
1823 				}
1824 				if(extraport == 0 && now == after) {
1825 					log_err("interface-automatic-ports could not be parsed, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1826 					listening_ports_free(list);
1827 					return NULL;
1828 				}
1829 				now = after;
1830 				snprintf(portbuf, sizeof(portbuf), "%d", extraport);
1831 				if(do_ip6) {
1832 					hints.ai_family = AF_INET6;
1833 					if(!ports_create_if("::0",
1834 						do_auto, cfg->do_udp, do_tcp,
1835 						&hints, portbuf, &list,
1836 						cfg->so_rcvbuf, cfg->so_sndbuf,
1837 						cfg->ssl_port, cfg->tls_additional_port,
1838 						cfg->https_port,
1839 						cfg->proxy_protocol_port,
1840 						reuseport, cfg->ip_transparent,
1841 						cfg->tcp_mss, cfg->ip_freebind,
1842 						cfg->http_nodelay, cfg->use_systemd,
1843 						cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1844 						listening_ports_free(list);
1845 						return NULL;
1846 					}
1847 				}
1848 				if(do_ip4) {
1849 					hints.ai_family = AF_INET;
1850 					if(!ports_create_if("0.0.0.0",
1851 						do_auto, cfg->do_udp, do_tcp,
1852 						&hints, portbuf, &list,
1853 						cfg->so_rcvbuf, cfg->so_sndbuf,
1854 						cfg->ssl_port, cfg->tls_additional_port,
1855 						cfg->https_port,
1856 						cfg->proxy_protocol_port,
1857 						reuseport, cfg->ip_transparent,
1858 						cfg->tcp_mss, cfg->ip_freebind,
1859 						cfg->http_nodelay, cfg->use_systemd,
1860 						cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1861 						listening_ports_free(list);
1862 						return NULL;
1863 					}
1864 				}
1865 			}
1866 			return list;
1867 		}
1868 		if(do_ip6) {
1869 			hints.ai_family = AF_INET6;
1870 			if(!ports_create_if(do_auto?"::0":"::1",
1871 				do_auto, cfg->do_udp, do_tcp,
1872 				&hints, portbuf, &list,
1873 				cfg->so_rcvbuf, cfg->so_sndbuf,
1874 				cfg->ssl_port, cfg->tls_additional_port,
1875 				cfg->https_port, cfg->proxy_protocol_port,
1876 				reuseport, cfg->ip_transparent,
1877 				cfg->tcp_mss, cfg->ip_freebind,
1878 				cfg->http_nodelay, cfg->use_systemd,
1879 				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1880 				listening_ports_free(list);
1881 				return NULL;
1882 			}
1883 		}
1884 		if(do_ip4) {
1885 			hints.ai_family = AF_INET;
1886 			if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1",
1887 				do_auto, cfg->do_udp, do_tcp,
1888 				&hints, portbuf, &list,
1889 				cfg->so_rcvbuf, cfg->so_sndbuf,
1890 				cfg->ssl_port, cfg->tls_additional_port,
1891 				cfg->https_port, cfg->proxy_protocol_port,
1892 				reuseport, cfg->ip_transparent,
1893 				cfg->tcp_mss, cfg->ip_freebind,
1894 				cfg->http_nodelay, cfg->use_systemd,
1895 				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1896 				listening_ports_free(list);
1897 				return NULL;
1898 			}
1899 		}
1900 	} else for(i = 0; i<num_ifs; i++) {
1901 		if(str_is_ip6(ifs[i])) {
1902 			if(!do_ip6)
1903 				continue;
1904 			hints.ai_family = AF_INET6;
1905 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1906 				do_tcp, &hints, portbuf, &list,
1907 				cfg->so_rcvbuf, cfg->so_sndbuf,
1908 				cfg->ssl_port, cfg->tls_additional_port,
1909 				cfg->https_port, cfg->proxy_protocol_port,
1910 				reuseport, cfg->ip_transparent,
1911 				cfg->tcp_mss, cfg->ip_freebind,
1912 				cfg->http_nodelay, cfg->use_systemd,
1913 				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1914 				listening_ports_free(list);
1915 				return NULL;
1916 			}
1917 		} else {
1918 			if(!do_ip4)
1919 				continue;
1920 			hints.ai_family = AF_INET;
1921 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1922 				do_tcp, &hints, portbuf, &list,
1923 				cfg->so_rcvbuf, cfg->so_sndbuf,
1924 				cfg->ssl_port, cfg->tls_additional_port,
1925 				cfg->https_port, cfg->proxy_protocol_port,
1926 				reuseport, cfg->ip_transparent,
1927 				cfg->tcp_mss, cfg->ip_freebind,
1928 				cfg->http_nodelay, cfg->use_systemd,
1929 				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
1930 				listening_ports_free(list);
1931 				return NULL;
1932 			}
1933 		}
1934 	}
1935 
1936 	return list;
1937 }
1938 
1939 void listening_ports_free(struct listen_port* list)
1940 {
1941 	struct listen_port* nx;
1942 	while(list) {
1943 		nx = list->next;
1944 		if(list->fd != -1) {
1945 			sock_close(list->fd);
1946 		}
1947 		/* rc_ports don't have ub_socket */
1948 		if(list->socket) {
1949 			if(list->socket->addr)
1950 				freeaddrinfo(list->socket->addr);
1951 			free(list->socket);
1952 		}
1953 		free(list);
1954 		list = nx;
1955 	}
1956 }
1957 
1958 size_t listen_get_mem(struct listen_dnsport* listen)
1959 {
1960 	struct listen_list* p;
1961 	size_t s = sizeof(*listen) + sizeof(*listen->base) +
1962 		sizeof(*listen->udp_buff) +
1963 		sldns_buffer_capacity(listen->udp_buff);
1964 #ifdef USE_DNSCRYPT
1965 	s += sizeof(*listen->dnscrypt_udp_buff);
1966 	if(listen->udp_buff != listen->dnscrypt_udp_buff){
1967 		s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
1968 	}
1969 #endif
1970 	for(p = listen->cps; p; p = p->next) {
1971 		s += sizeof(*p);
1972 		s += comm_point_get_mem(p->com);
1973 	}
1974 	return s;
1975 }
1976 
1977 void listen_stop_accept(struct listen_dnsport* listen)
1978 {
1979 	/* do not stop the ones that have no tcp_free list
1980 	 * (they have already stopped listening) */
1981 	struct listen_list* p;
1982 	for(p=listen->cps; p; p=p->next) {
1983 		if(p->com->type == comm_tcp_accept &&
1984 			p->com->tcp_free != NULL) {
1985 			comm_point_stop_listening(p->com);
1986 		}
1987 	}
1988 }
1989 
1990 void listen_start_accept(struct listen_dnsport* listen)
1991 {
1992 	/* do not start the ones that have no tcp_free list, it is no
1993 	 * use to listen to them because they have no free tcp handlers */
1994 	struct listen_list* p;
1995 	for(p=listen->cps; p; p=p->next) {
1996 		if(p->com->type == comm_tcp_accept &&
1997 			p->com->tcp_free != NULL) {
1998 			comm_point_start_listening(p->com, -1, -1);
1999 		}
2000 	}
2001 }
2002 
2003 struct tcp_req_info*
2004 tcp_req_info_create(struct sldns_buffer* spoolbuf)
2005 {
2006 	struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
2007 	if(!req) {
2008 		log_err("malloc failure for new stream outoforder processing structure");
2009 		return NULL;
2010 	}
2011 	memset(req, 0, sizeof(*req));
2012 	req->spool_buffer = spoolbuf;
2013 	return req;
2014 }
2015 
/**
 * Clear and free a tcp_req_info.
 * @param req: structure to delete; NULL is ignored.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req != NULL) {
		tcp_req_info_clear(req);
		/* cp points back to the commpoint that owns this struct and
		 * called delete on us; spool_buffer is the shared udp
		 * buffer. Neither is deleted here. */
		free(req);
	}
}
2026 
2027 void tcp_req_info_clear(struct tcp_req_info* req)
2028 {
2029 	struct tcp_req_open_item* open, *nopen;
2030 	struct tcp_req_done_item* item, *nitem;
2031 	if(!req) return;
2032 
2033 	/* free outstanding request mesh reply entries */
2034 	open = req->open_req_list;
2035 	while(open) {
2036 		nopen = open->next;
2037 		mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
2038 		free(open);
2039 		open = nopen;
2040 	}
2041 	req->open_req_list = NULL;
2042 	req->num_open_req = 0;
2043 
2044 	/* free pending writable result packets */
2045 	item = req->done_req_list;
2046 	while(item) {
2047 		nitem = item->next;
2048 		lock_basic_lock(&stream_wait_count_lock);
2049 		stream_wait_count -= (sizeof(struct tcp_req_done_item)
2050 			+item->len);
2051 		lock_basic_unlock(&stream_wait_count_lock);
2052 		free(item->buf);
2053 		free(item);
2054 		item = nitem;
2055 	}
2056 	req->done_req_list = NULL;
2057 	req->num_done_req = 0;
2058 	req->read_is_closed = 0;
2059 }
2060 
2061 void
2062 tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
2063 {
2064 	struct tcp_req_open_item* open, *prev = NULL;
2065 	if(!req || !m) return;
2066 	open = req->open_req_list;
2067 	while(open) {
2068 		if(open->mesh_state == m) {
2069 			struct tcp_req_open_item* next;
2070 			if(prev) prev->next = open->next;
2071 			else req->open_req_list = open->next;
2072 			/* caller has to manage the mesh state reply entry */
2073 			next = open->next;
2074 			free(open);
2075 			req->num_open_req --;
2076 
2077 			/* prev = prev; */
2078 			open = next;
2079 			continue;
2080 		}
2081 		prev = open;
2082 		open = open->next;
2083 	}
2084 }
2085 
2086 /** setup listening for read or write */
2087 static void
2088 tcp_req_info_setup_listen(struct tcp_req_info* req)
2089 {
2090 	int wr = 0;
2091 	int rd = 0;
2092 
2093 	if(req->cp->tcp_byte_count != 0) {
2094 		/* cannot change, halfway through */
2095 		return;
2096 	}
2097 
2098 	if(!req->cp->tcp_is_reading)
2099 		wr = 1;
2100 	if(!req->read_is_closed)
2101 		rd = 1;
2102 
2103 	if(wr) {
2104 		req->cp->tcp_is_reading = 0;
2105 		comm_point_stop_listening(req->cp);
2106 		comm_point_start_listening(req->cp, -1,
2107 			adjusted_tcp_timeout(req->cp));
2108 	} else if(rd) {
2109 		req->cp->tcp_is_reading = 1;
2110 		comm_point_stop_listening(req->cp);
2111 		comm_point_start_listening(req->cp, -1,
2112 			adjusted_tcp_timeout(req->cp));
2113 		/* and also read it (from SSL stack buffers), so
2114 		 * no event read event is expected since the remainder of
2115 		 * the TLS frame is sitting in the buffers. */
2116 		req->read_again = 1;
2117 	} else {
2118 		comm_point_stop_listening(req->cp);
2119 		comm_point_start_listening(req->cp, -1,
2120 			adjusted_tcp_timeout(req->cp));
2121 		comm_point_listen_for_rw(req->cp, 0, 0);
2122 	}
2123 }
2124 
2125 /** remove first item from list of pending results */
2126 static struct tcp_req_done_item*
2127 tcp_req_info_pop_done(struct tcp_req_info* req)
2128 {
2129 	struct tcp_req_done_item* item;
2130 	log_assert(req->num_done_req > 0 && req->done_req_list);
2131 	item = req->done_req_list;
2132 	lock_basic_lock(&stream_wait_count_lock);
2133 	stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
2134 	lock_basic_unlock(&stream_wait_count_lock);
2135 	req->done_req_list = req->done_req_list->next;
2136 	req->num_done_req --;
2137 	return item;
2138 }
2139 
2140 /** Send given buffer and setup to write */
2141 static void
2142 tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
2143 	size_t len)
2144 {
2145 	sldns_buffer_clear(req->cp->buffer);
2146 	sldns_buffer_write(req->cp->buffer, buf, len);
2147 	sldns_buffer_flip(req->cp->buffer);
2148 
2149 	req->cp->tcp_is_reading = 0; /* we are now writing */
2150 }
2151 
2152 /** pick up the next result and start writing it to the channel */
2153 static void
2154 tcp_req_pickup_next_result(struct tcp_req_info* req)
2155 {
2156 	if(req->num_done_req > 0) {
2157 		/* unlist the done item from the list of pending results */
2158 		struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
2159 		tcp_req_info_start_write_buf(req, item->buf, item->len);
2160 		free(item->buf);
2161 		free(item);
2162 	}
2163 }
2164 
2165 /** the read channel has closed */
2166 int
2167 tcp_req_info_handle_read_close(struct tcp_req_info* req)
2168 {
2169 	verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
2170 	/* reset byte count for (potential) partial read */
2171 	req->cp->tcp_byte_count = 0;
2172 	/* if we still have results to write, pick up next and write it */
2173 	if(req->num_done_req != 0) {
2174 		tcp_req_pickup_next_result(req);
2175 		tcp_req_info_setup_listen(req);
2176 		return 1;
2177 	}
2178 	/* if nothing to do, this closes the connection */
2179 	if(req->num_open_req == 0 && req->num_done_req == 0)
2180 		return 0;
2181 	/* otherwise, we must be waiting for dns resolve, wait with timeout */
2182 	req->read_is_closed = 1;
2183 	tcp_req_info_setup_listen(req);
2184 	return 1;
2185 }
2186 
2187 void
2188 tcp_req_info_handle_writedone(struct tcp_req_info* req)
2189 {
2190 	/* back to reading state, we finished this write event */
2191 	sldns_buffer_clear(req->cp->buffer);
2192 	if(req->num_done_req == 0 && req->read_is_closed) {
2193 		/* no more to write and nothing to read, close it */
2194 		comm_point_drop_reply(&req->cp->repinfo);
2195 		return;
2196 	}
2197 	req->cp->tcp_is_reading = 1;
2198 	/* see if another result needs writing */
2199 	tcp_req_pickup_next_result(req);
2200 
2201 	/* see if there is more to write, if not stop_listening for writing */
2202 	/* see if new requests are allowed, if so, start_listening
2203 	 * for reading */
2204 	tcp_req_info_setup_listen(req);
2205 }
2206 
2207 void
2208 tcp_req_info_handle_readdone(struct tcp_req_info* req)
2209 {
2210 	struct comm_point* c = req->cp;
2211 
2212 	/* we want to read up several requests, unless there are
2213 	 * pending answers */
2214 
2215 	req->is_drop = 0;
2216 	req->is_reply = 0;
2217 	req->in_worker_handle = 1;
2218 	sldns_buffer_set_limit(req->spool_buffer, 0);
2219 	/* handle the current request */
2220 	/* this calls the worker handle request routine that could give
2221 	 * a cache response, or localdata response, or drop the reply,
2222 	 * or schedule a mesh entry for later */
2223 	fptr_ok(fptr_whitelist_comm_point(c->callback));
2224 	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
2225 		req->in_worker_handle = 0;
2226 		/* there is an answer, put it up.  It is already in the
2227 		 * c->buffer, just send it. */
2228 		/* since we were just reading a query, the channel is
2229 		 * clear to write to */
2230 	send_it:
2231 		c->tcp_is_reading = 0;
2232 		comm_point_stop_listening(c);
2233 		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
2234 		return;
2235 	}
2236 	req->in_worker_handle = 0;
2237 	/* it should be waiting in the mesh for recursion.
2238 	 * If mesh failed to add a new entry and called commpoint_drop_reply.
2239 	 * Then the mesh state has been cleared. */
2240 	if(req->is_drop) {
2241 		/* the reply has been dropped, stream has been closed. */
2242 		return;
2243 	}
2244 	/* If mesh failed(mallocfail) and called commpoint_send_reply with
2245 	 * something like servfail then we pick up that reply below. */
2246 	if(req->is_reply) {
2247 		goto send_it;
2248 	}
2249 
2250 	sldns_buffer_clear(c->buffer);
2251 	/* if pending answers, pick up an answer and start sending it */
2252 	tcp_req_pickup_next_result(req);
2253 
2254 	/* if answers pending, start sending answers */
2255 	/* read more requests if we can have more requests */
2256 	tcp_req_info_setup_listen(req);
2257 }
2258 
2259 int
2260 tcp_req_info_add_meshstate(struct tcp_req_info* req,
2261 	struct mesh_area* mesh, struct mesh_state* m)
2262 {
2263 	struct tcp_req_open_item* item;
2264 	log_assert(req && mesh && m);
2265 	item = (struct tcp_req_open_item*)malloc(sizeof(*item));
2266 	if(!item) return 0;
2267 	item->next = req->open_req_list;
2268 	item->mesh = mesh;
2269 	item->mesh_state = m;
2270 	req->open_req_list = item;
2271 	req->num_open_req++;
2272 	return 1;
2273 }
2274 
2275 /** Add a result to the result list.  At the end. */
2276 static int
2277 tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
2278 {
2279 	struct tcp_req_done_item* last = NULL;
2280 	struct tcp_req_done_item* item;
2281 	size_t space;
2282 
2283 	/* see if we have space */
2284 	space = sizeof(struct tcp_req_done_item) + len;
2285 	lock_basic_lock(&stream_wait_count_lock);
2286 	if(stream_wait_count + space > stream_wait_max) {
2287 		lock_basic_unlock(&stream_wait_count_lock);
2288 		verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
2289 		return 0;
2290 	}
2291 	stream_wait_count += space;
2292 	lock_basic_unlock(&stream_wait_count_lock);
2293 
2294 	/* find last element */
2295 	last = req->done_req_list;
2296 	while(last && last->next)
2297 		last = last->next;
2298 
2299 	/* create new element */
2300 	item = (struct tcp_req_done_item*)malloc(sizeof(*item));
2301 	if(!item) {
2302 		log_err("malloc failure, for stream result list");
2303 		return 0;
2304 	}
2305 	item->next = NULL;
2306 	item->len = len;
2307 	item->buf = memdup(buf, len);
2308 	if(!item->buf) {
2309 		free(item);
2310 		log_err("malloc failure, adding reply to stream result list");
2311 		return 0;
2312 	}
2313 
2314 	/* link in */
2315 	if(last) last->next = item;
2316 	else req->done_req_list = item;
2317 	req->num_done_req++;
2318 	return 1;
2319 }
2320 
2321 void
2322 tcp_req_info_send_reply(struct tcp_req_info* req)
2323 {
2324 	if(req->in_worker_handle) {
2325 		/* reply from mesh is in the spool_buffer */
2326 		/* copy now, so that the spool buffer is free for other tasks
2327 		 * before the callback is done */
2328 		sldns_buffer_clear(req->cp->buffer);
2329 		sldns_buffer_write(req->cp->buffer,
2330 			sldns_buffer_begin(req->spool_buffer),
2331 			sldns_buffer_limit(req->spool_buffer));
2332 		sldns_buffer_flip(req->cp->buffer);
2333 		req->is_reply = 1;
2334 		return;
2335 	}
2336 	/* now that the query has been handled, that mesh_reply entry
2337 	 * should be removed, from the tcp_req_info list,
2338 	 * the mesh state cleanup removes then with region_cleanup and
2339 	 * replies_sent true. */
2340 	/* see if we can send it straight away (we are not doing
2341 	 * anything else).  If so, copy to buffer and start */
2342 	if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
2343 		/* buffer is free, and was ready to read new query into,
2344 		 * but we are now going to use it to send this answer */
2345 		tcp_req_info_start_write_buf(req,
2346 			sldns_buffer_begin(req->spool_buffer),
2347 			sldns_buffer_limit(req->spool_buffer));
2348 		/* switch to listen to write events */
2349 		comm_point_stop_listening(req->cp);
2350 		comm_point_start_listening(req->cp, -1,
2351 			adjusted_tcp_timeout(req->cp));
2352 		return;
2353 	}
2354 	/* queue up the answer behind the others already pending */
2355 	if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
2356 		sldns_buffer_limit(req->spool_buffer))) {
2357 		/* drop the connection, we are out of resources */
2358 		comm_point_drop_reply(&req->cp->repinfo);
2359 	}
2360 }
2361 
2362 size_t tcp_req_info_get_stream_buffer_size(void)
2363 {
2364 	size_t s;
2365 	if(!stream_wait_lock_inited)
2366 		return stream_wait_count;
2367 	lock_basic_lock(&stream_wait_count_lock);
2368 	s = stream_wait_count;
2369 	lock_basic_unlock(&stream_wait_count_lock);
2370 	return s;
2371 }
2372 
2373 size_t http2_get_query_buffer_size(void)
2374 {
2375 	size_t s;
2376 	if(!http2_query_buffer_lock_inited)
2377 		return http2_query_buffer_count;
2378 	lock_basic_lock(&http2_query_buffer_count_lock);
2379 	s = http2_query_buffer_count;
2380 	lock_basic_unlock(&http2_query_buffer_count_lock);
2381 	return s;
2382 }
2383 
2384 size_t http2_get_response_buffer_size(void)
2385 {
2386 	size_t s;
2387 	if(!http2_response_buffer_lock_inited)
2388 		return http2_response_buffer_count;
2389 	lock_basic_lock(&http2_response_buffer_count_lock);
2390 	s = http2_response_buffer_count;
2391 	lock_basic_unlock(&http2_response_buffer_count_lock);
2392 	return s;
2393 }
2394 
2395 #ifdef HAVE_NGHTTP2
2396 /** nghttp2 callback. Used to copy response from rbuffer to nghttp2 session */
2397 static ssize_t http2_submit_response_read_callback(
2398 	nghttp2_session* ATTR_UNUSED(session),
2399 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2400 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2401 {
2402 	struct http2_stream* h2_stream;
2403 	struct http2_session* h2_session = source->ptr;
2404 	size_t copylen = length;
2405 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2406 		h2_session->session, stream_id))) {
2407 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2408 			"stream");
2409 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2410 	}
2411 	if(!h2_stream->rbuffer ||
2412 		sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2413 		verbose(VERB_QUERY, "http2: cannot submit buffer. No data "
2414 			"available in rbuffer");
2415 		/* rbuffer will be free'd in frame close cb */
2416 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2417 	}
2418 
2419 	if(copylen > sldns_buffer_remaining(h2_stream->rbuffer))
2420 		copylen = sldns_buffer_remaining(h2_stream->rbuffer);
2421 	if(copylen > SSIZE_MAX)
2422 		copylen = SSIZE_MAX; /* will probably never happen */
2423 
2424 	memcpy(buf, sldns_buffer_current(h2_stream->rbuffer), copylen);
2425 	sldns_buffer_skip(h2_stream->rbuffer, copylen);
2426 
2427 	if(sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2428 		*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2429 		lock_basic_lock(&http2_response_buffer_count_lock);
2430 		http2_response_buffer_count -=
2431 			sldns_buffer_capacity(h2_stream->rbuffer);
2432 		lock_basic_unlock(&http2_response_buffer_count_lock);
2433 		sldns_buffer_free(h2_stream->rbuffer);
2434 		h2_stream->rbuffer = NULL;
2435 	}
2436 
2437 	return copylen;
2438 }
2439 
2440 /**
2441  * Send RST_STREAM frame for stream.
2442  * @param h2_session: http2 session to submit frame to
2443  * @param h2_stream: http2 stream containing frame ID to use in RST_STREAM
2444  * @return 0 on error, 1 otherwise
2445  */
2446 static int http2_submit_rst_stream(struct http2_session* h2_session,
2447 		struct http2_stream* h2_stream)
2448 {
2449 	int ret = nghttp2_submit_rst_stream(h2_session->session,
2450 		NGHTTP2_FLAG_NONE, h2_stream->stream_id,
2451 		NGHTTP2_INTERNAL_ERROR);
2452 	if(ret) {
2453 		verbose(VERB_QUERY, "http2: nghttp2_submit_rst_stream failed, "
2454 			"error: %s", nghttp2_strerror(ret));
2455 		return 0;
2456 	}
2457 	return 1;
2458 }
2459 
2460 /**
2461  * DNS response ready to be submitted to nghttp2, to be prepared for sending
2462  * out. Response is stored in c->buffer. Copy to rbuffer because the c->buffer
2463  * might be used before this will be sent out.
2464  * @param h2_session: http2 session, containing c->buffer which contains answer
2465  * @return 0 on error, 1 otherwise
2466  */
2467 int http2_submit_dns_response(struct http2_session* h2_session)
2468 {
2469 	int ret;
2470 	nghttp2_data_provider data_prd;
2471 	char status[4];
2472 	nghttp2_nv headers[3];
2473 	struct http2_stream* h2_stream = h2_session->c->h2_stream;
2474 	size_t rlen;
2475 	char rlen_str[32];
2476 
2477 	if(h2_stream->rbuffer) {
2478 		log_err("http2 submit response error: rbuffer already "
2479 			"exists");
2480 		return 0;
2481 	}
2482 	if(sldns_buffer_remaining(h2_session->c->buffer) == 0) {
2483 		log_err("http2 submit response error: c->buffer not complete");
2484 		return 0;
2485 	}
2486 
2487 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2488 		verbose(VERB_QUERY, "http2: submit response error: "
2489 			"invalid status");
2490 		return 0;
2491 	}
2492 
2493 	rlen = sldns_buffer_remaining(h2_session->c->buffer);
2494 	snprintf(rlen_str, sizeof(rlen_str), "%u", (unsigned)rlen);
2495 
2496 	lock_basic_lock(&http2_response_buffer_count_lock);
2497 	if(http2_response_buffer_count + rlen > http2_response_buffer_max) {
2498 		lock_basic_unlock(&http2_response_buffer_count_lock);
2499 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2500 			"in https-response-buffer-size");
2501 		return http2_submit_rst_stream(h2_session, h2_stream);
2502 	}
2503 	http2_response_buffer_count += rlen;
2504 	lock_basic_unlock(&http2_response_buffer_count_lock);
2505 
2506 	if(!(h2_stream->rbuffer = sldns_buffer_new(rlen))) {
2507 		lock_basic_lock(&http2_response_buffer_count_lock);
2508 		http2_response_buffer_count -= rlen;
2509 		lock_basic_unlock(&http2_response_buffer_count_lock);
2510 		log_err("http2 submit response error: malloc failure");
2511 		return 0;
2512 	}
2513 
2514 	headers[0].name = (uint8_t*)":status";
2515 	headers[0].namelen = 7;
2516 	headers[0].value = (uint8_t*)status;
2517 	headers[0].valuelen = 3;
2518 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2519 
2520 	headers[1].name = (uint8_t*)"content-type";
2521 	headers[1].namelen = 12;
2522 	headers[1].value = (uint8_t*)"application/dns-message";
2523 	headers[1].valuelen = 23;
2524 	headers[1].flags = NGHTTP2_NV_FLAG_NONE;
2525 
2526 	headers[2].name = (uint8_t*)"content-length";
2527 	headers[2].namelen = 14;
2528 	headers[2].value = (uint8_t*)rlen_str;
2529 	headers[2].valuelen = strlen(rlen_str);
2530 	headers[2].flags = NGHTTP2_NV_FLAG_NONE;
2531 
2532 	sldns_buffer_write(h2_stream->rbuffer,
2533 		sldns_buffer_current(h2_session->c->buffer),
2534 		sldns_buffer_remaining(h2_session->c->buffer));
2535 	sldns_buffer_flip(h2_stream->rbuffer);
2536 
2537 	data_prd.source.ptr = h2_session;
2538 	data_prd.read_callback = http2_submit_response_read_callback;
2539 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2540 		headers, 3, &data_prd);
2541 	if(ret) {
2542 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2543 			"error: %s", nghttp2_strerror(ret));
2544 		return 0;
2545 	}
2546 	return 1;
2547 }
2548 #else
/** Variant compiled when HAVE_NGHTTP2 is not defined: nothing can be
 * submitted, the argument is unused and the error value 0 is returned. */
int http2_submit_dns_response(void* ATTR_UNUSED(v))
{
	return 0;
}
2553 #endif
2554 
2555 #ifdef HAVE_NGHTTP2
2556 /** HTTP status to descriptive string */
2557 static char* http_status_to_str(enum http_status s)
2558 {
2559 	switch(s) {
2560 		case HTTP_STATUS_OK:
2561 			return "OK";
2562 		case HTTP_STATUS_BAD_REQUEST:
2563 			return "Bad Request";
2564 		case HTTP_STATUS_NOT_FOUND:
2565 			return "Not Found";
2566 		case HTTP_STATUS_PAYLOAD_TOO_LARGE:
2567 			return "Payload Too Large";
2568 		case HTTP_STATUS_URI_TOO_LONG:
2569 			return "URI Too Long";
2570 		case HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE:
2571 			return "Unsupported Media Type";
2572 		case HTTP_STATUS_NOT_IMPLEMENTED:
2573 			return "Not Implemented";
2574 	}
2575 	return "Status Unknown";
2576 }
2577 
2578 /** nghttp2 callback. Used to copy error message to nghttp2 session */
2579 static ssize_t http2_submit_error_read_callback(
2580 	nghttp2_session* ATTR_UNUSED(session),
2581 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2582 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2583 {
2584 	struct http2_stream* h2_stream;
2585 	struct http2_session* h2_session = source->ptr;
2586 	char* msg;
2587 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2588 		h2_session->session, stream_id))) {
2589 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2590 			"stream");
2591 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2592 	}
2593 	*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2594 	msg = http_status_to_str(h2_stream->status);
2595 	if(length < strlen(msg))
2596 		return 0; /* not worth trying over multiple frames */
2597 	memcpy(buf, msg, strlen(msg));
2598 	return strlen(msg);
2599 
2600 }
2601 
2602 /**
2603  * HTTP error response ready to be submitted to nghttp2, to be prepared for
2604  * sending out. Message body will contain descriptive string for HTTP status.
2605  * @param h2_session: http2 session to submit to
2606  * @param h2_stream: http2 stream containing HTTP status to use for error
2607  * @return 0 on error, 1 otherwise
2608  */
2609 static int http2_submit_error(struct http2_session* h2_session,
2610 	struct http2_stream* h2_stream)
2611 {
2612 	int ret;
2613 	char status[4];
2614 	nghttp2_data_provider data_prd;
2615 	nghttp2_nv headers[1]; /* will be copied by nghttp */
2616 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2617 		verbose(VERB_QUERY, "http2: submit error failed, "
2618 			"invalid status");
2619 		return 0;
2620 	}
2621 	headers[0].name = (uint8_t*)":status";
2622 	headers[0].namelen = 7;
2623 	headers[0].value = (uint8_t*)status;
2624 	headers[0].valuelen = 3;
2625 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2626 
2627 	data_prd.source.ptr = h2_session;
2628 	data_prd.read_callback = http2_submit_error_read_callback;
2629 
2630 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2631 		headers, 1, &data_prd);
2632 	if(ret) {
2633 		verbose(VERB_QUERY, "http2: submit error failed, "
2634 			"error: %s", nghttp2_strerror(ret));
2635 		return 0;
2636 	}
2637 	return 1;
2638 }
2639 
2640 /**
2641  * Start query handling. Query is stored in the stream, and will be free'd here.
2642  * @param h2_session: http2 session, containing comm point
2643  * @param h2_stream: stream containing buffered query
2644  * @return: -1 on error, 1 if answer is stored in c->buffer, 0 if there is no
2645  * reply available (yet).
2646  */
2647 static int http2_query_read_done(struct http2_session* h2_session,
2648 	struct http2_stream* h2_stream)
2649 {
2650 	log_assert(h2_stream->qbuffer);
2651 
2652 	if(h2_session->c->h2_stream) {
2653 		verbose(VERB_ALGO, "http2_query_read_done failure: shared "
2654 			"buffer already assigned to stream");
2655 		return -1;
2656 	}
2657 
2658     /* the c->buffer might be used by mesh_send_reply and no be cleard
2659 	 * need to be cleared before use */
2660 	sldns_buffer_clear(h2_session->c->buffer);
2661 	if(sldns_buffer_remaining(h2_session->c->buffer) <
2662 		sldns_buffer_remaining(h2_stream->qbuffer)) {
2663 		/* qbuffer will be free'd in frame close cb */
2664 		sldns_buffer_clear(h2_session->c->buffer);
2665 		verbose(VERB_ALGO, "http2_query_read_done failure: can't fit "
2666 			"qbuffer in c->buffer");
2667 		return -1;
2668 	}
2669 
2670 	sldns_buffer_write(h2_session->c->buffer,
2671 		sldns_buffer_current(h2_stream->qbuffer),
2672 		sldns_buffer_remaining(h2_stream->qbuffer));
2673 
2674 	lock_basic_lock(&http2_query_buffer_count_lock);
2675 	http2_query_buffer_count -= sldns_buffer_capacity(h2_stream->qbuffer);
2676 	lock_basic_unlock(&http2_query_buffer_count_lock);
2677 	sldns_buffer_free(h2_stream->qbuffer);
2678 	h2_stream->qbuffer = NULL;
2679 
2680 	sldns_buffer_flip(h2_session->c->buffer);
2681 	h2_session->c->h2_stream = h2_stream;
2682 	fptr_ok(fptr_whitelist_comm_point(h2_session->c->callback));
2683 	if((*h2_session->c->callback)(h2_session->c, h2_session->c->cb_arg,
2684 		NETEVENT_NOERROR, &h2_session->c->repinfo)) {
2685 		return 1; /* answer in c->buffer */
2686 	}
2687 	sldns_buffer_clear(h2_session->c->buffer);
2688 	h2_session->c->h2_stream = NULL;
2689 	return 0; /* mesh state added, or dropped */
2690 }
2691 
2692 /** nghttp2 callback. Used to check if the received frame indicates the end of a
2693  * stream. Gather collected request data and start query handling. */
2694 static int http2_req_frame_recv_cb(nghttp2_session* session,
2695 	const nghttp2_frame* frame, void* cb_arg)
2696 {
2697 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2698 	struct http2_stream* h2_stream;
2699 	int query_read_done;
2700 
2701 	if((frame->hd.type != NGHTTP2_DATA &&
2702 		frame->hd.type != NGHTTP2_HEADERS) ||
2703 		!(frame->hd.flags & NGHTTP2_FLAG_END_STREAM)) {
2704 			return 0;
2705 	}
2706 
2707 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2708 		session, frame->hd.stream_id)))
2709 		return 0;
2710 
2711 	if(h2_stream->invalid_endpoint) {
2712 		h2_stream->status = HTTP_STATUS_NOT_FOUND;
2713 		goto submit_http_error;
2714 	}
2715 
2716 	if(h2_stream->invalid_content_type) {
2717 		h2_stream->status = HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE;
2718 		goto submit_http_error;
2719 	}
2720 
2721 	if(h2_stream->http_method != HTTP_METHOD_GET &&
2722 		h2_stream->http_method != HTTP_METHOD_POST) {
2723 		h2_stream->status = HTTP_STATUS_NOT_IMPLEMENTED;
2724 		goto submit_http_error;
2725 	}
2726 
2727 	if(h2_stream->query_too_large) {
2728 		if(h2_stream->http_method == HTTP_METHOD_POST)
2729 			h2_stream->status = HTTP_STATUS_PAYLOAD_TOO_LARGE;
2730 		else
2731 			h2_stream->status = HTTP_STATUS_URI_TOO_LONG;
2732 		goto submit_http_error;
2733 	}
2734 
2735 	if(!h2_stream->qbuffer) {
2736 		h2_stream->status = HTTP_STATUS_BAD_REQUEST;
2737 		goto submit_http_error;
2738 	}
2739 
2740 	if(h2_stream->status) {
2741 submit_http_error:
2742 		verbose(VERB_QUERY, "http2 request invalid, returning :status="
2743 			"%d", h2_stream->status);
2744 		if(!http2_submit_error(h2_session, h2_stream)) {
2745 			return NGHTTP2_ERR_CALLBACK_FAILURE;
2746 		}
2747 		return 0;
2748 	}
2749 	h2_stream->status = HTTP_STATUS_OK;
2750 
2751 	sldns_buffer_flip(h2_stream->qbuffer);
2752 	h2_session->postpone_drop = 1;
2753 	query_read_done = http2_query_read_done(h2_session, h2_stream);
2754 	if(query_read_done < 0)
2755 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2756 	else if(!query_read_done) {
2757 		if(h2_session->is_drop) {
2758 			/* connection needs to be closed. Return failure to make
2759 			 * sure no other action are taken anymore on comm point.
2760 			 * failure will result in reclaiming (and closing)
2761 			 * of comm point. */
2762 			verbose(VERB_QUERY, "http2 query dropped in worker cb");
2763 			h2_session->postpone_drop = 0;
2764 			return NGHTTP2_ERR_CALLBACK_FAILURE;
2765 		}
2766 		/* nothing to submit right now, query added to mesh. */
2767 		h2_session->postpone_drop = 0;
2768 		return 0;
2769 	}
2770 	if(!http2_submit_dns_response(h2_session)) {
2771 		sldns_buffer_clear(h2_session->c->buffer);
2772 		h2_session->c->h2_stream = NULL;
2773 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2774 	}
2775 	verbose(VERB_QUERY, "http2 query submitted to session");
2776 	sldns_buffer_clear(h2_session->c->buffer);
2777 	h2_session->c->h2_stream = NULL;
2778 	return 0;
2779 }
2780 
2781 /** nghttp2 callback. Used to detect start of new streams. */
2782 static int http2_req_begin_headers_cb(nghttp2_session* session,
2783 	const nghttp2_frame* frame, void* cb_arg)
2784 {
2785 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2786 	struct http2_stream* h2_stream;
2787 	int ret;
2788 	if(frame->hd.type != NGHTTP2_HEADERS ||
2789 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2790 		/* only interested in request headers */
2791 		return 0;
2792 	}
2793 	if(!(h2_stream = http2_stream_create(frame->hd.stream_id))) {
2794 		log_err("malloc failure while creating http2 stream");
2795 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2796 	}
2797 	http2_session_add_stream(h2_session, h2_stream);
2798 	ret = nghttp2_session_set_stream_user_data(session,
2799 		frame->hd.stream_id, h2_stream);
2800 	if(ret) {
2801 		/* stream does not exist */
2802 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2803 			"error: %s", nghttp2_strerror(ret));
2804 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2805 	}
2806 
2807 	return 0;
2808 }
2809 
2810 /**
2811  * base64url decode, store in qbuffer
2812  * @param h2_session: http2 session
2813  * @param h2_stream: http2 stream
2814  * @param start: start of the base64 string
2815  * @param length: length of the base64 string
2816  * @return: 0 on error, 1 otherwise. query will be stored in h2_stream->qbuffer,
2817  * buffer will be NULL is unparseble.
2818  */
2819 static int http2_buffer_uri_query(struct http2_session* h2_session,
2820 	struct http2_stream* h2_stream, const uint8_t* start, size_t length)
2821 {
2822 	size_t expectb64len;
2823 	int b64len;
2824 	if(h2_stream->http_method == HTTP_METHOD_POST)
2825 		return 1;
2826 	if(length == 0)
2827 		return 1;
2828 	if(h2_stream->qbuffer) {
2829 		verbose(VERB_ALGO, "http2_req_header fail, "
2830 			"qbuffer already set");
2831 		return 0;
2832 	}
2833 
2834 	/* calculate size, might be a bit bigger than the real
2835 	 * decoded buffer size */
2836 	expectb64len = sldns_b64_pton_calculate_size(length);
2837 	log_assert(expectb64len > 0);
2838 	if(expectb64len >
2839 		h2_session->c->http2_stream_max_qbuffer_size) {
2840 		h2_stream->query_too_large = 1;
2841 		return 1;
2842 	}
2843 
2844 	lock_basic_lock(&http2_query_buffer_count_lock);
2845 	if(http2_query_buffer_count + expectb64len > http2_query_buffer_max) {
2846 		lock_basic_unlock(&http2_query_buffer_count_lock);
2847 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2848 			"in http2-query-buffer-size");
2849 		return http2_submit_rst_stream(h2_session, h2_stream);
2850 	}
2851 	http2_query_buffer_count += expectb64len;
2852 	lock_basic_unlock(&http2_query_buffer_count_lock);
2853 	if(!(h2_stream->qbuffer = sldns_buffer_new(expectb64len))) {
2854 		lock_basic_lock(&http2_query_buffer_count_lock);
2855 		http2_query_buffer_count -= expectb64len;
2856 		lock_basic_unlock(&http2_query_buffer_count_lock);
2857 		log_err("http2_req_header fail, qbuffer "
2858 			"malloc failure");
2859 		return 0;
2860 	}
2861 
2862 	if(sldns_b64_contains_nonurl((char const*)start, length)) {
2863 		char buf[65536+4];
2864 		verbose(VERB_ALGO, "HTTP2 stream contains wrong b64 encoding");
2865 		/* copy to the scratch buffer temporarily to terminate the
2866 		 * string with a zero */
2867 		if(length+1 > sizeof(buf)) {
2868 			/* too long */
2869 			lock_basic_lock(&http2_query_buffer_count_lock);
2870 			http2_query_buffer_count -= expectb64len;
2871 			lock_basic_unlock(&http2_query_buffer_count_lock);
2872 			sldns_buffer_free(h2_stream->qbuffer);
2873 			h2_stream->qbuffer = NULL;
2874 			return 1;
2875 		}
2876 		memmove(buf, start, length);
2877 		buf[length] = 0;
2878 		if(!(b64len = sldns_b64_pton(buf, sldns_buffer_current(
2879 			h2_stream->qbuffer), expectb64len)) || b64len < 0) {
2880 			lock_basic_lock(&http2_query_buffer_count_lock);
2881 			http2_query_buffer_count -= expectb64len;
2882 			lock_basic_unlock(&http2_query_buffer_count_lock);
2883 			sldns_buffer_free(h2_stream->qbuffer);
2884 			h2_stream->qbuffer = NULL;
2885 			return 1;
2886 		}
2887 	} else {
2888 		if(!(b64len = sldns_b64url_pton(
2889 			(char const *)start, length,
2890 			sldns_buffer_current(h2_stream->qbuffer),
2891 			expectb64len)) || b64len < 0) {
2892 			lock_basic_lock(&http2_query_buffer_count_lock);
2893 			http2_query_buffer_count -= expectb64len;
2894 			lock_basic_unlock(&http2_query_buffer_count_lock);
2895 			sldns_buffer_free(h2_stream->qbuffer);
2896 			h2_stream->qbuffer = NULL;
2897 			/* return without error, method can be an
2898 			 * unknown POST */
2899 			return 1;
2900 		}
2901 	}
2902 	sldns_buffer_skip(h2_stream->qbuffer, (size_t)b64len);
2903 	return 1;
2904 }
2905 
2906 /** nghttp2 callback. Used to parse headers from HEADER frames. */
2907 static int http2_req_header_cb(nghttp2_session* session,
2908 	const nghttp2_frame* frame, const uint8_t* name, size_t namelen,
2909 	const uint8_t* value, size_t valuelen, uint8_t ATTR_UNUSED(flags),
2910 	void* cb_arg)
2911 {
2912 	struct http2_stream* h2_stream = NULL;
2913 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2914 	/* nghttp2 deals with CONTINUATION frames and provides them as part of
2915 	 * the HEADER */
2916 	if(frame->hd.type != NGHTTP2_HEADERS ||
2917 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2918 		/* only interested in request headers */
2919 		return 0;
2920 	}
2921 	if(!(h2_stream = nghttp2_session_get_stream_user_data(session,
2922 		frame->hd.stream_id)))
2923 		return 0;
2924 
2925 	/* earlier checks already indicate we can stop handling this query */
2926 	if(h2_stream->http_method == HTTP_METHOD_UNSUPPORTED ||
2927 		h2_stream->invalid_content_type ||
2928 		h2_stream->invalid_endpoint)
2929 		return 0;
2930 
2931 
2932 	/* nghttp2 performs some sanity checks in the headers, including:
2933 	 * name and value are guaranteed to be null terminated
2934 	 * name is guaranteed to be lowercase
2935 	 * content-length value is guaranteed to contain digits
2936 	 */
2937 
2938 	if(!h2_stream->http_method && namelen == 7 &&
2939 		memcmp(":method", name, namelen) == 0) {
2940 		/* Case insensitive check on :method value to be on the safe
2941 		 * side. I failed to find text about case sensitivity in specs.
2942 		 */
2943 		if(valuelen == 3 && strcasecmp("GET", (const char*)value) == 0)
2944 			h2_stream->http_method = HTTP_METHOD_GET;
2945 		else if(valuelen == 4 &&
2946 			strcasecmp("POST", (const char*)value) == 0) {
2947 			h2_stream->http_method = HTTP_METHOD_POST;
2948 			if(h2_stream->qbuffer) {
2949 				/* POST method uses query from DATA frames */
2950 				lock_basic_lock(&http2_query_buffer_count_lock);
2951 				http2_query_buffer_count -=
2952 					sldns_buffer_capacity(h2_stream->qbuffer);
2953 				lock_basic_unlock(&http2_query_buffer_count_lock);
2954 				sldns_buffer_free(h2_stream->qbuffer);
2955 				h2_stream->qbuffer = NULL;
2956 			}
2957 		} else
2958 			h2_stream->http_method = HTTP_METHOD_UNSUPPORTED;
2959 		return 0;
2960 	}
2961 	if(namelen == 5 && memcmp(":path", name, namelen) == 0) {
2962 		/* :path may contain DNS query, depending on method. Method might
2963 		 * not be known yet here, so check after finishing receiving
2964 		 * stream. */
2965 #define	HTTP_QUERY_PARAM "?dns="
2966 		size_t el = strlen(h2_session->c->http_endpoint);
2967 		size_t qpl = strlen(HTTP_QUERY_PARAM);
2968 
2969 		if(valuelen < el || memcmp(h2_session->c->http_endpoint,
2970 			value, el) != 0) {
2971 			h2_stream->invalid_endpoint = 1;
2972 			return 0;
2973 		}
2974 		/* larger than endpoint only allowed if it is for the query
2975 		 * parameter */
2976 		if(valuelen <= el+qpl ||
2977 			memcmp(HTTP_QUERY_PARAM, value+el, qpl) != 0) {
2978 			if(valuelen != el)
2979 				h2_stream->invalid_endpoint = 1;
2980 			return 0;
2981 		}
2982 
2983 		if(!http2_buffer_uri_query(h2_session, h2_stream,
2984 			value+(el+qpl), valuelen-(el+qpl))) {
2985 			return NGHTTP2_ERR_CALLBACK_FAILURE;
2986 		}
2987 		return 0;
2988 	}
2989 	/* Content type is a SHOULD (rfc7231#section-3.1.1.5) when using POST,
2990 	 * and not needed when using GET. Don't enfore.
2991 	 * If set only allow lowercase "application/dns-message".
2992 	 *
2993 	 * Clients SHOULD (rfc8484#section-4.1) set an accept header, but MUST
2994 	 * be able to handle "application/dns-message". Since that is the only
2995 	 * content-type supported we can ignore the accept header.
2996 	 */
2997 	if((namelen == 12 && memcmp("content-type", name, namelen) == 0)) {
2998 		if(valuelen != 23 || memcmp("application/dns-message", value,
2999 			valuelen) != 0) {
3000 			h2_stream->invalid_content_type = 1;
3001 		}
3002 	}
3003 
3004 	/* Only interested in content-lentg for POST (on not yet known) method.
3005 	 */
3006 	if((!h2_stream->http_method ||
3007 		h2_stream->http_method == HTTP_METHOD_POST) &&
3008 		!h2_stream->content_length && namelen  == 14 &&
3009 		memcmp("content-length", name, namelen) == 0) {
3010 		if(valuelen > 5) {
3011 			h2_stream->query_too_large = 1;
3012 			return 0;
3013 		}
3014 		/* guaranteed to only contain digits and be null terminated */
3015 		h2_stream->content_length = atoi((const char*)value);
3016 		if(h2_stream->content_length >
3017 			h2_session->c->http2_stream_max_qbuffer_size) {
3018 			h2_stream->query_too_large = 1;
3019 			return 0;
3020 		}
3021 	}
3022 	return 0;
3023 }
3024 
3025 /** nghttp2 callback. Used to get data from DATA frames, which can contain
3026  * queries in POST requests. */
3027 static int http2_req_data_chunk_recv_cb(nghttp2_session* ATTR_UNUSED(session),
3028 	uint8_t ATTR_UNUSED(flags), int32_t stream_id, const uint8_t* data,
3029 	size_t len, void* cb_arg)
3030 {
3031 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
3032 	struct http2_stream* h2_stream;
3033 	size_t qlen = 0;
3034 
3035 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
3036 		h2_session->session, stream_id))) {
3037 		return 0;
3038 	}
3039 
3040 	if(h2_stream->query_too_large)
3041 		return 0;
3042 
3043 	if(!h2_stream->qbuffer) {
3044 		if(h2_stream->content_length) {
3045 			if(h2_stream->content_length < len)
3046 				/* getting more data in DATA frame than
3047 				 * advertised in content-length header. */
3048 				return NGHTTP2_ERR_CALLBACK_FAILURE;
3049 			qlen = h2_stream->content_length;
3050 		} else if(len <= h2_session->c->http2_stream_max_qbuffer_size) {
3051 			/* setting this to msg-buffer-size can result in a lot
3052 			 * of memory consuption. Most queries should fit in a
3053 			 * single DATA frame, and most POST queries will
3054 			 * contain content-length which does not impose this
3055 			 * limit. */
3056 			qlen = len;
3057 		}
3058 	}
3059 	if(!h2_stream->qbuffer && qlen) {
3060 		lock_basic_lock(&http2_query_buffer_count_lock);
3061 		if(http2_query_buffer_count + qlen > http2_query_buffer_max) {
3062 			lock_basic_unlock(&http2_query_buffer_count_lock);
3063 			verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
3064 				"in http2-query-buffer-size");
3065 			return http2_submit_rst_stream(h2_session, h2_stream);
3066 		}
3067 		http2_query_buffer_count += qlen;
3068 		lock_basic_unlock(&http2_query_buffer_count_lock);
3069 		if(!(h2_stream->qbuffer = sldns_buffer_new(qlen))) {
3070 			lock_basic_lock(&http2_query_buffer_count_lock);
3071 			http2_query_buffer_count -= qlen;
3072 			lock_basic_unlock(&http2_query_buffer_count_lock);
3073 		}
3074 	}
3075 
3076 	if(!h2_stream->qbuffer ||
3077 		sldns_buffer_remaining(h2_stream->qbuffer) < len) {
3078 		verbose(VERB_ALGO, "http2 data_chunck_recv failed. Not enough "
3079 			"buffer space for POST query. Can happen on multi "
3080 			"frame requests without content-length header");
3081 		h2_stream->query_too_large = 1;
3082 		return 0;
3083 	}
3084 
3085 	sldns_buffer_write(h2_stream->qbuffer, data, len);
3086 
3087 	return 0;
3088 }
3089 
3090 void http2_req_stream_clear(struct http2_stream* h2_stream)
3091 {
3092 	if(h2_stream->qbuffer) {
3093 		lock_basic_lock(&http2_query_buffer_count_lock);
3094 		http2_query_buffer_count -=
3095 			sldns_buffer_capacity(h2_stream->qbuffer);
3096 		lock_basic_unlock(&http2_query_buffer_count_lock);
3097 		sldns_buffer_free(h2_stream->qbuffer);
3098 		h2_stream->qbuffer = NULL;
3099 	}
3100 	if(h2_stream->rbuffer) {
3101 		lock_basic_lock(&http2_response_buffer_count_lock);
3102 		http2_response_buffer_count -=
3103 			sldns_buffer_capacity(h2_stream->rbuffer);
3104 		lock_basic_unlock(&http2_response_buffer_count_lock);
3105 		sldns_buffer_free(h2_stream->rbuffer);
3106 		h2_stream->rbuffer = NULL;
3107 	}
3108 }
3109 
3110 nghttp2_session_callbacks* http2_req_callbacks_create(void)
3111 {
3112 	nghttp2_session_callbacks *callbacks;
3113 	if(nghttp2_session_callbacks_new(&callbacks) == NGHTTP2_ERR_NOMEM) {
3114 		log_err("failed to initialize nghttp2 callback");
3115 		return NULL;
3116 	}
3117 	/* reception of header block started, used to create h2_stream */
3118 	nghttp2_session_callbacks_set_on_begin_headers_callback(callbacks,
3119 		http2_req_begin_headers_cb);
3120 	/* complete frame received, used to get data from stream if frame
3121 	 * has end stream flag, and start processing query */
3122 	nghttp2_session_callbacks_set_on_frame_recv_callback(callbacks,
3123 		http2_req_frame_recv_cb);
3124 	/* get request info from headers */
3125 	nghttp2_session_callbacks_set_on_header_callback(callbacks,
3126 		http2_req_header_cb);
3127 	/* get data from DATA frames, containing POST query */
3128 	nghttp2_session_callbacks_set_on_data_chunk_recv_callback(callbacks,
3129 		http2_req_data_chunk_recv_cb);
3130 
3131 	/* generic HTTP2 callbacks */
3132 	nghttp2_session_callbacks_set_recv_callback(callbacks, http2_recv_cb);
3133 	nghttp2_session_callbacks_set_send_callback(callbacks, http2_send_cb);
3134 	nghttp2_session_callbacks_set_on_stream_close_callback(callbacks,
3135 		http2_stream_close_cb);
3136 
3137 	return callbacks;
3138 }
3139 #endif /* HAVE_NGHTTP2 */
3140