1 /*
2  * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3  *
4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
5  *
6  * This software is open source.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  *
15  * Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  *
19  * Neither the name of the NLNET LABS nor the names of its contributors may
20  * be used to endorse or promote products derived from this software without
21  * specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 /**
37  * \file
38  *
39  * This file has functions to get queries from clients.
40  */
41 #include "config.h"
42 #ifdef HAVE_SYS_TYPES_H
43 #  include <sys/types.h>
44 #endif
45 #include <sys/time.h>
46 #include <limits.h>
47 #ifdef USE_TCP_FASTOPEN
48 #include <netinet/tcp.h>
49 #endif
50 #include <ctype.h>
51 #include "services/listen_dnsport.h"
52 #include "services/outside_network.h"
53 #include "util/netevent.h"
54 #include "util/log.h"
55 #include "util/config_file.h"
56 #include "util/net_help.h"
57 #include "sldns/sbuffer.h"
58 #include "sldns/parseutil.h"
59 #include "services/mesh.h"
60 #include "util/fptr_wlist.h"
61 #include "util/locks.h"
62 
63 #ifdef HAVE_NETDB_H
64 #include <netdb.h>
65 #endif
66 #include <fcntl.h>
67 
68 #ifdef HAVE_SYS_UN_H
69 #include <sys/un.h>
70 #endif
71 
72 #ifdef HAVE_SYSTEMD
73 #include <systemd/sd-daemon.h>
74 #endif
75 
76 #ifdef HAVE_IFADDRS_H
77 #include <ifaddrs.h>
78 #endif
79 #ifdef HAVE_NET_IF_H
80 #include <net/if.h>
81 #endif
82 #ifdef HAVE_LINUX_NET_TSTAMP_H
83 #include <linux/net_tstamp.h>
84 #endif
/** number of queued TCP connections for listen(); used as the backlog
 * argument for the TCP and the local (unix) accept sockets in this file */
#define TCP_BACKLOG 256

#ifndef THREADS_DISABLED
/** lock on the counter of stream buffer memory */
static lock_basic_type stream_wait_count_lock;
/** lock on the counter of HTTP2 query buffer memory */
static lock_basic_type http2_query_buffer_count_lock;
/** lock on the counter of HTTP2 response buffer memory */
static lock_basic_type http2_response_buffer_count_lock;
#endif
/** size (in bytes) of stream wait buffers, starts at zero */
static size_t stream_wait_count = 0;
/** is the lock initialised for stream wait buffers (0 means not yet) */
static int stream_wait_lock_inited = 0;
/** size (in bytes) of HTTP2 query buffers, starts at zero */
static size_t http2_query_buffer_count = 0;
/** is the lock initialised for HTTP2 query buffers (0 means not yet) */
static int http2_query_buffer_lock_inited = 0;
/** size (in bytes) of HTTP2 response buffers, starts at zero */
static size_t http2_response_buffer_count = 0;
/** is the lock initialised for HTTP2 response buffers (0 means not yet) */
static int http2_response_buffer_lock_inited = 0;
108 
109 /**
110  * Debug print of the getaddrinfo returned address.
111  * @param addr: the address returned.
112  */
113 static void
114 verbose_print_addr(struct addrinfo *addr)
115 {
116 	if(verbosity >= VERB_ALGO) {
117 		char buf[100];
118 		void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
119 #ifdef INET6
120 		if(addr->ai_family == AF_INET6)
121 			sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
122 				sin6_addr;
123 #endif /* INET6 */
124 		if(inet_ntop(addr->ai_family, sinaddr, buf,
125 			(socklen_t)sizeof(buf)) == 0) {
126 			(void)strlcpy(buf, "(null)", sizeof(buf));
127 		}
128 		buf[sizeof(buf)-1] = 0;
129 		verbose(VERB_ALGO, "creating %s%s socket %s %d",
130 			addr->ai_socktype==SOCK_DGRAM?"udp":
131 			addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
132 			addr->ai_family==AF_INET?"4":
133 			addr->ai_family==AF_INET6?"6":
134 			"_otherfam", buf,
135 			ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port));
136 	}
137 }
138 
139 void
140 verbose_print_unbound_socket(struct unbound_socket* ub_sock)
141 {
142 	if(verbosity >= VERB_ALGO) {
143 		log_info("listing of unbound_socket structure:");
144 		verbose_print_addr(ub_sock->addr);
145 		log_info("s is: %d, fam is: %s, acl: %s", ub_sock->s,
146 			ub_sock->fam == AF_INET?"AF_INET":"AF_INET6",
147 			ub_sock->acl?"yes":"no");
148 	}
149 }
150 
#ifdef HAVE_SYSTEMD
/**
 * Look for a socket that systemd passed in through socket activation.
 * The first passed descriptor for which sd_is_socket() reports a match on
 * family, socket type and listening state is returned. The addr and path
 * arguments are not used for matching, only for the error message that is
 * logged when no descriptor matches.
 * @param family: address family to match, passed to sd_is_socket().
 * @param socktype: socket type to match, passed to sd_is_socket().
 * @param listen: listening state passed to sd_is_socket(). Use it only for
 *	stream protocols; for UDP it should be -1.
 * @param addr: address used in the error message, or NULL to use path.
 * @param addrlen: length of addr.
 * @param path: path used in the error message when addr is NULL.
 * @return the file descriptor of the matching socket, or -1 when systemd
 *	is not running, the LISTEN_PID or LISTEN_FDS environment variables
 *	are missing, no descriptors were passed, or none of them matches.
 */
static int
systemd_get_activated(int family, int socktype, int listen,
		      struct sockaddr *addr, socklen_t addrlen,
		      const char *path)
{
	int i = 0;
	int r = 0;
	int s = -1;
	const char* listen_pid, *listen_fds;

	/* We should use "listen" option only for stream protocols. For UDP it should be -1 */

	if((r = sd_booted()) < 1) {
		if(r == 0)
			log_warn("systemd is not running");
		else
			log_err("systemd sd_booted(): %s", strerror(-r));
		return -1;
	}

	listen_pid = getenv("LISTEN_PID");
	listen_fds = getenv("LISTEN_FDS");

	if (!listen_pid) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
		return -1;
	}

	if (!listen_fds) {
		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
		return -1;
	}

	if((r = sd_listen_fds(0)) < 1) {
		if(r == 0)
			log_warn("systemd: did not return socket, check unit configuration");
		else
			log_err("systemd sd_listen_fds(): %s", strerror(-r));
		return -1;
	}

	for(i = 0; i < r; i++) {
		int fd = SD_LISTEN_FDS_START + i;
		int match = sd_is_socket(fd, family, socktype, listen);
		if(match < 0) {
			/* sd_is_socket() reports errors as a negative errno
			 * value; that is not a match, so skip this fd */
			log_warn("systemd sd_is_socket(): %s",
				strerror(-match));
			continue;
		}
		if(match > 0) {
			s = fd;
			break;
		}
	}
	if (s == -1) {
		if (addr)
			log_err_addr("systemd sd_listen_fds()",
				     "no such socket",
				     (struct sockaddr_storage *)addr, addrlen);
		else
			/* do not hand a NULL pointer to the %s conversion */
			log_err("systemd sd_listen_fds(): %s",
				path ? path : "(no path)");
	}
	return s;
}
#endif
210 
/**
 * Create a UDP socket, configure it and bind it to the given address.
 * With HAVE_SYSTEMD and use_systemd set, a socket passed in by systemd is
 * tried first; such a socket is configured but not bound again.
 * @param family: address family for socket(), e.g. AF_INET or AF_INET6.
 * @param socktype: socket type for socket().
 * @param addr: the address to bind to.
 * @param addrlen: length of addr.
 * @param v6only: if nonzero, IPV6_V6ONLY is set on an AF_INET6 socket that
 *	was not obtained from systemd; the value 2 sets the option to 0,
 *	any other nonzero value sets it to 1.
 * @param inuse: written on failure paths only. Set to 1 when bind fails
 *	with EADDRINUSE (non-Winsock builds), otherwise 0.
 * @param noproto: written on failure paths only. Set to 1 when socket()
 *	fails with EAFNOSUPPORT or EPROTONOSUPPORT, or (non-Winsock) when
 *	bind of an AF_INET6 socket fails with EINVAL; otherwise 0.
 * @param rcv: requested SO_RCVBUF size, 0 leaves the buffer alone.
 * @param snd: requested SO_SNDBUF size, 0 leaves the buffer alone.
 * @param listen: if nonzero, SO_REUSEADDR, the reuseport option and the
 *	transparent option are applied; it also makes EACCES and
 *	EADDRNOTAVAIL bind errors get logged at low verbosity.
 * @param reuseport: if not NULL and nonzero, SO_REUSEPORT (or
 *	SO_REUSEPORT_LB where defined) is attempted; set to 0 when that
 *	setsockopt fails.
 * @param transparent: if nonzero, IP_TRANSPARENT, IP(V6)_BINDANY or
 *	SO_BINDANY is attempted, whichever the platform defines.
 * @param freebind: if nonzero, IP_FREEBIND is attempted where defined.
 * @param use_systemd: if nonzero (and HAVE_SYSTEMD), try to get the socket
 *	from systemd before creating one.
 * @param dscp: DiffServ codepoint handed to set_ip_dscp(); a failure there
 *	is only logged as a warning.
 * @return the nonblocking socket file descriptor, or -1 on failure.
 */
int
create_udp_sock(int family, int socktype, struct sockaddr* addr,
        socklen_t addrlen, int v6only, int* inuse, int* noproto,
	int rcv, int snd, int listen, int* reuseport, int transparent,
	int freebind, int use_systemd, int dscp)
{
	int s;
	char* err;
#if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
	int on=1;
#endif
#ifdef IPV6_MTU
	int mtu = IPV6_MIN_MTU;
#endif
	/* silence unused-parameter warnings on platforms that lack the
	 * socket options that would use these arguments */
#if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
	(void)rcv;
#endif
#if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
	(void)snd;
#endif
#ifndef IPV6_V6ONLY
	(void)v6only;
#endif
#if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
	(void)transparent;
#endif
#if !defined(IP_FREEBIND)
	(void)freebind;
#endif
#ifdef HAVE_SYSTEMD
	int got_fd_from_systemd = 0;

	/* the block opened here creates a fresh socket; it is entered when
	 * systemd is not used or did not provide a socket, and its else
	 * branch further down records that systemd supplied the fd */
	if (!use_systemd
	    || (use_systemd
		&& (s = systemd_get_activated(family, socktype, -1, addr,
					      addrlen, NULL)) == -1)) {
#else
	(void)use_systemd;
#endif
	if((s = socket(family, socktype, 0)) == -1) {
		*inuse = 0;
#ifndef USE_WINSOCK
		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
			*noproto = 1;
			return -1;
		}
#else
		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
			WSAGetLastError() == WSAEPROTONOSUPPORT) {
			*noproto = 1;
			return -1;
		}
#endif
		log_err("can't create socket: %s", sock_strerror(errno));
		*noproto = 0;
		return -1;
	}
#ifdef HAVE_SYSTEMD
	} else {
		got_fd_from_systemd = 1;
	}
#endif
	/* options that are only applied when the caller set listen */
	if(listen) {
#ifdef SO_REUSEADDR
		if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
			(socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
				sock_strerror(errno));
#ifndef USE_WINSOCK
			/* ENOSYS is tolerated, other errors are fatal */
			if(errno != ENOSYS) {
				close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
#else
			closesocket(s);
			*noproto = 0;
			*inuse = 0;
			return -1;
#endif
		}
#endif /* SO_REUSEADDR */
#ifdef SO_REUSEPORT
#  ifdef SO_REUSEPORT_LB
		/* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
		 * like SO_REUSEPORT on Linux.  This is what the users want
		 * with the config option in unbound.conf; if we actually
		 * need local address and port reuse they'll also need to
		 * have SO_REUSEPORT set for them, assume it was _LB they want.
		 */
		if (reuseport && *reuseport &&
		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
			(socklen_t)sizeof(on)) < 0) {
#ifdef ENOPROTOOPT
			if(errno != ENOPROTOOPT || verbosity >= 3)
				log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
					strerror(errno));
#endif
			/* this option is not essential, we can continue */
			*reuseport = 0;
		}
#  else /* no SO_REUSEPORT_LB */

		/* try to set SO_REUSEPORT so that incoming
		 * queries are distributed evenly among the receiving threads.
		 * Each thread must have its own socket bound to the same port,
		 * with SO_REUSEPORT set on each socket.
		 */
		if (reuseport && *reuseport &&
		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
			(socklen_t)sizeof(on)) < 0) {
#ifdef ENOPROTOOPT
			if(errno != ENOPROTOOPT || verbosity >= 3)
				log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
					strerror(errno));
#endif
			/* this option is not essential, we can continue */
			*reuseport = 0;
		}
#  endif /* SO_REUSEPORT_LB */
#else
		(void)reuseport;
#endif /* defined(SO_REUSEPORT) */
		/* transparent mode: failure is only a warning */
#ifdef IP_TRANSPARENT
		if (transparent &&
		    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
		    (socklen_t)sizeof(on)) < 0) {
			log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
			strerror(errno));
		}
#elif defined(IP_BINDANY)
		if (transparent &&
		    setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
		    (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
		    (void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
			(family==AF_INET6?"V6":""), strerror(errno));
		}
#elif defined(SO_BINDANY)
		if (transparent &&
		    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
		    (socklen_t)sizeof(on)) < 0) {
			log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
			strerror(errno));
		}
#endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
	}
	/* freebind is applied regardless of listen; failure only warns */
#ifdef IP_FREEBIND
	if(freebind &&
	    setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
	    (socklen_t)sizeof(on)) < 0) {
		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
		strerror(errno));
	}
#endif /* IP_FREEBIND */
	if(rcv) {
#ifdef SO_RCVBUF
		int got;
		socklen_t slen = (socklen_t)sizeof(got);
#  ifdef SO_RCVBUFFORCE
		/* Linux specific: try to use root permission to override
		 * system limits on rcvbuf. The limit is stored in
		 * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
		if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
			(socklen_t)sizeof(rcv)) < 0) {
			if(errno != EPERM) {
				log_err("setsockopt(..., SO_RCVBUFFORCE, "
					"...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
#  endif /* SO_RCVBUFFORCE */
			/* where SO_RCVBUFFORCE exists, this fallback is only
			 * reached after it failed with EPERM; elsewhere it
			 * always runs */
			if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
				(socklen_t)sizeof(rcv)) < 0) {
				log_err("setsockopt(..., SO_RCVBUF, "
					"...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
			/* check if we got the right thing or if system
			 * reduced to some system max.  Warn if so */
			if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got,
				&slen) >= 0 && got < rcv/2) {
				log_warn("so-rcvbuf %u was not granted. "
					"Got %u. To fix: start with "
					"root permissions(linux) or sysctl "
					"bigger net.core.rmem_max(linux) or "
					"kern.ipc.maxsockbuf(bsd) values.",
					(unsigned)rcv, (unsigned)got);
			}
#  ifdef SO_RCVBUFFORCE
		}
#  endif
#endif /* SO_RCVBUF */
	}
	/* first do RCVBUF as the receive buffer is more important */
	if(snd) {
#ifdef SO_SNDBUF
		int got;
		socklen_t slen = (socklen_t)sizeof(got);
#  ifdef SO_SNDBUFFORCE
		/* Linux specific: try to use root permission to override
		 * system limits on sndbuf. The limit is stored in
		 * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
		if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
			(socklen_t)sizeof(snd)) < 0) {
			if(errno != EPERM) {
				log_err("setsockopt(..., SO_SNDBUFFORCE, "
					"...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
#  endif /* SO_SNDBUFFORCE */
			/* where SO_SNDBUFFORCE exists, this fallback is only
			 * reached after it failed with EPERM; elsewhere it
			 * always runs */
			if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
				(socklen_t)sizeof(snd)) < 0) {
				log_err("setsockopt(..., SO_SNDBUF, "
					"...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
			/* check if we got the right thing or if system
			 * reduced to some system max.  Warn if so */
			if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got,
				&slen) >= 0 && got < snd/2) {
				log_warn("so-sndbuf %u was not granted. "
					"Got %u. To fix: start with "
					"root permissions(linux) or sysctl "
					"bigger net.core.wmem_max(linux) or "
					"kern.ipc.maxsockbuf(bsd) values.",
					(unsigned)snd, (unsigned)got);
			}
#  ifdef SO_SNDBUFFORCE
		}
#  endif
#endif /* SO_SNDBUF */
	}
	/* a DSCP failure is not fatal, the socket is still usable */
	err = set_ip_dscp(s, family, dscp);
	if(err != NULL)
		log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
	if(family == AF_INET6) {
# if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
		int omit6_set = 0;
		int action;
# endif
# if defined(IPV6_V6ONLY)
		if(v6only
#   ifdef HAVE_SYSTEMD
			/* Systemd wants to control if the socket is v6 only
			 * or both, with BindIPv6Only=default, ipv6-only or
			 * both in systemd.socket, so it is not set here. */
			&& !got_fd_from_systemd
#   endif
			) {
			/* v6only of 2 switches the option off explicitly */
			int val=(v6only==2)?0:1;
			if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
				(void*)&val, (socklen_t)sizeof(val)) < 0) {
				log_err("setsockopt(..., IPV6_V6ONLY"
					", ...) failed: %s", sock_strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
# endif
# if defined(IPV6_USE_MIN_MTU)
		/*
		 * There is no fragmentation of IPv6 datagrams
		 * during forwarding in the network. Therefore
		 * we do not send UDP datagrams larger than
		 * the minimum IPv6 MTU of 1280 octets. The
		 * EDNS0 message length can be larger if the
		 * network stack supports IPV6_USE_MIN_MTU.
		 */
		if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
				"...) failed: %s", sock_strerror(errno));
			sock_close(s);
			*noproto = 0;
			*inuse = 0;
			return -1;
		}
# elif defined(IPV6_MTU)
#   ifndef USE_WINSOCK
		/*
		 * On Linux, to send no larger than 1280, the PMTUD is
		 * disabled by default for datagrams anyway, so we set
		 * the MTU to use.
		 */
		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
				sock_strerror(errno));
			sock_close(s);
			*noproto = 0;
			*inuse = 0;
			return -1;
		}
#   elif defined(IPV6_USER_MTU)
		/* As later versions of the mingw crosscompiler define
		 * IPV6_MTU, do the same for windows but use IPV6_USER_MTU
		 * instead which is writable; IPV6_MTU is readonly there. */
		if (setsockopt(s, IPPROTO_IPV6, IPV6_USER_MTU,
			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
			if (WSAGetLastError() != WSAENOPROTOOPT) {
				log_err("setsockopt(..., IPV6_USER_MTU, ...) failed: %s",
					wsa_strerror(WSAGetLastError()));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
#   endif /* USE_WINSOCK */
# endif /* IPv6 MTU */
# if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
#  if defined(IP_PMTUDISC_OMIT)
		/* try OMIT first; EINVAL means it is not supported and the
		 * DONT setting below is used instead */
		action = IP_PMTUDISC_OMIT;
		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
			&action, (socklen_t)sizeof(action)) < 0) {

			if (errno != EINVAL) {
				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
					strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
		else
		{
		    omit6_set = 1;
		}
#  endif
		if (omit6_set == 0) {
			action = IP_PMTUDISC_DONT;
			if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
				&action, (socklen_t)sizeof(action)) < 0) {
				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
					strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
# endif /* IPV6_MTU_DISCOVER */
	} else if(family == AF_INET) {
#  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
/* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
 * PMTU information is not accepted, but fragmentation is allowed
 * if and only if the packet size exceeds the outgoing interface MTU
 * (and also uses the interface mtu to determine the size of the packets).
 * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
 * FreeBSD already has same semantics without setting the option. */
		int omit_set = 0;
		int action;
#   if defined(IP_PMTUDISC_OMIT)
		/* try OMIT first; EINVAL means it is not supported and the
		 * DONT setting below is used instead */
		action = IP_PMTUDISC_OMIT;
		if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
			&action, (socklen_t)sizeof(action)) < 0) {

			if (errno != EINVAL) {
				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
					strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
		else
		{
		    omit_set = 1;
		}
#   endif
		if (omit_set == 0) {
   			action = IP_PMTUDISC_DONT;
			if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
				&action, (socklen_t)sizeof(action)) < 0) {
				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
					strerror(errno));
				sock_close(s);
				*noproto = 0;
				*inuse = 0;
				return -1;
			}
		}
#  elif defined(IP_DONTFRAG) && !defined(__APPLE__)
		/* the IP_DONTFRAG option is defined in the 11.0 OSX headers,
		 * but does not work on that version, so we exclude it */
		int off = 0;
		if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG,
			&off, (socklen_t)sizeof(off)) < 0) {
			log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
				strerror(errno));
			sock_close(s);
			*noproto = 0;
			*inuse = 0;
			return -1;
		}
#  endif /* IPv4 MTU */
	}
	/* a socket obtained from systemd is not bound again here */
	if(
#ifdef HAVE_SYSTEMD
		!got_fd_from_systemd &&
#endif
		bind(s, (struct sockaddr*)addr, addrlen) != 0) {
		*noproto = 0;
		*inuse = 0;
#ifndef USE_WINSOCK
#ifdef EADDRINUSE
		*inuse = (errno == EADDRINUSE);
		/* detect freebsd jail with no ipv6 permission */
		if(family==AF_INET6 && errno==EINVAL)
			*noproto = 1;
		/* EADDRINUSE is left to the caller to report; EACCES and
		 * EADDRNOTAVAIL are kept quiet at low verbosity when listen
		 * is not set */
		else if(errno != EADDRINUSE &&
			!(errno == EACCES && verbosity < 4 && !listen)
#ifdef EADDRNOTAVAIL
			&& !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
#endif
			) {
			log_err_addr("can't bind socket", strerror(errno),
				(struct sockaddr_storage*)addr, addrlen);
		}
#endif /* EADDRINUSE */
#else /* USE_WINSOCK */
		if(WSAGetLastError() != WSAEADDRINUSE &&
			WSAGetLastError() != WSAEADDRNOTAVAIL &&
			!(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
			log_err_addr("can't bind socket",
				wsa_strerror(WSAGetLastError()),
				(struct sockaddr_storage*)addr, addrlen);
		}
#endif /* USE_WINSOCK */
		sock_close(s);
		return -1;
	}
	if(!fd_set_nonblock(s)) {
		*noproto = 0;
		*inuse = 0;
		sock_close(s);
		return -1;
	}
	return s;
}
668 
/**
 * Create a TCP socket, configure it, bind it and put it in listening mode.
 * With HAVE_SYSTEMD and use_systemd set, a socket passed in by systemd is
 * tried first; such a socket is not bound again, and TCP_NODELAY and
 * TCP_MAXSEG are not applied to it.
 * @param addr: getaddrinfo entry; its family, socket type and address are
 *	used for socket() and bind().
 * @param v6only: if nonzero, IPV6_V6ONLY is switched on for an AF_INET6
 *	socket that was not obtained from systemd.
 * @param noproto: set to 0 on entry. Set to 1 when socket() fails with
 *	EAFNOSUPPORT or EPROTONOSUPPORT, or (non-Winsock) when bind of an
 *	AF_INET6 socket fails with EINVAL.
 * @param reuseport: if not NULL and nonzero, SO_REUSEPORT is attempted;
 *	set to 0 when that setsockopt fails.
 * @param transparent: if nonzero, IP_TRANSPARENT, IP(V6)_BINDANY or
 *	SO_BINDANY is attempted, whichever the platform defines.
 * @param mss: if larger than 0, requested as TCP_MAXSEG.
 * @param nodelay: if nonzero, TCP_NODELAY is requested.
 * @param freebind: if nonzero, IP_FREEBIND is attempted where defined.
 * @param use_systemd: if nonzero (and HAVE_SYSTEMD), try to get the socket
 *	from systemd before creating one.
 * @param dscp: DiffServ codepoint handed to set_ip_dscp(); a failure there
 *	is only logged as a warning.
 * @return the nonblocking, listening socket file descriptor, or -1 on
 *	failure.
 */
int
create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
	int* reuseport, int transparent, int mss, int nodelay, int freebind,
	int use_systemd, int dscp)
{
	int s;
	char* err;
#if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined(SO_BINDANY)
	int on = 1;
#endif
#ifdef HAVE_SYSTEMD
	int got_fd_from_systemd = 0;
#endif
#ifdef USE_TCP_FASTOPEN
	int qlen;
#endif
	/* silence unused-parameter warnings on platforms that lack the
	 * socket options that would use these arguments */
#if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
	(void)transparent;
#endif
#if !defined(IP_FREEBIND)
	(void)freebind;
#endif
	verbose_print_addr(addr);
	*noproto = 0;
#ifdef HAVE_SYSTEMD
	/* the block opened here creates a fresh socket; it is entered when
	 * systemd is not used or did not provide a socket, and its else
	 * branch further down records that systemd supplied the fd */
	if (!use_systemd ||
	    (use_systemd
	     && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
					   addr->ai_addr, addr->ai_addrlen,
					   NULL)) == -1)) {
#else
	(void)use_systemd;
#endif
	if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
#ifndef USE_WINSOCK
		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
			*noproto = 1;
			return -1;
		}
#else
		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
			WSAGetLastError() == WSAEPROTONOSUPPORT) {
			*noproto = 1;
			return -1;
		}
#endif
		log_err("can't create socket: %s", sock_strerror(errno));
		return -1;
	}
	/* TCP_NODELAY failure is logged but does not abort */
	if(nodelay) {
#if defined(IPPROTO_TCP) && defined(TCP_NODELAY)
		if(setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void*)&on,
			(socklen_t)sizeof(on)) < 0) {
			#ifndef USE_WINSOCK
			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
				strerror(errno));
			#else
			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
				wsa_strerror(WSAGetLastError()));
			#endif
		}
#else
		log_warn(" setsockopt(TCP_NODELAY) unsupported");
#endif /* defined(IPPROTO_TCP) && defined(TCP_NODELAY) */
	}
	/* TCP_MAXSEG failure is logged but does not abort */
	if (mss > 0) {
#if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
		if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
			(socklen_t)sizeof(mss)) < 0) {
			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
				sock_strerror(errno));
		} else {
			verbose(VERB_ALGO,
				" tcp socket mss set to %d", mss);
		}
#else
		log_warn(" setsockopt(TCP_MAXSEG) unsupported");
#endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
	}
#ifdef HAVE_SYSTEMD
	} else {
		got_fd_from_systemd = 1;
    }
#endif
	/* unlike the optional settings below, a SO_REUSEADDR failure is
	 * fatal for this socket */
#ifdef SO_REUSEADDR
	if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
		(socklen_t)sizeof(on)) < 0) {
		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
			sock_strerror(errno));
		sock_close(s);
		return -1;
	}
#endif /* SO_REUSEADDR */
#ifdef IP_FREEBIND
	if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
	    (socklen_t)sizeof(on)) < 0) {
		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
		strerror(errno));
	}
#endif /* IP_FREEBIND */
#ifdef SO_REUSEPORT
	/* try to set SO_REUSEPORT so that incoming
	 * connections are distributed evenly among the receiving threads.
	 * Each thread must have its own socket bound to the same port,
	 * with SO_REUSEPORT set on each socket.
	 */
	if (reuseport && *reuseport &&
		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
		(socklen_t)sizeof(on)) < 0) {
#ifdef ENOPROTOOPT
		if(errno != ENOPROTOOPT || verbosity >= 3)
			log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
				strerror(errno));
#endif
		/* this option is not essential, we can continue */
		*reuseport = 0;
	}
#else
	(void)reuseport;
#endif /* defined(SO_REUSEPORT) */
#if defined(IPV6_V6ONLY)
	if(addr->ai_family == AF_INET6 && v6only
#  ifdef HAVE_SYSTEMD
		/* Systemd wants to control if the socket is v6 only
		 * or both, with BindIPv6Only=default, ipv6-only or
		 * both in systemd.socket, so it is not set here. */
		&& !got_fd_from_systemd
#  endif
		) {
		if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
				sock_strerror(errno));
			sock_close(s);
			return -1;
		}
	}
#else
	(void)v6only;
#endif /* IPV6_V6ONLY */
	/* transparent mode: failure is only a warning */
#ifdef IP_TRANSPARENT
	if (transparent &&
	    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
	    (socklen_t)sizeof(on)) < 0) {
		log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
			strerror(errno));
	}
#elif defined(IP_BINDANY)
	if (transparent &&
	    setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
	    (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
	    (void*)&on, (socklen_t)sizeof(on)) < 0) {
		log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
		(addr->ai_family==AF_INET6?"V6":""), strerror(errno));
	}
#elif defined(SO_BINDANY)
	if (transparent &&
	    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
	    sizeof(on)) < 0) {
		log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
		strerror(errno));
	}
#endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
	/* a DSCP failure is not fatal, the socket is still usable */
	err = set_ip_dscp(s, addr->ai_family, dscp);
	if(err != NULL)
		log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
	/* a socket obtained from systemd is not bound again here */
	if(
#ifdef HAVE_SYSTEMD
		!got_fd_from_systemd &&
#endif
        bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
#ifndef USE_WINSOCK
		/* detect freebsd jail with no ipv6 permission */
		if(addr->ai_family==AF_INET6 && errno==EINVAL)
			*noproto = 1;
		else {
			log_err_addr("can't bind socket", strerror(errno),
				(struct sockaddr_storage*)addr->ai_addr,
				addr->ai_addrlen);
		}
#else
		log_err_addr("can't bind socket",
			wsa_strerror(WSAGetLastError()),
			(struct sockaddr_storage*)addr->ai_addr,
			addr->ai_addrlen);
#endif
		sock_close(s);
		return -1;
	}
	if(!fd_set_nonblock(s)) {
		sock_close(s);
		return -1;
	}
	if(listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", sock_strerror(errno));
		sock_close(s);
		return -1;
	}
#ifdef USE_TCP_FASTOPEN
	/* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
	   against IP spoofing attacks as suggested in RFC7413 */
#ifdef __APPLE__
	/* OS X implementation only supports qlen of 1 via this call. Actual
	   value is configured by the net.inet.tcp.fastopen_backlog kernel parm. */
	qlen = 1;
#else
	/* 5 is recommended on linux */
	qlen = 5;
#endif
	/* failure to enable TCP Fast Open is logged, the socket is still
	 * returned */
	if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen,
		  sizeof(qlen))) == -1 ) {
#ifdef ENOPROTOOPT
		/* squelch ENOPROTOOPT: freebsd server mode with kernel support
		   disabled, except when verbosity enabled for debugging */
		if(errno != ENOPROTOOPT || verbosity >= 3) {
#endif
		  if(errno == EPERM) {
		  	log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
		  } else {
		  	log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
		  }
#ifdef ENOPROTOOPT
		}
#endif
	}
#endif
	return s;
}
897 
/**
 * Set the DiffServ codepoint on a socket.
 * For AF_INET6 the IPV6_TCLASS option is used, for every other address
 * family the IP_TOS option.
 * @param socket: the socket file descriptor to configure.
 * @param addrfamily: address family of the socket.
 * @param dscp: the codepoint; 0 leaves the socket untouched.
 * @return NULL on success (or when dscp is 0), otherwise a string that
 *	describes the error.
 */
char*
set_ip_dscp(int socket, int addrfamily, int dscp)
{
	int tos_value;

	/* a codepoint of zero means there is nothing to set */
	if(dscp == 0)
		return NULL;

	/* the codepoint is shifted up two bits before it is handed to
	 * setsockopt */
	tos_value = dscp << 2;

	if(addrfamily == AF_INET6) {
#ifdef IPV6_TCLASS
		if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS,
			(void*)&tos_value, sizeof(tos_value)) < 0)
			return sock_strerror(errno);
		return NULL;
#else
		return "IPV6_TCLASS not defined on this system";
#endif
	}

	if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&tos_value,
		sizeof(tos_value)) < 0)
		return sock_strerror(errno);
	return NULL;
}
923 
/**
 * Create a listening unix-domain (AF_LOCAL) stream socket at a path.
 * With HAVE_SYSTEMD and use_systemd set, a socket passed in by systemd is
 * returned when one matches. Otherwise an existing file at the path is
 * unlinked, and a new nonblocking socket is bound there and put in
 * listening mode.
 * @param path: file system path of the socket. It has to fit in the
 *	sun_path member of struct sockaddr_un, otherwise the call fails.
 * @param noproto: set to 1 when the build has no support for local
 *	sockets (no sys/un.h); not written otherwise.
 * @param use_systemd: if nonzero (and HAVE_SYSTEMD), try to get the socket
 *	from systemd first.
 * @return the socket file descriptor, or -1 on failure.
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYSTEMD
	int ret;
#endif
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#endif

#ifdef HAVE_SYSTEMD
	/* prefer a socket handed over by systemd, continue when none */
	if (use_systemd && (ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM,
		1, NULL, 0, path)) != -1)
		return ret;
#else
	(void)use_systemd;
#endif

#ifdef HAVE_SYS_UN_H
	(void)noproto; /*unused*/

	verbose(VERB_ALGO, "creating unix socket %s", path);
	/* do not pass uninitialised bytes of the structure to bind() */
	memset(&usock, 0, sizeof(usock));
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD.  A path that does not fit would
	 * be bound under a truncated name, while the unlink below would act
	 * on the full name, so refuse it. */
	if (strlcpy(usock.sun_path, path, sizeof(usock.sun_path))
		>= sizeof(usock.sun_path)) {
		log_err("Cannot create local socket %s (path too long)",
			path);
		return -1;
	}

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	return s;

err:
	sock_close(s);
	return -1;
#else
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif
}
998 
999 
1000 /**
1001  * Create socket from getaddrinfo results
1002  */
1003 static int
1004 make_sock(int stype, const char* ifname, const char* port,
1005 	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
1006 	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
1007 	int use_systemd, int dscp, struct unbound_socket* ub_sock)
1008 {
1009 	struct addrinfo *res = NULL;
1010 	int r, s, inuse, noproto;
1011 	hints->ai_socktype = stype;
1012 	*noip6 = 0;
1013 	if((r=getaddrinfo(ifname, port, hints, &res)) != 0 || !res) {
1014 #ifdef USE_WINSOCK
1015 		if(r == EAI_NONAME && hints->ai_family == AF_INET6){
1016 			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
1017 			return -1;
1018 		}
1019 #endif
1020 		log_err("node %s:%s getaddrinfo: %s %s",
1021 			ifname?ifname:"default", port, gai_strerror(r),
1022 #ifdef EAI_SYSTEM
1023 			(r==EAI_SYSTEM?(char*)strerror(errno):"")
1024 #else
1025 			""
1026 #endif
1027 		);
1028 		return -1;
1029 	}
1030 	if(stype == SOCK_DGRAM) {
1031 		verbose_print_addr(res);
1032 		s = create_udp_sock(res->ai_family, res->ai_socktype,
1033 			(struct sockaddr*)res->ai_addr, res->ai_addrlen,
1034 			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
1035 			reuseport, transparent, freebind, use_systemd, dscp);
1036 		if(s == -1 && inuse) {
1037 			log_err("bind: address already in use");
1038 		} else if(s == -1 && noproto && hints->ai_family == AF_INET6){
1039 			*noip6 = 1;
1040 		}
1041 	} else	{
1042 		s = create_tcp_accept_sock(res, v6only, &noproto, reuseport,
1043 			transparent, tcp_mss, nodelay, freebind, use_systemd,
1044 			dscp);
1045 		if(s == -1 && noproto && hints->ai_family == AF_INET6){
1046 			*noip6 = 1;
1047 		}
1048 	}
1049 
1050 	ub_sock->addr = res;
1051 	ub_sock->s = s;
1052 	ub_sock->fam = hints->ai_family;
1053 	ub_sock->acl = NULL;
1054 
1055 	return s;
1056 }
1057 
/**
 * Make a socket; an "address@port" interface spec overrides the port.
 * Without an '@' in ifname this is a plain make_sock call. With one, the
 * part before the first '@' is the address and the part after it the port.
 * Fails with -1 (and *noip6 = 0) if the address part does not fit in 128
 * bytes or the port part does not fit in 16 bytes, terminator included.
 */
static int
make_sock_port(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock)
{
	char portpart[16];
	char addrpart[128];
	size_t addrlen, portlen;
	const char* at = strchr(ifname, '@');

	if(!at) {
		/* no override present */
		return make_sock(stype, ifname, port, hints, v6only, noip6,
			rcv, snd, reuseport, transparent, tcp_mss, nodelay,
			freebind, use_systemd, dscp, ub_sock);
	}

	/* override port with ifspec@port */
	addrlen = (size_t)(at - ifname);
	if(addrlen >= sizeof(addrpart)) {
		log_err("ifname too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	portlen = strlen(at+1);
	if(portlen >= sizeof(portpart)) {
		log_err("portnumber too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	/* both lengths were checked above, so these copies fit */
	memcpy(addrpart, ifname, addrlen);
	addrpart[addrlen] = 0;
	memcpy(portpart, at+1, portlen+1);

	return make_sock(stype, addrpart, portpart, hints, v6only, noip6,
		rcv, snd, reuseport, transparent, tcp_mss, nodelay, freebind,
		use_systemd, dscp, ub_sock);
}
1092 
1093 /**
1094  * Add port to open ports list.
1095  * @param list: list head. changed.
1096  * @param s: fd.
1097  * @param ftype: if fd is UDP.
1098  * @param pp2_enabled: if PROXYv2 is enabled for this port.
1099  * @param ub_sock: socket with address.
1100  * @return false on failure. list in unchanged then.
1101  */
1102 static int
1103 port_insert(struct listen_port** list, int s, enum listen_type ftype,
1104 	int pp2_enabled, struct unbound_socket* ub_sock)
1105 {
1106 	struct listen_port* item = (struct listen_port*)malloc(
1107 		sizeof(struct listen_port));
1108 	if(!item)
1109 		return 0;
1110 	item->next = *list;
1111 	item->fd = s;
1112 	item->ftype = ftype;
1113 	item->pp2_enabled = pp2_enabled;
1114 	item->socket = ub_sock;
1115 	*list = item;
1116 	return 1;
1117 }
1118 
/**
 * Enable receive timestamps on fd s via the SO_TIMESTAMPNS socket option.
 * Returns true on success; false (with a logged error) if the option could
 * not be set or the build has no HAVE_LINUX_NET_TSTAMP_H.
 */
static int
set_recvtimestamp(int s)
{
#ifdef HAVE_LINUX_NET_TSTAMP_H
	int tsflags = SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE;
	if (setsockopt(s, SOL_SOCKET, SO_TIMESTAMPNS, (void*)&tsflags,
		(socklen_t)sizeof(tsflags)) >= 0)
		return 1;
	log_err("setsockopt(..., SO_TIMESTAMPNS, ...) failed: %s",
		strerror(errno));
	return 0;
#else
	(void)s;
	log_err("packets timestamping is not supported on this platform");
	return 0;
#endif
}
1137 
/**
 * Set fd to receive source address packet info.
 *
 * Which socket option is used is picked at compile time:
 *  - AF_INET6: IPV6_RECVPKTINFO if defined, else IPV6_PKTINFO.
 *  - AF_INET: IP_PKTINFO if defined, else IP_RECVDSTADDR (only when
 *    IP_SENDSRCADDR is defined as well).
 * If no suitable option exists for the requested family an error is logged
 * and the call fails. Any other family leaves the socket untouched and
 * succeeds.
 *
 * @param s: the socket file descriptor.
 * @param family: AF_INET or AF_INET6.
 * @return true on success, false on failure (an error has been logged).
 */
static int
set_recvpktinfo(int s, int family)
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int on = 1;
#else
	/* no option is available at all, so s is never used below */
	(void)s;
#endif
	if(family == AF_INET6) {
#           ifdef IPV6_RECVPKTINFO
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           else
		log_err("no IPV6_RECVPKTINFO and IPV6_PKTINFO options, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#           endif /* defined IPV6_RECVPKTINFO */

	} else if(family == AF_INET) {
#           ifdef IP_PKTINFO
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#           endif /* IP_PKTINFO */

	}
	return 1;
}
1192 
1193 /** see if interface is ssl, its port number == the ssl port number */
1194 static int
1195 if_is_ssl(const char* ifname, const char* port, int ssl_port,
1196 	struct config_strlist* tls_additional_port)
1197 {
1198 	struct config_strlist* s;
1199 	char* p = strchr(ifname, '@');
1200 	if(!p && atoi(port) == ssl_port)
1201 		return 1;
1202 	if(p && atoi(p+1) == ssl_port)
1203 		return 1;
1204 	for(s = tls_additional_port; s; s = s->next) {
1205 		if(p && atoi(p+1) == atoi(s->str))
1206 			return 1;
1207 		if(!p && atoi(port) == atoi(s->str))
1208 			return 1;
1209 	}
1210 	return 0;
1211 }
1212 
1213 /**
1214  * Helper for ports_open. Creates one interface (or NULL for default).
1215  * @param ifname: The interface ip address.
1216  * @param do_auto: use automatic interface detection.
1217  * 	If enabled, then ifname must be the wildcard name.
1218  * @param do_udp: if udp should be used.
1219  * @param do_tcp: if tcp should be used.
1220  * @param hints: for getaddrinfo. family and flags have to be set by caller.
1221  * @param port: Port number to use (as string).
1222  * @param list: list of open ports, appended to, changed to point to list head.
1223  * @param rcv: receive buffer size for UDP
1224  * @param snd: send buffer size for UDP
1225  * @param ssl_port: ssl service port number
1226  * @param tls_additional_port: list of additional ssl service port numbers.
1227  * @param https_port: DoH service port number
1228  * @param proxy_protocol_port: list of PROXYv2 port numbers.
1229  * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1230  * 	set to false on exit if reuseport failed due to no kernel support.
1231  * @param transparent: set IP_TRANSPARENT socket option.
1232  * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1233  * @param freebind: set IP_FREEBIND socket option.
1234  * @param http2_nodelay: set TCP_NODELAY on HTTP/2 connection
1235  * @param use_systemd: if true, fetch sockets from systemd.
1236  * @param dnscrypt_port: dnscrypt service port number
1237  * @param dscp: DSCP to use.
1238  * @param sock_queue_timeout: the sock_queue_timeout from config. Seconds to
1239  * 	wait to discard if UDP packets have waited for long in the socket
1240  * 	buffer.
1241  * @return: returns false on error.
1242  */
1243 static int
1244 ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp,
1245 	struct addrinfo *hints, const char* port, struct listen_port** list,
1246 	size_t rcv, size_t snd, int ssl_port,
1247 	struct config_strlist* tls_additional_port, int https_port,
1248 	struct config_strlist* proxy_protocol_port,
1249 	int* reuseport, int transparent, int tcp_mss, int freebind,
1250 	int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp,
1251 	int sock_queue_timeout)
1252 {
1253 	int s, noip6=0;
1254 	int is_https = if_is_https(ifname, port, https_port);
1255 	int is_dnscrypt = if_is_dnscrypt(ifname, port, dnscrypt_port);
1256 	int is_pp2 = if_is_pp2(ifname, port, proxy_protocol_port);
1257 	int nodelay = is_https && http2_nodelay;
1258 	struct unbound_socket* ub_sock;
1259 
1260 	if(!do_udp && !do_tcp)
1261 		return 0;
1262 
1263 	if(is_pp2) {
1264 		if(is_dnscrypt) {
1265 			fatal_exit("PROXYv2 and DNSCrypt combination not "
1266 				"supported!");
1267 		} else if(is_https) {
1268 			fatal_exit("PROXYv2 and DoH combination not "
1269 				"supported!");
1270 		}
1271 	}
1272 
1273 	if(do_auto) {
1274 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1275 		if(!ub_sock)
1276 			return 0;
1277 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1278 			&noip6, rcv, snd, reuseport, transparent,
1279 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1280 			if(ub_sock->addr)
1281 				freeaddrinfo(ub_sock->addr);
1282 			free(ub_sock);
1283 			if(noip6) {
1284 				log_warn("IPv6 protocol not available");
1285 				return 1;
1286 			}
1287 			return 0;
1288 		}
1289 		/* getting source addr packet info is highly non-portable */
1290 		if(!set_recvpktinfo(s, hints->ai_family)) {
1291 			sock_close(s);
1292 			if(ub_sock->addr)
1293 				freeaddrinfo(ub_sock->addr);
1294 			free(ub_sock);
1295 			return 0;
1296 		}
1297 		if (sock_queue_timeout && !set_recvtimestamp(s)) {
1298 			log_warn("socket timestamping is not available");
1299 		}
1300 		if(!port_insert(list, s, is_dnscrypt
1301 			?listen_type_udpancil_dnscrypt:listen_type_udpancil,
1302 			is_pp2, ub_sock)) {
1303 			sock_close(s);
1304 			if(ub_sock->addr)
1305 				freeaddrinfo(ub_sock->addr);
1306 			free(ub_sock);
1307 			return 0;
1308 		}
1309 	} else if(do_udp) {
1310 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1311 		if(!ub_sock)
1312 			return 0;
1313 		/* regular udp socket */
1314 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1315 			&noip6, rcv, snd, reuseport, transparent,
1316 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1317 			if(ub_sock->addr)
1318 				freeaddrinfo(ub_sock->addr);
1319 			free(ub_sock);
1320 			if(noip6) {
1321 				log_warn("IPv6 protocol not available");
1322 				return 1;
1323 			}
1324 			return 0;
1325 		}
1326 		if (sock_queue_timeout && !set_recvtimestamp(s)) {
1327 			log_warn("socket timestamping is not available");
1328 		}
1329 		if(!port_insert(list, s, is_dnscrypt
1330 			?listen_type_udp_dnscrypt :
1331 			(sock_queue_timeout ?
1332 				listen_type_udpancil:listen_type_udp),
1333 			is_pp2, ub_sock)) {
1334 			sock_close(s);
1335 			if(ub_sock->addr)
1336 				freeaddrinfo(ub_sock->addr);
1337 			free(ub_sock);
1338 			return 0;
1339 		}
1340 	}
1341 	if(do_tcp) {
1342 		int is_ssl = if_is_ssl(ifname, port, ssl_port,
1343 			tls_additional_port);
1344 		enum listen_type port_type;
1345 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1346 		if(!ub_sock)
1347 			return 0;
1348 		if(is_ssl)
1349 			port_type = listen_type_ssl;
1350 		else if(is_https)
1351 			port_type = listen_type_http;
1352 		else if(is_dnscrypt)
1353 			port_type = listen_type_tcp_dnscrypt;
1354 		else
1355 			port_type = listen_type_tcp;
1356 		if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1,
1357 			&noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay,
1358 			freebind, use_systemd, dscp, ub_sock)) == -1) {
1359 			if(ub_sock->addr)
1360 				freeaddrinfo(ub_sock->addr);
1361 			free(ub_sock);
1362 			if(noip6) {
1363 				/*log_warn("IPv6 protocol not available");*/
1364 				return 1;
1365 			}
1366 			return 0;
1367 		}
1368 		if(is_ssl)
1369 			verbose(VERB_ALGO, "setup TCP for SSL service");
1370 		if(!port_insert(list, s, port_type, is_pp2, ub_sock)) {
1371 			sock_close(s);
1372 			if(ub_sock->addr)
1373 				freeaddrinfo(ub_sock->addr);
1374 			free(ub_sock);
1375 			return 0;
1376 		}
1377 	}
1378 	return 1;
1379 }
1380 
1381 /**
1382  * Add items to commpoint list in front.
1383  * @param c: commpoint to add.
1384  * @param front: listen struct.
1385  * @return: false on failure.
1386  */
1387 static int
1388 listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1389 {
1390 	struct listen_list* item = (struct listen_list*)malloc(
1391 		sizeof(struct listen_list));
1392 	if(!item)
1393 		return 0;
1394 	item->com = c;
1395 	item->next = front->cps;
1396 	front->cps = item;
1397 	return 1;
1398 }
1399 
1400 void listen_setup_locks(void)
1401 {
1402 	if(!stream_wait_lock_inited) {
1403 		lock_basic_init(&stream_wait_count_lock);
1404 		stream_wait_lock_inited = 1;
1405 	}
1406 	if(!http2_query_buffer_lock_inited) {
1407 		lock_basic_init(&http2_query_buffer_count_lock);
1408 		http2_query_buffer_lock_inited = 1;
1409 	}
1410 	if(!http2_response_buffer_lock_inited) {
1411 		lock_basic_init(&http2_response_buffer_count_lock);
1412 		http2_response_buffer_lock_inited = 1;
1413 	}
1414 }
1415 
1416 void listen_desetup_locks(void)
1417 {
1418 	if(stream_wait_lock_inited) {
1419 		stream_wait_lock_inited = 0;
1420 		lock_basic_destroy(&stream_wait_count_lock);
1421 	}
1422 	if(http2_query_buffer_lock_inited) {
1423 		http2_query_buffer_lock_inited = 0;
1424 		lock_basic_destroy(&http2_query_buffer_count_lock);
1425 	}
1426 	if(http2_response_buffer_lock_inited) {
1427 		http2_response_buffer_lock_inited = 0;
1428 		lock_basic_destroy(&http2_response_buffer_count_lock);
1429 	}
1430 }
1431 
/**
 * Create the listening front end: one shared udp buffer of bufsize bytes and
 * one comm point for every entry in the ports list.
 *
 * The comm point constructor is picked from the port's ftype: plain UDP,
 * TCP (also for dnscrypt TCP), TCP with the http settings (for ssl and
 * http ports) or UDP with ancillary data. sslctx is only attached to ssl
 * ports, and to http ports unless http_notls is set.
 *
 * @return the new structure, or NULL on failure: allocation failure, a comm
 *	point that could not be created, or no comm points at all. What was
 *	built so far is deleted with listen_delete.
 */
struct listen_dnsport*
listen_create(struct comm_base* base, struct listen_port* ports,
	size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
	int harden_large_queries, uint32_t http_max_streams,
	char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit,
	void* sslctx, struct dt_env* dtenv, comm_point_callback_type* cb,
	void *cb_arg)
{
	struct listen_dnsport* front = (struct listen_dnsport*)
		malloc(sizeof(struct listen_dnsport));
	if(!front)
		return NULL;
	front->cps = NULL;
	front->udp_buff = sldns_buffer_new(bufsize);
#ifdef USE_DNSCRYPT
	/* stays NULL unless a dnscrypt port is set up below */
	front->dnscrypt_udp_buff = NULL;
#endif
	if(!front->udp_buff) {
		free(front);
		return NULL;
	}

	/* create comm points as needed */
	while(ports) {
		struct comm_point* cp = NULL;
		if(ports->ftype == listen_type_udp ||
		   ports->ftype == listen_type_udp_dnscrypt) {
			cp = comm_point_create_udp(base, ports->fd,
				front->udp_buff, ports->pp2_enabled, cb,
				cb_arg, ports->socket);
		} else if(ports->ftype == listen_type_tcp ||
				ports->ftype == listen_type_tcp_dnscrypt) {
			/* plain tcp: no http stream limit or endpoint */
			cp = comm_point_create_tcp(base, ports->fd,
				tcp_accept_count, tcp_idle_timeout,
				harden_large_queries, 0, NULL,
				tcp_conn_limit, bufsize, front->udp_buff,
				ports->ftype, ports->pp2_enabled, cb, cb_arg,
				ports->socket);
		} else if(ports->ftype == listen_type_ssl ||
			ports->ftype == listen_type_http) {
			cp = comm_point_create_tcp(base, ports->fd,
				tcp_accept_count, tcp_idle_timeout,
				harden_large_queries,
				http_max_streams, http_endpoint,
				tcp_conn_limit, bufsize, front->udp_buff,
				ports->ftype, ports->pp2_enabled, cb, cb_arg,
				ports->socket);
			/* the warnings below do not stop the setup */
			if(ports->ftype == listen_type_http) {
				if(!sslctx && !http_notls) {
					log_warn("HTTPS port configured, but "
						"no TLS tls-service-key or "
						"tls-service-pem set");
				}
#ifndef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
				if(!http_notls) {
					log_warn("Unbound is not compiled "
						"with an OpenSSL version "
						"supporting ALPN "
						"(OpenSSL >= 1.0.2). This "
						"is required to use "
						"DNS-over-HTTPS");
				}
#endif
#ifndef HAVE_NGHTTP2_NGHTTP2_H
				log_warn("Unbound is not compiled with "
					"nghttp2. This is required to use "
					"DNS-over-HTTPS.");
#endif
			}
		} else if(ports->ftype == listen_type_udpancil ||
				  ports->ftype == listen_type_udpancil_dnscrypt) {
#if defined(AF_INET6) && defined(IPV6_PKTINFO) && defined(HAVE_RECVMSG)
			cp = comm_point_create_udp_ancil(base, ports->fd,
				front->udp_buff, ports->pp2_enabled, cb,
				cb_arg, ports->socket);
#else
			/* cp stays NULL, so the check below fails the setup */
			log_warn("This system does not support UDP ancilliary data.");
#endif
		}
		if(!cp) {
			log_err("can't create commpoint");
			listen_delete(front);
			return NULL;
		}
		/* only ssl ports, and http ports without http_notls, get
		 * the ssl context */
		if((http_notls && ports->ftype == listen_type_http) ||
			(ports->ftype == listen_type_tcp) ||
			(ports->ftype == listen_type_udp) ||
			(ports->ftype == listen_type_udpancil) ||
			(ports->ftype == listen_type_tcp_dnscrypt) ||
			(ports->ftype == listen_type_udp_dnscrypt) ||
			(ports->ftype == listen_type_udpancil_dnscrypt))
			cp->ssl = NULL;
		else
			cp->ssl = sslctx;
		cp->dtenv = dtenv;
		/* NOTE(review): presumably the fd stays owned by the ports
		 * list and must not be closed with the comm point - confirm */
		cp->do_not_close = 1;
#ifdef USE_DNSCRYPT
		if (ports->ftype == listen_type_udp_dnscrypt ||
			ports->ftype == listen_type_tcp_dnscrypt ||
			ports->ftype == listen_type_udpancil_dnscrypt) {
			cp->dnscrypt = 1;
			cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
			if(!cp->dnscrypt_buffer) {
				log_err("can't alloc dnscrypt_buffer");
				comm_point_delete(cp);
				listen_delete(front);
				return NULL;
			}
			/* front refers to the buffer of the dnscrypt comm
			 * point that was set up last */
			front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
		}
#endif
		if(!listen_cp_insert(cp, front)) {
			log_err("malloc failed");
			comm_point_delete(cp);
			listen_delete(front);
			return NULL;
		}
		ports = ports->next;
	}
	/* an empty ports list is an error as well */
	if(!front->cps) {
		log_err("Could not open sockets to accept queries.");
		listen_delete(front);
		return NULL;
	}

	return front;
}
1559 
1560 void
1561 listen_list_delete(struct listen_list* list)
1562 {
1563 	struct listen_list *p = list, *pn;
1564 	while(p) {
1565 		pn = p->next;
1566 		comm_point_delete(p->com);
1567 		free(p);
1568 		p = pn;
1569 	}
1570 }
1571 
1572 void
1573 listen_delete(struct listen_dnsport* front)
1574 {
1575 	if(!front)
1576 		return;
1577 	listen_list_delete(front->cps);
1578 #ifdef USE_DNSCRYPT
1579 	if(front->dnscrypt_udp_buff &&
1580 		front->udp_buff != front->dnscrypt_udp_buff) {
1581 		sldns_buffer_free(front->dnscrypt_udp_buff);
1582 	}
1583 #endif
1584 	sldns_buffer_free(front->udp_buff);
1585 	free(front);
1586 }
1587 
#ifdef HAVE_GETIFADDRS
/** append a copy of str to the string array, false if out of memory */
static int
resolve_ifa_append(char ***ip_addresses, int *ip_addresses_size,
	const char *str)
{
	void *grown = realloc(*ip_addresses,
		sizeof(char *) * (*ip_addresses_size + 1));
	if(!grown) {
		log_err("realloc failed: out of memory");
		return 0;
	}
	*ip_addresses = grown;
	(*ip_addresses)[*ip_addresses_size] = strdup(str);
	if(!(*ip_addresses)[*ip_addresses_size]) {
		log_err("strdup failed: out of memory");
		return 0;
	}
	(*ip_addresses_size)++;
	return 1;
}

/**
 * Turn an interface name (with optional "@port") into its addresses.
 * For every IPv4 (and, with INET6, IPv6) address of the interfaces in ifas
 * whose name equals the part of search_ifa before its last '@', a string
 * "address[%scope]" plus the "@port" part is appended to the array.
 * If nothing was appended, a copy of search_ifa itself is appended.
 * @return false on failure (inet_ntop or allocation); entries appended
 *	before the failure stay in the array.
 */
static int
resolve_ifa_name(struct ifaddrs *ifas, const char *search_ifa, char ***ip_addresses, int *ip_addresses_size)
{
	struct ifaddrs *ifa;
	int size_at_start = *ip_addresses_size;
	/* split off the "@port" part once, it is the same for every entry */
	const char *at = strrchr(search_ifa, '@');
	const char *suffix = at ? at : "";
	size_t namelen = at ? (size_t)(at - search_ifa) : strlen(search_ifa);

	for(ifa = ifas; ifa != NULL; ifa = ifa->ifa_next) {
		sa_family_t family;
#ifdef INET6      /* |   address ip    | % |  ifa name  | @ |  port  | nul */
		char addr_buf[INET6_ADDRSTRLEN + 1 + IF_NAMESIZE + 1 + 16 + 1];
#else
		char addr_buf[INET_ADDRSTRLEN + 1 + 16 + 1];
#endif

		/* the interface name has to match exactly */
		if(strlen(ifa->ifa_name) != namelen ||
			strncmp(ifa->ifa_name, search_ifa, namelen) != 0)
			continue;

		if(ifa->ifa_addr == NULL)
			continue;

		family = ifa->ifa_addr->sa_family;
		if(family == AF_INET) {
			struct sockaddr_in *in4 = (struct sockaddr_in *)
				ifa->ifa_addr;
			char a4[INET_ADDRSTRLEN + 1];
			if(!inet_ntop(family, &in4->sin_addr, a4, sizeof(a4))) {
				log_err("inet_ntop failed");
				return 0;
			}
			snprintf(addr_buf, sizeof(addr_buf), "%s%s",
				a4, suffix);
		}
#ifdef INET6
		else if(family == AF_INET6) {
			struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)
				ifa->ifa_addr;
			char a6[INET6_ADDRSTRLEN + 1];
			char if_index_name[IF_NAMESIZE + 1];
			if_index_name[0] = 0;
			if(!inet_ntop(family, &in6->sin6_addr, a6, sizeof(a6))) {
				log_err("inet_ntop failed");
				return 0;
			}
			/* the scope name is only added if one was found */
			(void)if_indextoname(in6->sin6_scope_id,
				(char *)if_index_name);
			if(if_index_name[0] != 0) {
				snprintf(addr_buf, sizeof(addr_buf),
					"%s%%%s%s", a6, if_index_name, suffix);
			} else {
				snprintf(addr_buf, sizeof(addr_buf), "%s%s",
					a6, suffix);
			}
		}
#endif
		else {
			continue;
		}
		verbose(4, "interface %s has address %s", search_ifa, addr_buf);

		if(!resolve_ifa_append(ip_addresses, ip_addresses_size,
			addr_buf))
			return 0;
	}

	/* nothing found: pass the original string through unchanged */
	if(*ip_addresses_size == size_at_start) {
		if(!resolve_ifa_append(ip_addresses, ip_addresses_size,
			search_ifa))
			return 0;
	}
	return 1;
}
#endif /* HAVE_GETIFADDRS */
1691 
1692 int resolve_interface_names(char** ifs, int num_ifs,
1693 	struct config_strlist* list, char*** resif, int* num_resif)
1694 {
1695 #ifdef HAVE_GETIFADDRS
1696 	struct ifaddrs *addrs = NULL;
1697 	if(num_ifs == 0 && list == NULL) {
1698 		*resif = NULL;
1699 		*num_resif = 0;
1700 		return 1;
1701 	}
1702 	if(getifaddrs(&addrs) == -1) {
1703 		log_err("failed to list interfaces: getifaddrs: %s",
1704 			strerror(errno));
1705 		freeifaddrs(addrs);
1706 		return 0;
1707 	}
1708 	if(ifs) {
1709 		int i;
1710 		for(i=0; i<num_ifs; i++) {
1711 			if(!resolve_ifa_name(addrs, ifs[i], resif, num_resif)) {
1712 				freeifaddrs(addrs);
1713 				config_del_strarray(*resif, *num_resif);
1714 				*resif = NULL;
1715 				*num_resif = 0;
1716 				return 0;
1717 			}
1718 		}
1719 	}
1720 	if(list) {
1721 		struct config_strlist* p;
1722 		for(p = list; p; p = p->next) {
1723 			if(!resolve_ifa_name(addrs, p->str, resif, num_resif)) {
1724 				freeifaddrs(addrs);
1725 				config_del_strarray(*resif, *num_resif);
1726 				*resif = NULL;
1727 				*num_resif = 0;
1728 				return 0;
1729 			}
1730 }
1731 	}
1732 	freeifaddrs(addrs);
1733 	return 1;
1734 #else
1735 	struct config_strlist* p;
1736 	if(num_ifs == 0 && list == NULL) {
1737 		*resif = NULL;
1738 		*num_resif = 0;
1739 		return 1;
1740 	}
1741 	*num_resif = num_ifs;
1742 	for(p = list; p; p = p->next) {
1743 		(*num_resif)++;
1744 	}
1745 	*resif = calloc(*num_resif, sizeof(**resif));
1746 	if(!*resif) {
1747 		log_err("out of memory");
1748 		return 0;
1749 	}
1750 	if(ifs) {
1751 		int i;
1752 		for(i=0; i<num_ifs; i++) {
1753 			(*resif)[i] = strdup(ifs[i]);
1754 			if(!((*resif)[i])) {
1755 				log_err("out of memory");
1756 				config_del_strarray(*resif, *num_resif);
1757 				*resif = NULL;
1758 				*num_resif = 0;
1759 				return 0;
1760 			}
1761 		}
1762 	}
1763 	if(list) {
1764 		int idx = num_ifs;
1765 		for(p = list; p; p = p->next) {
1766 			(*resif)[idx] = strdup(p->str);
1767 			if(!((*resif)[idx])) {
1768 				log_err("out of memory");
1769 				config_del_strarray(*resif, *num_resif);
1770 				*resif = NULL;
1771 				*num_resif = 0;
1772 				return 0;
1773 			}
1774 			idx++;
1775 		}
1776 	}
1777 	return 1;
1778 #endif /* HAVE_GETIFADDRS */
1779 }
1780 
/**
 * Open the listening sockets described by the config and return them as a
 * list.
 *
 * Without explicit interfaces (num_ifs == 0), or with interface-automatic
 * together with do_udp, wildcard or loopback addresses are opened:
 *  - interface-automatic with a nonempty if_automatic_ports string: the
 *    wildcard addresses "::0" and "0.0.0.0" are opened on every port number
 *    in that whitespace separated string, and the list is returned at once.
 *  - otherwise cfg->port is opened on the wildcard addresses (automatic) or
 *    on "::1" and "127.0.0.1" (not automatic).
 * Else every entry of ifs is opened, as IPv6 if str_is_ip6 says so and as
 * IPv4 otherwise; entries of a disabled address family are skipped.
 *
 * @param cfg: config with the port numbers and socket options.
 * @param ifs: interface strings (may hold "@port" overrides, which are
 *	handled in ports_create_if).
 * @param num_ifs: number of entries in ifs.
 * @param reuseport: passed on to ports_create_if.
 * @return the list of open ports, or NULL on error (what was opened so far
 *	is freed with listening_ports_free) and when both do_ip4 and do_ip6
 *	are off.
 */
struct listen_port*
listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs,
	int* reuseport)
{
	struct listen_port* list = NULL;
	struct addrinfo hints;
	int i, do_ip4, do_ip6;
	int do_tcp, do_auto;
	char portbuf[32];
	snprintf(portbuf, sizeof(portbuf), "%d", cfg->port);
	do_ip4 = cfg->do_ip4;
	do_ip6 = cfg->do_ip6;
	do_tcp = cfg->do_tcp;
	/* interface-automatic only has effect together with udp */
	do_auto = cfg->if_automatic && cfg->do_udp;
	/* no incoming tcp handlers means no tcp sockets either */
	if(cfg->incoming_num_tcp == 0)
		do_tcp = 0;

	/* getaddrinfo */
	memset(&hints, 0, sizeof(hints));
	hints.ai_flags = AI_PASSIVE;
	/* no name lookups on our listening ports */
	if(num_ifs > 0)
		hints.ai_flags |= AI_NUMERICHOST;
	hints.ai_family = AF_UNSPEC;
#ifndef INET6
	do_ip6 = 0;
#endif
	if(!do_ip4 && !do_ip6) {
		return NULL;
	}
	/* create ip4 and ip6 ports so that return addresses are nice. */
	if(do_auto || num_ifs == 0) {
		if(do_auto && cfg->if_automatic_ports &&
			cfg->if_automatic_ports[0]!=0) {
			/* walk the whitespace separated port numbers */
			char* now = cfg->if_automatic_ports;
			while(now && *now) {
				char* after;
				int extraport;
				while(isspace((unsigned char)*now))
					now++;
				if(!*now)
					break;
				after = now;
				extraport = (int)strtol(now, &after, 10);
				if(extraport < 0 || extraport > 65535) {
					log_err("interface-automatic-ports port number out of range, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
					listening_ports_free(list);
					return NULL;
				}
				/* strtol consumed nothing: not a number */
				if(extraport == 0 && now == after) {
					log_err("interface-automatic-ports could not be parsed, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
					listening_ports_free(list);
					return NULL;
				}
				now = after;
				/* portbuf now holds this extra port */
				snprintf(portbuf, sizeof(portbuf), "%d", extraport);
				if(do_ip6) {
					hints.ai_family = AF_INET6;
					if(!ports_create_if("::0",
						do_auto, cfg->do_udp, do_tcp,
						&hints, portbuf, &list,
						cfg->so_rcvbuf, cfg->so_sndbuf,
						cfg->ssl_port, cfg->tls_additional_port,
						cfg->https_port,
						cfg->proxy_protocol_port,
						reuseport, cfg->ip_transparent,
						cfg->tcp_mss, cfg->ip_freebind,
						cfg->http_nodelay, cfg->use_systemd,
						cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
						listening_ports_free(list);
						return NULL;
					}
				}
				if(do_ip4) {
					hints.ai_family = AF_INET;
					if(!ports_create_if("0.0.0.0",
						do_auto, cfg->do_udp, do_tcp,
						&hints, portbuf, &list,
						cfg->so_rcvbuf, cfg->so_sndbuf,
						cfg->ssl_port, cfg->tls_additional_port,
						cfg->https_port,
						cfg->proxy_protocol_port,
						reuseport, cfg->ip_transparent,
						cfg->tcp_mss, cfg->ip_freebind,
						cfg->http_nodelay, cfg->use_systemd,
						cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
						listening_ports_free(list);
						return NULL;
					}
				}
			}
			/* cfg->port itself is not opened in this mode */
			return list;
		}
		/* wildcard when automatic, loopback when no interfaces */
		if(do_ip6) {
			hints.ai_family = AF_INET6;
			if(!ports_create_if(do_auto?"::0":"::1",
				do_auto, cfg->do_udp, do_tcp,
				&hints, portbuf, &list,
				cfg->so_rcvbuf, cfg->so_sndbuf,
				cfg->ssl_port, cfg->tls_additional_port,
				cfg->https_port, cfg->proxy_protocol_port,
				reuseport, cfg->ip_transparent,
				cfg->tcp_mss, cfg->ip_freebind,
				cfg->http_nodelay, cfg->use_systemd,
				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
				listening_ports_free(list);
				return NULL;
			}
		}
		if(do_ip4) {
			hints.ai_family = AF_INET;
			if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1",
				do_auto, cfg->do_udp, do_tcp,
				&hints, portbuf, &list,
				cfg->so_rcvbuf, cfg->so_sndbuf,
				cfg->ssl_port, cfg->tls_additional_port,
				cfg->https_port, cfg->proxy_protocol_port,
				reuseport, cfg->ip_transparent,
				cfg->tcp_mss, cfg->ip_freebind,
				cfg->http_nodelay, cfg->use_systemd,
				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
				listening_ports_free(list);
				return NULL;
			}
		}
	} else for(i = 0; i<num_ifs; i++) {
		/* explicit interfaces, never with automatic detection */
		if(str_is_ip6(ifs[i])) {
			if(!do_ip6)
				continue;
			hints.ai_family = AF_INET6;
			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
				do_tcp, &hints, portbuf, &list,
				cfg->so_rcvbuf, cfg->so_sndbuf,
				cfg->ssl_port, cfg->tls_additional_port,
				cfg->https_port, cfg->proxy_protocol_port,
				reuseport, cfg->ip_transparent,
				cfg->tcp_mss, cfg->ip_freebind,
				cfg->http_nodelay, cfg->use_systemd,
				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
				listening_ports_free(list);
				return NULL;
			}
		} else {
			if(!do_ip4)
				continue;
			hints.ai_family = AF_INET;
			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
				do_tcp, &hints, portbuf, &list,
				cfg->so_rcvbuf, cfg->so_sndbuf,
				cfg->ssl_port, cfg->tls_additional_port,
				cfg->https_port, cfg->proxy_protocol_port,
				reuseport, cfg->ip_transparent,
				cfg->tcp_mss, cfg->ip_freebind,
				cfg->http_nodelay, cfg->use_systemd,
				cfg->dnscrypt_port, cfg->ip_dscp, cfg->sock_queue_timeout)) {
				listening_ports_free(list);
				return NULL;
			}
		}
	}

	return list;
}
1944 
1945 void listening_ports_free(struct listen_port* list)
1946 {
1947 	struct listen_port* nx;
1948 	while(list) {
1949 		nx = list->next;
1950 		if(list->fd != -1) {
1951 			sock_close(list->fd);
1952 		}
1953 		/* rc_ports don't have ub_socket */
1954 		if(list->socket) {
1955 			if(list->socket->addr)
1956 				freeaddrinfo(list->socket->addr);
1957 			free(list->socket);
1958 		}
1959 		free(list);
1960 		list = nx;
1961 	}
1962 }
1963 
1964 size_t listen_get_mem(struct listen_dnsport* listen)
1965 {
1966 	struct listen_list* p;
1967 	size_t s = sizeof(*listen) + sizeof(*listen->base) +
1968 		sizeof(*listen->udp_buff) +
1969 		sldns_buffer_capacity(listen->udp_buff);
1970 #ifdef USE_DNSCRYPT
1971 	s += sizeof(*listen->dnscrypt_udp_buff);
1972 	if(listen->udp_buff != listen->dnscrypt_udp_buff){
1973 		s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
1974 	}
1975 #endif
1976 	for(p = listen->cps; p; p = p->next) {
1977 		s += sizeof(*p);
1978 		s += comm_point_get_mem(p->com);
1979 	}
1980 	return s;
1981 }
1982 
1983 void listen_stop_accept(struct listen_dnsport* listen)
1984 {
1985 	/* do not stop the ones that have no tcp_free list
1986 	 * (they have already stopped listening) */
1987 	struct listen_list* p;
1988 	for(p=listen->cps; p; p=p->next) {
1989 		if(p->com->type == comm_tcp_accept &&
1990 			p->com->tcp_free != NULL) {
1991 			comm_point_stop_listening(p->com);
1992 		}
1993 	}
1994 }
1995 
1996 void listen_start_accept(struct listen_dnsport* listen)
1997 {
1998 	/* do not start the ones that have no tcp_free list, it is no
1999 	 * use to listen to them because they have no free tcp handlers */
2000 	struct listen_list* p;
2001 	for(p=listen->cps; p; p=p->next) {
2002 		if(p->com->type == comm_tcp_accept &&
2003 			p->com->tcp_free != NULL) {
2004 			comm_point_start_listening(p->com, -1, -1);
2005 		}
2006 	}
2007 }
2008 
2009 struct tcp_req_info*
2010 tcp_req_info_create(struct sldns_buffer* spoolbuf)
2011 {
2012 	struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
2013 	if(!req) {
2014 		log_err("malloc failure for new stream outoforder processing structure");
2015 		return NULL;
2016 	}
2017 	memset(req, 0, sizeof(*req));
2018 	req->spool_buffer = spoolbuf;
2019 	return req;
2020 }
2021 
/**
 * Delete a tcp request info structure: clear its lists and free it.
 * @param req: structure to delete, NULL is ignored.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req == NULL)
		return;
	/* releases the open request and pending result lists */
	tcp_req_info_clear(req);
	/* the cp member is the pointer back to the commpoint that owns this
	 * struct and called delete on us; spool_buffer is the shared udp
	 * buffer.  Neither is deleted here. */
	free(req);
}
2032 
2033 void tcp_req_info_clear(struct tcp_req_info* req)
2034 {
2035 	struct tcp_req_open_item* open, *nopen;
2036 	struct tcp_req_done_item* item, *nitem;
2037 	if(!req) return;
2038 
2039 	/* free outstanding request mesh reply entries */
2040 	open = req->open_req_list;
2041 	while(open) {
2042 		nopen = open->next;
2043 		mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
2044 		free(open);
2045 		open = nopen;
2046 	}
2047 	req->open_req_list = NULL;
2048 	req->num_open_req = 0;
2049 
2050 	/* free pending writable result packets */
2051 	item = req->done_req_list;
2052 	while(item) {
2053 		nitem = item->next;
2054 		lock_basic_lock(&stream_wait_count_lock);
2055 		stream_wait_count -= (sizeof(struct tcp_req_done_item)
2056 			+item->len);
2057 		lock_basic_unlock(&stream_wait_count_lock);
2058 		free(item->buf);
2059 		free(item);
2060 		item = nitem;
2061 	}
2062 	req->done_req_list = NULL;
2063 	req->num_done_req = 0;
2064 	req->read_is_closed = 0;
2065 }
2066 
2067 void
2068 tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
2069 {
2070 	struct tcp_req_open_item* open, *prev = NULL;
2071 	if(!req || !m) return;
2072 	open = req->open_req_list;
2073 	while(open) {
2074 		if(open->mesh_state == m) {
2075 			struct tcp_req_open_item* next;
2076 			if(prev) prev->next = open->next;
2077 			else req->open_req_list = open->next;
2078 			/* caller has to manage the mesh state reply entry */
2079 			next = open->next;
2080 			free(open);
2081 			req->num_open_req --;
2082 
2083 			/* prev = prev; */
2084 			open = next;
2085 			continue;
2086 		}
2087 		prev = open;
2088 		open = open->next;
2089 	}
2090 }
2091 
2092 /** setup listening for read or write */
2093 static void
2094 tcp_req_info_setup_listen(struct tcp_req_info* req)
2095 {
2096 	int wr = 0;
2097 	int rd = 0;
2098 
2099 	if(req->cp->tcp_byte_count != 0) {
2100 		/* cannot change, halfway through */
2101 		return;
2102 	}
2103 
2104 	if(!req->cp->tcp_is_reading)
2105 		wr = 1;
2106 	if(!req->read_is_closed)
2107 		rd = 1;
2108 
2109 	if(wr) {
2110 		req->cp->tcp_is_reading = 0;
2111 		comm_point_stop_listening(req->cp);
2112 		comm_point_start_listening(req->cp, -1,
2113 			adjusted_tcp_timeout(req->cp));
2114 	} else if(rd) {
2115 		req->cp->tcp_is_reading = 1;
2116 		comm_point_stop_listening(req->cp);
2117 		comm_point_start_listening(req->cp, -1,
2118 			adjusted_tcp_timeout(req->cp));
2119 		/* and also read it (from SSL stack buffers), so
2120 		 * no event read event is expected since the remainder of
2121 		 * the TLS frame is sitting in the buffers. */
2122 		req->read_again = 1;
2123 	} else {
2124 		comm_point_stop_listening(req->cp);
2125 		comm_point_start_listening(req->cp, -1,
2126 			adjusted_tcp_timeout(req->cp));
2127 		comm_point_listen_for_rw(req->cp, 0, 0);
2128 	}
2129 }
2130 
2131 /** remove first item from list of pending results */
2132 static struct tcp_req_done_item*
2133 tcp_req_info_pop_done(struct tcp_req_info* req)
2134 {
2135 	struct tcp_req_done_item* item;
2136 	log_assert(req->num_done_req > 0 && req->done_req_list);
2137 	item = req->done_req_list;
2138 	lock_basic_lock(&stream_wait_count_lock);
2139 	stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
2140 	lock_basic_unlock(&stream_wait_count_lock);
2141 	req->done_req_list = req->done_req_list->next;
2142 	req->num_done_req --;
2143 	return item;
2144 }
2145 
2146 /** Send given buffer and setup to write */
2147 static void
2148 tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
2149 	size_t len)
2150 {
2151 	sldns_buffer_clear(req->cp->buffer);
2152 	sldns_buffer_write(req->cp->buffer, buf, len);
2153 	sldns_buffer_flip(req->cp->buffer);
2154 
2155 	req->cp->tcp_is_reading = 0; /* we are now writing */
2156 }
2157 
2158 /** pick up the next result and start writing it to the channel */
2159 static void
2160 tcp_req_pickup_next_result(struct tcp_req_info* req)
2161 {
2162 	if(req->num_done_req > 0) {
2163 		/* unlist the done item from the list of pending results */
2164 		struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
2165 		tcp_req_info_start_write_buf(req, item->buf, item->len);
2166 		free(item->buf);
2167 		free(item);
2168 	}
2169 }
2170 
2171 /** the read channel has closed */
2172 int
2173 tcp_req_info_handle_read_close(struct tcp_req_info* req)
2174 {
2175 	verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
2176 	/* reset byte count for (potential) partial read */
2177 	req->cp->tcp_byte_count = 0;
2178 	/* if we still have results to write, pick up next and write it */
2179 	if(req->num_done_req != 0) {
2180 		tcp_req_pickup_next_result(req);
2181 		tcp_req_info_setup_listen(req);
2182 		return 1;
2183 	}
2184 	/* if nothing to do, this closes the connection */
2185 	if(req->num_open_req == 0 && req->num_done_req == 0)
2186 		return 0;
2187 	/* otherwise, we must be waiting for dns resolve, wait with timeout */
2188 	req->read_is_closed = 1;
2189 	tcp_req_info_setup_listen(req);
2190 	return 1;
2191 }
2192 
2193 void
2194 tcp_req_info_handle_writedone(struct tcp_req_info* req)
2195 {
2196 	/* back to reading state, we finished this write event */
2197 	sldns_buffer_clear(req->cp->buffer);
2198 	if(req->num_done_req == 0 && req->read_is_closed) {
2199 		/* no more to write and nothing to read, close it */
2200 		comm_point_drop_reply(&req->cp->repinfo);
2201 		return;
2202 	}
2203 	req->cp->tcp_is_reading = 1;
2204 	/* see if another result needs writing */
2205 	tcp_req_pickup_next_result(req);
2206 
2207 	/* see if there is more to write, if not stop_listening for writing */
2208 	/* see if new requests are allowed, if so, start_listening
2209 	 * for reading */
2210 	tcp_req_info_setup_listen(req);
2211 }
2212 
2213 void
2214 tcp_req_info_handle_readdone(struct tcp_req_info* req)
2215 {
2216 	struct comm_point* c = req->cp;
2217 
2218 	/* we want to read up several requests, unless there are
2219 	 * pending answers */
2220 
2221 	req->is_drop = 0;
2222 	req->is_reply = 0;
2223 	req->in_worker_handle = 1;
2224 	sldns_buffer_set_limit(req->spool_buffer, 0);
2225 	/* handle the current request */
2226 	/* this calls the worker handle request routine that could give
2227 	 * a cache response, or localdata response, or drop the reply,
2228 	 * or schedule a mesh entry for later */
2229 	fptr_ok(fptr_whitelist_comm_point(c->callback));
2230 	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
2231 		req->in_worker_handle = 0;
2232 		/* there is an answer, put it up.  It is already in the
2233 		 * c->buffer, just send it. */
2234 		/* since we were just reading a query, the channel is
2235 		 * clear to write to */
2236 	send_it:
2237 		c->tcp_is_reading = 0;
2238 		comm_point_stop_listening(c);
2239 		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
2240 		return;
2241 	}
2242 	req->in_worker_handle = 0;
2243 	/* it should be waiting in the mesh for recursion.
2244 	 * If mesh failed to add a new entry and called commpoint_drop_reply.
2245 	 * Then the mesh state has been cleared. */
2246 	if(req->is_drop) {
2247 		/* the reply has been dropped, stream has been closed. */
2248 		return;
2249 	}
2250 	/* If mesh failed(mallocfail) and called commpoint_send_reply with
2251 	 * something like servfail then we pick up that reply below. */
2252 	if(req->is_reply) {
2253 		goto send_it;
2254 	}
2255 
2256 	sldns_buffer_clear(c->buffer);
2257 	/* if pending answers, pick up an answer and start sending it */
2258 	tcp_req_pickup_next_result(req);
2259 
2260 	/* if answers pending, start sending answers */
2261 	/* read more requests if we can have more requests */
2262 	tcp_req_info_setup_listen(req);
2263 }
2264 
2265 int
2266 tcp_req_info_add_meshstate(struct tcp_req_info* req,
2267 	struct mesh_area* mesh, struct mesh_state* m)
2268 {
2269 	struct tcp_req_open_item* item;
2270 	log_assert(req && mesh && m);
2271 	item = (struct tcp_req_open_item*)malloc(sizeof(*item));
2272 	if(!item) return 0;
2273 	item->next = req->open_req_list;
2274 	item->mesh = mesh;
2275 	item->mesh_state = m;
2276 	req->open_req_list = item;
2277 	req->num_open_req++;
2278 	return 1;
2279 }
2280 
2281 /** Add a result to the result list.  At the end. */
2282 static int
2283 tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
2284 {
2285 	struct tcp_req_done_item* last = NULL;
2286 	struct tcp_req_done_item* item;
2287 	size_t space;
2288 
2289 	/* see if we have space */
2290 	space = sizeof(struct tcp_req_done_item) + len;
2291 	lock_basic_lock(&stream_wait_count_lock);
2292 	if(stream_wait_count + space > stream_wait_max) {
2293 		lock_basic_unlock(&stream_wait_count_lock);
2294 		verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
2295 		return 0;
2296 	}
2297 	stream_wait_count += space;
2298 	lock_basic_unlock(&stream_wait_count_lock);
2299 
2300 	/* find last element */
2301 	last = req->done_req_list;
2302 	while(last && last->next)
2303 		last = last->next;
2304 
2305 	/* create new element */
2306 	item = (struct tcp_req_done_item*)malloc(sizeof(*item));
2307 	if(!item) {
2308 		log_err("malloc failure, for stream result list");
2309 		return 0;
2310 	}
2311 	item->next = NULL;
2312 	item->len = len;
2313 	item->buf = memdup(buf, len);
2314 	if(!item->buf) {
2315 		free(item);
2316 		log_err("malloc failure, adding reply to stream result list");
2317 		return 0;
2318 	}
2319 
2320 	/* link in */
2321 	if(last) last->next = item;
2322 	else req->done_req_list = item;
2323 	req->num_done_req++;
2324 	return 1;
2325 }
2326 
2327 void
2328 tcp_req_info_send_reply(struct tcp_req_info* req)
2329 {
2330 	if(req->in_worker_handle) {
2331 		/* reply from mesh is in the spool_buffer */
2332 		/* copy now, so that the spool buffer is free for other tasks
2333 		 * before the callback is done */
2334 		sldns_buffer_clear(req->cp->buffer);
2335 		sldns_buffer_write(req->cp->buffer,
2336 			sldns_buffer_begin(req->spool_buffer),
2337 			sldns_buffer_limit(req->spool_buffer));
2338 		sldns_buffer_flip(req->cp->buffer);
2339 		req->is_reply = 1;
2340 		return;
2341 	}
2342 	/* now that the query has been handled, that mesh_reply entry
2343 	 * should be removed, from the tcp_req_info list,
2344 	 * the mesh state cleanup removes then with region_cleanup and
2345 	 * replies_sent true. */
2346 	/* see if we can send it straight away (we are not doing
2347 	 * anything else).  If so, copy to buffer and start */
2348 	if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
2349 		/* buffer is free, and was ready to read new query into,
2350 		 * but we are now going to use it to send this answer */
2351 		tcp_req_info_start_write_buf(req,
2352 			sldns_buffer_begin(req->spool_buffer),
2353 			sldns_buffer_limit(req->spool_buffer));
2354 		/* switch to listen to write events */
2355 		comm_point_stop_listening(req->cp);
2356 		comm_point_start_listening(req->cp, -1,
2357 			adjusted_tcp_timeout(req->cp));
2358 		return;
2359 	}
2360 	/* queue up the answer behind the others already pending */
2361 	if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
2362 		sldns_buffer_limit(req->spool_buffer))) {
2363 		/* drop the connection, we are out of resources */
2364 		comm_point_drop_reply(&req->cp->repinfo);
2365 	}
2366 }
2367 
2368 size_t tcp_req_info_get_stream_buffer_size(void)
2369 {
2370 	size_t s;
2371 	if(!stream_wait_lock_inited)
2372 		return stream_wait_count;
2373 	lock_basic_lock(&stream_wait_count_lock);
2374 	s = stream_wait_count;
2375 	lock_basic_unlock(&stream_wait_count_lock);
2376 	return s;
2377 }
2378 
2379 size_t http2_get_query_buffer_size(void)
2380 {
2381 	size_t s;
2382 	if(!http2_query_buffer_lock_inited)
2383 		return http2_query_buffer_count;
2384 	lock_basic_lock(&http2_query_buffer_count_lock);
2385 	s = http2_query_buffer_count;
2386 	lock_basic_unlock(&http2_query_buffer_count_lock);
2387 	return s;
2388 }
2389 
2390 size_t http2_get_response_buffer_size(void)
2391 {
2392 	size_t s;
2393 	if(!http2_response_buffer_lock_inited)
2394 		return http2_response_buffer_count;
2395 	lock_basic_lock(&http2_response_buffer_count_lock);
2396 	s = http2_response_buffer_count;
2397 	lock_basic_unlock(&http2_response_buffer_count_lock);
2398 	return s;
2399 }
2400 
2401 #ifdef HAVE_NGHTTP2
2402 /** nghttp2 callback. Used to copy response from rbuffer to nghttp2 session */
2403 static ssize_t http2_submit_response_read_callback(
2404 	nghttp2_session* ATTR_UNUSED(session),
2405 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2406 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2407 {
2408 	struct http2_stream* h2_stream;
2409 	struct http2_session* h2_session = source->ptr;
2410 	size_t copylen = length;
2411 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2412 		h2_session->session, stream_id))) {
2413 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2414 			"stream");
2415 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2416 	}
2417 	if(!h2_stream->rbuffer ||
2418 		sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2419 		verbose(VERB_QUERY, "http2: cannot submit buffer. No data "
2420 			"available in rbuffer");
2421 		/* rbuffer will be free'd in frame close cb */
2422 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2423 	}
2424 
2425 	if(copylen > sldns_buffer_remaining(h2_stream->rbuffer))
2426 		copylen = sldns_buffer_remaining(h2_stream->rbuffer);
2427 	if(copylen > SSIZE_MAX)
2428 		copylen = SSIZE_MAX; /* will probably never happen */
2429 
2430 	memcpy(buf, sldns_buffer_current(h2_stream->rbuffer), copylen);
2431 	sldns_buffer_skip(h2_stream->rbuffer, copylen);
2432 
2433 	if(sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2434 		*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2435 		lock_basic_lock(&http2_response_buffer_count_lock);
2436 		http2_response_buffer_count -=
2437 			sldns_buffer_capacity(h2_stream->rbuffer);
2438 		lock_basic_unlock(&http2_response_buffer_count_lock);
2439 		sldns_buffer_free(h2_stream->rbuffer);
2440 		h2_stream->rbuffer = NULL;
2441 	}
2442 
2443 	return copylen;
2444 }
2445 
2446 /**
2447  * Send RST_STREAM frame for stream.
2448  * @param h2_session: http2 session to submit frame to
2449  * @param h2_stream: http2 stream containing frame ID to use in RST_STREAM
2450  * @return 0 on error, 1 otherwise
2451  */
2452 static int http2_submit_rst_stream(struct http2_session* h2_session,
2453 		struct http2_stream* h2_stream)
2454 {
2455 	int ret = nghttp2_submit_rst_stream(h2_session->session,
2456 		NGHTTP2_FLAG_NONE, h2_stream->stream_id,
2457 		NGHTTP2_INTERNAL_ERROR);
2458 	if(ret) {
2459 		verbose(VERB_QUERY, "http2: nghttp2_submit_rst_stream failed, "
2460 			"error: %s", nghttp2_strerror(ret));
2461 		return 0;
2462 	}
2463 	return 1;
2464 }
2465 
2466 /**
2467  * DNS response ready to be submitted to nghttp2, to be prepared for sending
2468  * out. Response is stored in c->buffer. Copy to rbuffer because the c->buffer
2469  * might be used before this will be sent out.
2470  * @param h2_session: http2 session, containing c->buffer which contains answer
2471  * @return 0 on error, 1 otherwise
2472  */
2473 int http2_submit_dns_response(struct http2_session* h2_session)
2474 {
2475 	int ret;
2476 	nghttp2_data_provider data_prd;
2477 	char status[4];
2478 	nghttp2_nv headers[3];
2479 	struct http2_stream* h2_stream = h2_session->c->h2_stream;
2480 	size_t rlen;
2481 	char rlen_str[32];
2482 
2483 	if(h2_stream->rbuffer) {
2484 		log_err("http2 submit response error: rbuffer already "
2485 			"exists");
2486 		return 0;
2487 	}
2488 	if(sldns_buffer_remaining(h2_session->c->buffer) == 0) {
2489 		log_err("http2 submit response error: c->buffer not complete");
2490 		return 0;
2491 	}
2492 
2493 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2494 		verbose(VERB_QUERY, "http2: submit response error: "
2495 			"invalid status");
2496 		return 0;
2497 	}
2498 
2499 	rlen = sldns_buffer_remaining(h2_session->c->buffer);
2500 	snprintf(rlen_str, sizeof(rlen_str), "%u", (unsigned)rlen);
2501 
2502 	lock_basic_lock(&http2_response_buffer_count_lock);
2503 	if(http2_response_buffer_count + rlen > http2_response_buffer_max) {
2504 		lock_basic_unlock(&http2_response_buffer_count_lock);
2505 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2506 			"in https-response-buffer-size");
2507 		return http2_submit_rst_stream(h2_session, h2_stream);
2508 	}
2509 	http2_response_buffer_count += rlen;
2510 	lock_basic_unlock(&http2_response_buffer_count_lock);
2511 
2512 	if(!(h2_stream->rbuffer = sldns_buffer_new(rlen))) {
2513 		lock_basic_lock(&http2_response_buffer_count_lock);
2514 		http2_response_buffer_count -= rlen;
2515 		lock_basic_unlock(&http2_response_buffer_count_lock);
2516 		log_err("http2 submit response error: malloc failure");
2517 		return 0;
2518 	}
2519 
2520 	headers[0].name = (uint8_t*)":status";
2521 	headers[0].namelen = 7;
2522 	headers[0].value = (uint8_t*)status;
2523 	headers[0].valuelen = 3;
2524 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2525 
2526 	headers[1].name = (uint8_t*)"content-type";
2527 	headers[1].namelen = 12;
2528 	headers[1].value = (uint8_t*)"application/dns-message";
2529 	headers[1].valuelen = 23;
2530 	headers[1].flags = NGHTTP2_NV_FLAG_NONE;
2531 
2532 	headers[2].name = (uint8_t*)"content-length";
2533 	headers[2].namelen = 14;
2534 	headers[2].value = (uint8_t*)rlen_str;
2535 	headers[2].valuelen = strlen(rlen_str);
2536 	headers[2].flags = NGHTTP2_NV_FLAG_NONE;
2537 
2538 	sldns_buffer_write(h2_stream->rbuffer,
2539 		sldns_buffer_current(h2_session->c->buffer),
2540 		sldns_buffer_remaining(h2_session->c->buffer));
2541 	sldns_buffer_flip(h2_stream->rbuffer);
2542 
2543 	data_prd.source.ptr = h2_session;
2544 	data_prd.read_callback = http2_submit_response_read_callback;
2545 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2546 		headers, 3, &data_prd);
2547 	if(ret) {
2548 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2549 			"error: %s", nghttp2_strerror(ret));
2550 		return 0;
2551 	}
2552 	return 1;
2553 }
2554 #else
/**
 * Replacement for builds without HAVE_NGHTTP2, no response is submitted.
 * @param v: unused.
 * @return 0 always.
 */
int
http2_submit_dns_response(void* ATTR_UNUSED(v))
{
	return 0;
}
2559 #endif
2560 
2561 #ifdef HAVE_NGHTTP2
2562 /** HTTP status to descriptive string */
2563 static char* http_status_to_str(enum http_status s)
2564 {
2565 	switch(s) {
2566 		case HTTP_STATUS_OK:
2567 			return "OK";
2568 		case HTTP_STATUS_BAD_REQUEST:
2569 			return "Bad Request";
2570 		case HTTP_STATUS_NOT_FOUND:
2571 			return "Not Found";
2572 		case HTTP_STATUS_PAYLOAD_TOO_LARGE:
2573 			return "Payload Too Large";
2574 		case HTTP_STATUS_URI_TOO_LONG:
2575 			return "URI Too Long";
2576 		case HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE:
2577 			return "Unsupported Media Type";
2578 		case HTTP_STATUS_NOT_IMPLEMENTED:
2579 			return "Not Implemented";
2580 	}
2581 	return "Status Unknown";
2582 }
2583 
2584 /** nghttp2 callback. Used to copy error message to nghttp2 session */
2585 static ssize_t http2_submit_error_read_callback(
2586 	nghttp2_session* ATTR_UNUSED(session),
2587 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2588 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2589 {
2590 	struct http2_stream* h2_stream;
2591 	struct http2_session* h2_session = source->ptr;
2592 	char* msg;
2593 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2594 		h2_session->session, stream_id))) {
2595 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2596 			"stream");
2597 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2598 	}
2599 	*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2600 	msg = http_status_to_str(h2_stream->status);
2601 	if(length < strlen(msg))
2602 		return 0; /* not worth trying over multiple frames */
2603 	memcpy(buf, msg, strlen(msg));
2604 	return strlen(msg);
2605 
2606 }
2607 
2608 /**
2609  * HTTP error response ready to be submitted to nghttp2, to be prepared for
2610  * sending out. Message body will contain descriptive string for HTTP status.
2611  * @param h2_session: http2 session to submit to
2612  * @param h2_stream: http2 stream containing HTTP status to use for error
2613  * @return 0 on error, 1 otherwise
2614  */
2615 static int http2_submit_error(struct http2_session* h2_session,
2616 	struct http2_stream* h2_stream)
2617 {
2618 	int ret;
2619 	char status[4];
2620 	nghttp2_data_provider data_prd;
2621 	nghttp2_nv headers[1]; /* will be copied by nghttp */
2622 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2623 		verbose(VERB_QUERY, "http2: submit error failed, "
2624 			"invalid status");
2625 		return 0;
2626 	}
2627 	headers[0].name = (uint8_t*)":status";
2628 	headers[0].namelen = 7;
2629 	headers[0].value = (uint8_t*)status;
2630 	headers[0].valuelen = 3;
2631 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2632 
2633 	data_prd.source.ptr = h2_session;
2634 	data_prd.read_callback = http2_submit_error_read_callback;
2635 
2636 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2637 		headers, 1, &data_prd);
2638 	if(ret) {
2639 		verbose(VERB_QUERY, "http2: submit error failed, "
2640 			"error: %s", nghttp2_strerror(ret));
2641 		return 0;
2642 	}
2643 	return 1;
2644 }
2645 
2646 /**
2647  * Start query handling. Query is stored in the stream, and will be free'd here.
2648  * @param h2_session: http2 session, containing comm point
2649  * @param h2_stream: stream containing buffered query
2650  * @return: -1 on error, 1 if answer is stored in c->buffer, 0 if there is no
2651  * reply available (yet).
2652  */
2653 static int http2_query_read_done(struct http2_session* h2_session,
2654 	struct http2_stream* h2_stream)
2655 {
2656 	log_assert(h2_stream->qbuffer);
2657 
2658 	if(h2_session->c->h2_stream) {
2659 		verbose(VERB_ALGO, "http2_query_read_done failure: shared "
2660 			"buffer already assigned to stream");
2661 		return -1;
2662 	}
2663 
2664     /* the c->buffer might be used by mesh_send_reply and no be cleard
2665 	 * need to be cleared before use */
2666 	sldns_buffer_clear(h2_session->c->buffer);
2667 	if(sldns_buffer_remaining(h2_session->c->buffer) <
2668 		sldns_buffer_remaining(h2_stream->qbuffer)) {
2669 		/* qbuffer will be free'd in frame close cb */
2670 		sldns_buffer_clear(h2_session->c->buffer);
2671 		verbose(VERB_ALGO, "http2_query_read_done failure: can't fit "
2672 			"qbuffer in c->buffer");
2673 		return -1;
2674 	}
2675 
2676 	sldns_buffer_write(h2_session->c->buffer,
2677 		sldns_buffer_current(h2_stream->qbuffer),
2678 		sldns_buffer_remaining(h2_stream->qbuffer));
2679 
2680 	lock_basic_lock(&http2_query_buffer_count_lock);
2681 	http2_query_buffer_count -= sldns_buffer_capacity(h2_stream->qbuffer);
2682 	lock_basic_unlock(&http2_query_buffer_count_lock);
2683 	sldns_buffer_free(h2_stream->qbuffer);
2684 	h2_stream->qbuffer = NULL;
2685 
2686 	sldns_buffer_flip(h2_session->c->buffer);
2687 	h2_session->c->h2_stream = h2_stream;
2688 	fptr_ok(fptr_whitelist_comm_point(h2_session->c->callback));
2689 	if((*h2_session->c->callback)(h2_session->c, h2_session->c->cb_arg,
2690 		NETEVENT_NOERROR, &h2_session->c->repinfo)) {
2691 		return 1; /* answer in c->buffer */
2692 	}
2693 	sldns_buffer_clear(h2_session->c->buffer);
2694 	h2_session->c->h2_stream = NULL;
2695 	return 0; /* mesh state added, or dropped */
2696 }
2697 
/** nghttp2 callback. Used to check if the received frame indicates the end of a
 * stream. Gather collected request data and start query handling.
 * Only DATA and HEADERS frames with the END_STREAM flag are acted upon.
 * For an invalid request an HTTP error response is submitted, otherwise the
 * query is handled and, if an answer is available right away, the DNS
 * response is submitted.
 * @param session: nghttp2 session, used to look up the stream user data.
 * @param frame: the received frame.
 * @param cb_arg: the http2 session.
 * @return 0 on success or if there is nothing to do,
 *	NGHTTP2_ERR_CALLBACK_FAILURE on failure or if the query was dropped.
 */
static int http2_req_frame_recv_cb(nghttp2_session* session,
	const nghttp2_frame* frame, void* cb_arg)
{
	struct http2_session* h2_session = (struct http2_session*)cb_arg;
	struct http2_stream* h2_stream;
	int query_read_done;

	/* ignore frames that do not end a stream */
	if((frame->hd.type != NGHTTP2_DATA &&
		frame->hd.type != NGHTTP2_HEADERS) ||
		!(frame->hd.flags & NGHTTP2_FLAG_END_STREAM)) {
			return 0;
	}

	/* no stream data for this stream id, nothing to handle */
	if(!(h2_stream = nghttp2_session_get_stream_user_data(
		session, frame->hd.stream_id)))
		return 0;

	/* the checks below pick the HTTP status for an invalid request,
	 * the first one that matches is used */
	if(h2_stream->invalid_endpoint) {
		h2_stream->status = HTTP_STATUS_NOT_FOUND;
		goto submit_http_error;
	}

	if(h2_stream->invalid_content_type) {
		h2_stream->status = HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE;
		goto submit_http_error;
	}

	if(h2_stream->http_method != HTTP_METHOD_GET &&
		h2_stream->http_method != HTTP_METHOD_POST) {
		h2_stream->status = HTTP_STATUS_NOT_IMPLEMENTED;
		goto submit_http_error;
	}

	if(h2_stream->query_too_large) {
		if(h2_stream->http_method == HTTP_METHOD_POST)
			h2_stream->status = HTTP_STATUS_PAYLOAD_TOO_LARGE;
		else
			h2_stream->status = HTTP_STATUS_URI_TOO_LONG;
		goto submit_http_error;
	}

	/* no query has been buffered for this stream */
	if(!h2_stream->qbuffer) {
		h2_stream->status = HTTP_STATUS_BAD_REQUEST;
		goto submit_http_error;
	}

	/* a status that is already set on the stream is also answered
	 * with an error response; the label is the target of the checks
	 * above */
	if(h2_stream->status) {
submit_http_error:
		verbose(VERB_QUERY, "http2 request invalid, returning :status="
			"%d", h2_stream->status);
		if(!http2_submit_error(h2_session, h2_stream)) {
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		}
		return 0;
	}
	h2_stream->status = HTTP_STATUS_OK;

	/* flip the qbuffer so that the buffered query can be read from it */
	sldns_buffer_flip(h2_stream->qbuffer);
	/* NOTE(review): postpone_drop is set here and reset to 0 only in the
	 * !query_read_done branches below; the error return and the path
	 * where an answer is submitted leave it set - confirm that this is
	 * intended. */
	h2_session->postpone_drop = 1;
	query_read_done = http2_query_read_done(h2_session, h2_stream);
	if(query_read_done < 0)
		return NGHTTP2_ERR_CALLBACK_FAILURE;
	else if(!query_read_done) {
		if(h2_session->is_drop) {
			/* connection needs to be closed. Return failure to make
			 * sure no other action are taken anymore on comm point.
			 * failure will result in reclaiming (and closing)
			 * of comm point. */
			verbose(VERB_QUERY, "http2 query dropped in worker cb");
			h2_session->postpone_drop = 0;
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		}
		/* nothing to submit right now, query added to mesh. */
		h2_session->postpone_drop = 0;
		return 0;
	}
	/* the answer is in c->buffer; after the submit attempt the buffer
	 * is cleared and the stream is detached from the comm point, on
	 * failure as well as on success */
	if(!http2_submit_dns_response(h2_session)) {
		sldns_buffer_clear(h2_session->c->buffer);
		h2_session->c->h2_stream = NULL;
		return NGHTTP2_ERR_CALLBACK_FAILURE;
	}
	verbose(VERB_QUERY, "http2 query submitted to session");
	sldns_buffer_clear(h2_session->c->buffer);
	h2_session->c->h2_stream = NULL;
	return 0;
}
2786 
2787 /** nghttp2 callback. Used to detect start of new streams. */
2788 static int http2_req_begin_headers_cb(nghttp2_session* session,
2789 	const nghttp2_frame* frame, void* cb_arg)
2790 {
2791 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2792 	struct http2_stream* h2_stream;
2793 	int ret;
2794 	if(frame->hd.type != NGHTTP2_HEADERS ||
2795 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2796 		/* only interested in request headers */
2797 		return 0;
2798 	}
2799 	if(!(h2_stream = http2_stream_create(frame->hd.stream_id))) {
2800 		log_err("malloc failure while creating http2 stream");
2801 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2802 	}
2803 	http2_session_add_stream(h2_session, h2_stream);
2804 	ret = nghttp2_session_set_stream_user_data(session,
2805 		frame->hd.stream_id, h2_stream);
2806 	if(ret) {
2807 		/* stream does not exist */
2808 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2809 			"error: %s", nghttp2_strerror(ret));
2810 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2811 	}
2812 
2813 	return 0;
2814 }
2815 
2816 /**
2817  * base64url decode, store in qbuffer
2818  * @param h2_session: http2 session
2819  * @param h2_stream: http2 stream
2820  * @param start: start of the base64 string
2821  * @param length: length of the base64 string
2822  * @return: 0 on error, 1 otherwise. query will be stored in h2_stream->qbuffer,
2823  * buffer will be NULL is unparseble.
2824  */
2825 static int http2_buffer_uri_query(struct http2_session* h2_session,
2826 	struct http2_stream* h2_stream, const uint8_t* start, size_t length)
2827 {
2828 	size_t expectb64len;
2829 	int b64len;
2830 	if(h2_stream->http_method == HTTP_METHOD_POST)
2831 		return 1;
2832 	if(length == 0)
2833 		return 1;
2834 	if(h2_stream->qbuffer) {
2835 		verbose(VERB_ALGO, "http2_req_header fail, "
2836 			"qbuffer already set");
2837 		return 0;
2838 	}
2839 
2840 	/* calculate size, might be a bit bigger than the real
2841 	 * decoded buffer size */
2842 	expectb64len = sldns_b64_pton_calculate_size(length);
2843 	log_assert(expectb64len > 0);
2844 	if(expectb64len >
2845 		h2_session->c->http2_stream_max_qbuffer_size) {
2846 		h2_stream->query_too_large = 1;
2847 		return 1;
2848 	}
2849 
2850 	lock_basic_lock(&http2_query_buffer_count_lock);
2851 	if(http2_query_buffer_count + expectb64len > http2_query_buffer_max) {
2852 		lock_basic_unlock(&http2_query_buffer_count_lock);
2853 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2854 			"in http2-query-buffer-size");
2855 		return http2_submit_rst_stream(h2_session, h2_stream);
2856 	}
2857 	http2_query_buffer_count += expectb64len;
2858 	lock_basic_unlock(&http2_query_buffer_count_lock);
2859 	if(!(h2_stream->qbuffer = sldns_buffer_new(expectb64len))) {
2860 		lock_basic_lock(&http2_query_buffer_count_lock);
2861 		http2_query_buffer_count -= expectb64len;
2862 		lock_basic_unlock(&http2_query_buffer_count_lock);
2863 		log_err("http2_req_header fail, qbuffer "
2864 			"malloc failure");
2865 		return 0;
2866 	}
2867 
2868 	if(sldns_b64_contains_nonurl((char const*)start, length)) {
2869 		char buf[65536+4];
2870 		verbose(VERB_ALGO, "HTTP2 stream contains wrong b64 encoding");
2871 		/* copy to the scratch buffer temporarily to terminate the
2872 		 * string with a zero */
2873 		if(length+1 > sizeof(buf)) {
2874 			/* too long */
2875 			lock_basic_lock(&http2_query_buffer_count_lock);
2876 			http2_query_buffer_count -= expectb64len;
2877 			lock_basic_unlock(&http2_query_buffer_count_lock);
2878 			sldns_buffer_free(h2_stream->qbuffer);
2879 			h2_stream->qbuffer = NULL;
2880 			return 1;
2881 		}
2882 		memmove(buf, start, length);
2883 		buf[length] = 0;
2884 		if(!(b64len = sldns_b64_pton(buf, sldns_buffer_current(
2885 			h2_stream->qbuffer), expectb64len)) || b64len < 0) {
2886 			lock_basic_lock(&http2_query_buffer_count_lock);
2887 			http2_query_buffer_count -= expectb64len;
2888 			lock_basic_unlock(&http2_query_buffer_count_lock);
2889 			sldns_buffer_free(h2_stream->qbuffer);
2890 			h2_stream->qbuffer = NULL;
2891 			return 1;
2892 		}
2893 	} else {
2894 		if(!(b64len = sldns_b64url_pton(
2895 			(char const *)start, length,
2896 			sldns_buffer_current(h2_stream->qbuffer),
2897 			expectb64len)) || b64len < 0) {
2898 			lock_basic_lock(&http2_query_buffer_count_lock);
2899 			http2_query_buffer_count -= expectb64len;
2900 			lock_basic_unlock(&http2_query_buffer_count_lock);
2901 			sldns_buffer_free(h2_stream->qbuffer);
2902 			h2_stream->qbuffer = NULL;
2903 			/* return without error, method can be an
2904 			 * unknown POST */
2905 			return 1;
2906 		}
2907 	}
2908 	sldns_buffer_skip(h2_stream->qbuffer, (size_t)b64len);
2909 	return 1;
2910 }
2911 
2912 /** nghttp2 callback. Used to parse headers from HEADER frames. */
2913 static int http2_req_header_cb(nghttp2_session* session,
2914 	const nghttp2_frame* frame, const uint8_t* name, size_t namelen,
2915 	const uint8_t* value, size_t valuelen, uint8_t ATTR_UNUSED(flags),
2916 	void* cb_arg)
2917 {
2918 	struct http2_stream* h2_stream = NULL;
2919 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2920 	/* nghttp2 deals with CONTINUATION frames and provides them as part of
2921 	 * the HEADER */
2922 	if(frame->hd.type != NGHTTP2_HEADERS ||
2923 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2924 		/* only interested in request headers */
2925 		return 0;
2926 	}
2927 	if(!(h2_stream = nghttp2_session_get_stream_user_data(session,
2928 		frame->hd.stream_id)))
2929 		return 0;
2930 
2931 	/* earlier checks already indicate we can stop handling this query */
2932 	if(h2_stream->http_method == HTTP_METHOD_UNSUPPORTED ||
2933 		h2_stream->invalid_content_type ||
2934 		h2_stream->invalid_endpoint)
2935 		return 0;
2936 
2937 
2938 	/* nghttp2 performs some sanity checks in the headers, including:
2939 	 * name and value are guaranteed to be null terminated
2940 	 * name is guaranteed to be lowercase
2941 	 * content-length value is guaranteed to contain digits
2942 	 */
2943 
2944 	if(!h2_stream->http_method && namelen == 7 &&
2945 		memcmp(":method", name, namelen) == 0) {
2946 		/* Case insensitive check on :method value to be on the safe
2947 		 * side. I failed to find text about case sensitivity in specs.
2948 		 */
2949 		if(valuelen == 3 && strcasecmp("GET", (const char*)value) == 0)
2950 			h2_stream->http_method = HTTP_METHOD_GET;
2951 		else if(valuelen == 4 &&
2952 			strcasecmp("POST", (const char*)value) == 0) {
2953 			h2_stream->http_method = HTTP_METHOD_POST;
2954 			if(h2_stream->qbuffer) {
2955 				/* POST method uses query from DATA frames */
2956 				lock_basic_lock(&http2_query_buffer_count_lock);
2957 				http2_query_buffer_count -=
2958 					sldns_buffer_capacity(h2_stream->qbuffer);
2959 				lock_basic_unlock(&http2_query_buffer_count_lock);
2960 				sldns_buffer_free(h2_stream->qbuffer);
2961 				h2_stream->qbuffer = NULL;
2962 			}
2963 		} else
2964 			h2_stream->http_method = HTTP_METHOD_UNSUPPORTED;
2965 		return 0;
2966 	}
2967 	if(namelen == 5 && memcmp(":path", name, namelen) == 0) {
2968 		/* :path may contain DNS query, depending on method. Method might
2969 		 * not be known yet here, so check after finishing receiving
2970 		 * stream. */
2971 #define	HTTP_QUERY_PARAM "?dns="
2972 		size_t el = strlen(h2_session->c->http_endpoint);
2973 		size_t qpl = strlen(HTTP_QUERY_PARAM);
2974 
2975 		if(valuelen < el || memcmp(h2_session->c->http_endpoint,
2976 			value, el) != 0) {
2977 			h2_stream->invalid_endpoint = 1;
2978 			return 0;
2979 		}
2980 		/* larger than endpoint only allowed if it is for the query
2981 		 * parameter */
2982 		if(valuelen <= el+qpl ||
2983 			memcmp(HTTP_QUERY_PARAM, value+el, qpl) != 0) {
2984 			if(valuelen != el)
2985 				h2_stream->invalid_endpoint = 1;
2986 			return 0;
2987 		}
2988 
2989 		if(!http2_buffer_uri_query(h2_session, h2_stream,
2990 			value+(el+qpl), valuelen-(el+qpl))) {
2991 			return NGHTTP2_ERR_CALLBACK_FAILURE;
2992 		}
2993 		return 0;
2994 	}
2995 	/* Content type is a SHOULD (rfc7231#section-3.1.1.5) when using POST,
2996 	 * and not needed when using GET. Don't enfore.
2997 	 * If set only allow lowercase "application/dns-message".
2998 	 *
2999 	 * Clients SHOULD (rfc8484#section-4.1) set an accept header, but MUST
3000 	 * be able to handle "application/dns-message". Since that is the only
3001 	 * content-type supported we can ignore the accept header.
3002 	 */
3003 	if((namelen == 12 && memcmp("content-type", name, namelen) == 0)) {
3004 		if(valuelen != 23 || memcmp("application/dns-message", value,
3005 			valuelen) != 0) {
3006 			h2_stream->invalid_content_type = 1;
3007 		}
3008 	}
3009 
3010 	/* Only interested in content-lentg for POST (on not yet known) method.
3011 	 */
3012 	if((!h2_stream->http_method ||
3013 		h2_stream->http_method == HTTP_METHOD_POST) &&
3014 		!h2_stream->content_length && namelen  == 14 &&
3015 		memcmp("content-length", name, namelen) == 0) {
3016 		if(valuelen > 5) {
3017 			h2_stream->query_too_large = 1;
3018 			return 0;
3019 		}
3020 		/* guaranteed to only contain digits and be null terminated */
3021 		h2_stream->content_length = atoi((const char*)value);
3022 		if(h2_stream->content_length >
3023 			h2_session->c->http2_stream_max_qbuffer_size) {
3024 			h2_stream->query_too_large = 1;
3025 			return 0;
3026 		}
3027 	}
3028 	return 0;
3029 }
3030 
3031 /** nghttp2 callback. Used to get data from DATA frames, which can contain
3032  * queries in POST requests. */
3033 static int http2_req_data_chunk_recv_cb(nghttp2_session* ATTR_UNUSED(session),
3034 	uint8_t ATTR_UNUSED(flags), int32_t stream_id, const uint8_t* data,
3035 	size_t len, void* cb_arg)
3036 {
3037 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
3038 	struct http2_stream* h2_stream;
3039 	size_t qlen = 0;
3040 
3041 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
3042 		h2_session->session, stream_id))) {
3043 		return 0;
3044 	}
3045 
3046 	if(h2_stream->query_too_large)
3047 		return 0;
3048 
3049 	if(!h2_stream->qbuffer) {
3050 		if(h2_stream->content_length) {
3051 			if(h2_stream->content_length < len)
3052 				/* getting more data in DATA frame than
3053 				 * advertised in content-length header. */
3054 				return NGHTTP2_ERR_CALLBACK_FAILURE;
3055 			qlen = h2_stream->content_length;
3056 		} else if(len <= h2_session->c->http2_stream_max_qbuffer_size) {
3057 			/* setting this to msg-buffer-size can result in a lot
3058 			 * of memory consuption. Most queries should fit in a
3059 			 * single DATA frame, and most POST queries will
3060 			 * contain content-length which does not impose this
3061 			 * limit. */
3062 			qlen = len;
3063 		}
3064 	}
3065 	if(!h2_stream->qbuffer && qlen) {
3066 		lock_basic_lock(&http2_query_buffer_count_lock);
3067 		if(http2_query_buffer_count + qlen > http2_query_buffer_max) {
3068 			lock_basic_unlock(&http2_query_buffer_count_lock);
3069 			verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
3070 				"in http2-query-buffer-size");
3071 			return http2_submit_rst_stream(h2_session, h2_stream);
3072 		}
3073 		http2_query_buffer_count += qlen;
3074 		lock_basic_unlock(&http2_query_buffer_count_lock);
3075 		if(!(h2_stream->qbuffer = sldns_buffer_new(qlen))) {
3076 			lock_basic_lock(&http2_query_buffer_count_lock);
3077 			http2_query_buffer_count -= qlen;
3078 			lock_basic_unlock(&http2_query_buffer_count_lock);
3079 		}
3080 	}
3081 
3082 	if(!h2_stream->qbuffer ||
3083 		sldns_buffer_remaining(h2_stream->qbuffer) < len) {
3084 		verbose(VERB_ALGO, "http2 data_chunck_recv failed. Not enough "
3085 			"buffer space for POST query. Can happen on multi "
3086 			"frame requests without content-length header");
3087 		h2_stream->query_too_large = 1;
3088 		return 0;
3089 	}
3090 
3091 	sldns_buffer_write(h2_stream->qbuffer, data, len);
3092 
3093 	return 0;
3094 }
3095 
3096 void http2_req_stream_clear(struct http2_stream* h2_stream)
3097 {
3098 	if(h2_stream->qbuffer) {
3099 		lock_basic_lock(&http2_query_buffer_count_lock);
3100 		http2_query_buffer_count -=
3101 			sldns_buffer_capacity(h2_stream->qbuffer);
3102 		lock_basic_unlock(&http2_query_buffer_count_lock);
3103 		sldns_buffer_free(h2_stream->qbuffer);
3104 		h2_stream->qbuffer = NULL;
3105 	}
3106 	if(h2_stream->rbuffer) {
3107 		lock_basic_lock(&http2_response_buffer_count_lock);
3108 		http2_response_buffer_count -=
3109 			sldns_buffer_capacity(h2_stream->rbuffer);
3110 		lock_basic_unlock(&http2_response_buffer_count_lock);
3111 		sldns_buffer_free(h2_stream->rbuffer);
3112 		h2_stream->rbuffer = NULL;
3113 	}
3114 }
3115 
3116 nghttp2_session_callbacks* http2_req_callbacks_create(void)
3117 {
3118 	nghttp2_session_callbacks *callbacks;
3119 	if(nghttp2_session_callbacks_new(&callbacks) == NGHTTP2_ERR_NOMEM) {
3120 		log_err("failed to initialize nghttp2 callback");
3121 		return NULL;
3122 	}
3123 	/* reception of header block started, used to create h2_stream */
3124 	nghttp2_session_callbacks_set_on_begin_headers_callback(callbacks,
3125 		http2_req_begin_headers_cb);
3126 	/* complete frame received, used to get data from stream if frame
3127 	 * has end stream flag, and start processing query */
3128 	nghttp2_session_callbacks_set_on_frame_recv_callback(callbacks,
3129 		http2_req_frame_recv_cb);
3130 	/* get request info from headers */
3131 	nghttp2_session_callbacks_set_on_header_callback(callbacks,
3132 		http2_req_header_cb);
3133 	/* get data from DATA frames, containing POST query */
3134 	nghttp2_session_callbacks_set_on_data_chunk_recv_callback(callbacks,
3135 		http2_req_data_chunk_recv_cb);
3136 
3137 	/* generic HTTP2 callbacks */
3138 	nghttp2_session_callbacks_set_recv_callback(callbacks, http2_recv_cb);
3139 	nghttp2_session_callbacks_set_send_callback(callbacks, http2_send_cb);
3140 	nghttp2_session_callbacks_set_on_stream_close_callback(callbacks,
3141 		http2_stream_close_cb);
3142 
3143 	return callbacks;
3144 }
3145 #endif /* HAVE_NGHTTP2 */
3146