1 /*
2  * services/listen_dnsport.c - listen on port 53 for incoming DNS queries.
3  *
4  * Copyright (c) 2007, NLnet Labs. All rights reserved.
5  *
6  * This software is open source.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * Redistributions of source code must retain the above copyright notice,
13  * this list of conditions and the following disclaimer.
14  *
15  * Redistributions in binary form must reproduce the above copyright notice,
16  * this list of conditions and the following disclaimer in the documentation
17  * and/or other materials provided with the distribution.
18  *
19  * Neither the name of the NLNET LABS nor the names of its contributors may
20  * be used to endorse or promote products derived from this software without
21  * specific prior written permission.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
26  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
27  * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
28  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
29  * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
30  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
31  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
32  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
33  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34  */
35 
36 /**
37  * \file
38  *
39  * This file has functions to get queries from clients.
40  */
41 #include "config.h"
42 #ifdef HAVE_SYS_TYPES_H
43 #  include <sys/types.h>
44 #endif
45 #include <sys/time.h>
46 #include <limits.h>
47 #ifdef USE_TCP_FASTOPEN
48 #include <netinet/tcp.h>
49 #endif
50 #include <ctype.h>
51 #include "services/listen_dnsport.h"
52 #include "services/outside_network.h"
53 #include "util/netevent.h"
54 #include "util/log.h"
55 #include "util/config_file.h"
56 #include "util/net_help.h"
57 #include "sldns/sbuffer.h"
58 #include "sldns/parseutil.h"
59 #include "services/mesh.h"
60 #include "util/fptr_wlist.h"
61 #include "util/locks.h"
62 
63 #ifdef HAVE_NETDB_H
64 #include <netdb.h>
65 #endif
66 #include <fcntl.h>
67 
68 #ifdef HAVE_SYS_UN_H
69 #include <sys/un.h>
70 #endif
71 
72 #ifdef HAVE_SYSTEMD
73 #include <systemd/sd-daemon.h>
74 #endif
75 
76 #ifdef HAVE_IFADDRS_H
77 #include <ifaddrs.h>
78 #endif
79 #ifdef HAVE_NET_IF_H
80 #include <net/if.h>
81 #endif
82 
/** number of queued TCP connections for listen(); used as the backlog
 * for both the TCP accept sockets and the local (unix) accept sockets */
#define TCP_BACKLOG 256

#ifndef THREADS_DISABLED
/* the locks below only exist when threads are compiled in */
/** lock on the counter of stream buffer memory (stream_wait_count) */
static lock_basic_type stream_wait_count_lock;
/** lock on the counter of HTTP2 query buffer memory
 * (http2_query_buffer_count) */
static lock_basic_type http2_query_buffer_count_lock;
/** lock on the counter of HTTP2 response buffer memory
 * (http2_response_buffer_count) */
static lock_basic_type http2_response_buffer_count_lock;
#endif
/** size (in bytes) of stream wait buffers, starts at 0 */
static size_t stream_wait_count = 0;
/** is the lock initialised for stream wait buffers, 0 means not yet */
static int stream_wait_lock_inited = 0;
/** size (in bytes) of HTTP2 query buffers, starts at 0 */
static size_t http2_query_buffer_count = 0;
/** is the lock initialised for HTTP2 query buffers, 0 means not yet */
static int http2_query_buffer_lock_inited = 0;
/** size (in bytes) of HTTP2 response buffers, starts at 0 */
static size_t http2_response_buffer_count = 0;
/** is the lock initialised for HTTP2 response buffers, 0 means not yet */
static int http2_response_buffer_lock_inited = 0;
106 
107 /**
108  * Debug print of the getaddrinfo returned address.
109  * @param addr: the address returned.
110  */
111 static void
112 verbose_print_addr(struct addrinfo *addr)
113 {
114 	if(verbosity >= VERB_ALGO) {
115 		char buf[100];
116 		void* sinaddr = &((struct sockaddr_in*)addr->ai_addr)->sin_addr;
117 #ifdef INET6
118 		if(addr->ai_family == AF_INET6)
119 			sinaddr = &((struct sockaddr_in6*)addr->ai_addr)->
120 				sin6_addr;
121 #endif /* INET6 */
122 		if(inet_ntop(addr->ai_family, sinaddr, buf,
123 			(socklen_t)sizeof(buf)) == 0) {
124 			(void)strlcpy(buf, "(null)", sizeof(buf));
125 		}
126 		buf[sizeof(buf)-1] = 0;
127 		verbose(VERB_ALGO, "creating %s%s socket %s %d",
128 			addr->ai_socktype==SOCK_DGRAM?"udp":
129 			addr->ai_socktype==SOCK_STREAM?"tcp":"otherproto",
130 			addr->ai_family==AF_INET?"4":
131 			addr->ai_family==AF_INET6?"6":
132 			"_otherfam", buf,
133 			ntohs(((struct sockaddr_in*)addr->ai_addr)->sin_port));
134 	}
135 }
136 
137 void
138 verbose_print_unbound_socket(struct unbound_socket* ub_sock)
139 {
140 	if(verbosity >= VERB_ALGO) {
141 		log_info("listing of unbound_socket structure:");
142 		verbose_print_addr(ub_sock->addr);
143 		log_info("s is: %d, fam is: %s, acl: %s", ub_sock->s,
144 			ub_sock->fam == AF_INET?"AF_INET":"AF_INET6",
145 			ub_sock->acl?"yes":"no");
146 	}
147 }
148 
149 #ifdef HAVE_SYSTEMD
150 static int
151 systemd_get_activated(int family, int socktype, int listen,
152 		      struct sockaddr *addr, socklen_t addrlen,
153 		      const char *path)
154 {
155 	int i = 0;
156 	int r = 0;
157 	int s = -1;
158 	const char* listen_pid, *listen_fds;
159 
160 	/* We should use "listen" option only for stream protocols. For UDP it should be -1 */
161 
162 	if((r = sd_booted()) < 1) {
163 		if(r == 0)
164 			log_warn("systemd is not running");
165 		else
166 			log_err("systemd sd_booted(): %s", strerror(-r));
167 		return -1;
168 	}
169 
170 	listen_pid = getenv("LISTEN_PID");
171 	listen_fds = getenv("LISTEN_FDS");
172 
173 	if (!listen_pid) {
174 		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_PID");
175 		return -1;
176 	}
177 
178 	if (!listen_fds) {
179 		log_warn("Systemd mandatory ENV variable is not defined: LISTEN_FDS");
180 		return -1;
181 	}
182 
183 	if((r = sd_listen_fds(0)) < 1) {
184 		if(r == 0)
185 			log_warn("systemd: did not return socket, check unit configuration");
186 		else
187 			log_err("systemd sd_listen_fds(): %s", strerror(-r));
188 		return -1;
189 	}
190 
191 	for(i = 0; i < r; i++) {
192 		if(sd_is_socket(SD_LISTEN_FDS_START + i, family, socktype, listen)) {
193 			s = SD_LISTEN_FDS_START + i;
194 			break;
195 		}
196 	}
197 	if (s == -1) {
198 		if (addr)
199 			log_err_addr("systemd sd_listen_fds()",
200 				     "no such socket",
201 				     (struct sockaddr_storage *)addr, addrlen);
202 		else
203 			log_err("systemd sd_listen_fds(): %s", path);
204 	}
205 	return s;
206 }
207 #endif
208 
209 int
210 create_udp_sock(int family, int socktype, struct sockaddr* addr,
211         socklen_t addrlen, int v6only, int* inuse, int* noproto,
212 	int rcv, int snd, int listen, int* reuseport, int transparent,
213 	int freebind, int use_systemd, int dscp)
214 {
215 	int s;
216 	char* err;
217 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_USE_MIN_MTU)  || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined (SO_BINDANY)
218 	int on=1;
219 #endif
220 #ifdef IPV6_MTU
221 	int mtu = IPV6_MIN_MTU;
222 #endif
223 #if !defined(SO_RCVBUFFORCE) && !defined(SO_RCVBUF)
224 	(void)rcv;
225 #endif
226 #if !defined(SO_SNDBUFFORCE) && !defined(SO_SNDBUF)
227 	(void)snd;
228 #endif
229 #ifndef IPV6_V6ONLY
230 	(void)v6only;
231 #endif
232 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
233 	(void)transparent;
234 #endif
235 #if !defined(IP_FREEBIND)
236 	(void)freebind;
237 #endif
238 #ifdef HAVE_SYSTEMD
239 	int got_fd_from_systemd = 0;
240 
241 	if (!use_systemd
242 	    || (use_systemd
243 		&& (s = systemd_get_activated(family, socktype, -1, addr,
244 					      addrlen, NULL)) == -1)) {
245 #else
246 	(void)use_systemd;
247 #endif
248 	if((s = socket(family, socktype, 0)) == -1) {
249 		*inuse = 0;
250 #ifndef USE_WINSOCK
251 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
252 			*noproto = 1;
253 			return -1;
254 		}
255 #else
256 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
257 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
258 			*noproto = 1;
259 			return -1;
260 		}
261 #endif
262 		log_err("can't create socket: %s", sock_strerror(errno));
263 		*noproto = 0;
264 		return -1;
265 	}
266 #ifdef HAVE_SYSTEMD
267 	} else {
268 		got_fd_from_systemd = 1;
269 	}
270 #endif
271 	if(listen) {
272 #ifdef SO_REUSEADDR
273 		if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
274 			(socklen_t)sizeof(on)) < 0) {
275 			log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
276 				sock_strerror(errno));
277 #ifndef USE_WINSOCK
278 			if(errno != ENOSYS) {
279 				close(s);
280 				*noproto = 0;
281 				*inuse = 0;
282 				return -1;
283 			}
284 #else
285 			closesocket(s);
286 			*noproto = 0;
287 			*inuse = 0;
288 			return -1;
289 #endif
290 		}
291 #endif /* SO_REUSEADDR */
292 #ifdef SO_REUSEPORT
293 #  ifdef SO_REUSEPORT_LB
294 		/* on FreeBSD 12 we have SO_REUSEPORT_LB that does loadbalance
295 		 * like SO_REUSEPORT on Linux.  This is what the users want
296 		 * with the config option in unbound.conf; if we actually
297 		 * need local address and port reuse they'll also need to
298 		 * have SO_REUSEPORT set for them, assume it was _LB they want.
299 		 */
300 		if (reuseport && *reuseport &&
301 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, (void*)&on,
302 			(socklen_t)sizeof(on)) < 0) {
303 #ifdef ENOPROTOOPT
304 			if(errno != ENOPROTOOPT || verbosity >= 3)
305 				log_warn("setsockopt(.. SO_REUSEPORT_LB ..) failed: %s",
306 					strerror(errno));
307 #endif
308 			/* this option is not essential, we can continue */
309 			*reuseport = 0;
310 		}
311 #  else /* no SO_REUSEPORT_LB */
312 
313 		/* try to set SO_REUSEPORT so that incoming
314 		 * queries are distributed evenly among the receiving threads.
315 		 * Each thread must have its own socket bound to the same port,
316 		 * with SO_REUSEPORT set on each socket.
317 		 */
318 		if (reuseport && *reuseport &&
319 		    setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
320 			(socklen_t)sizeof(on)) < 0) {
321 #ifdef ENOPROTOOPT
322 			if(errno != ENOPROTOOPT || verbosity >= 3)
323 				log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
324 					strerror(errno));
325 #endif
326 			/* this option is not essential, we can continue */
327 			*reuseport = 0;
328 		}
329 #  endif /* SO_REUSEPORT_LB */
330 #else
331 		(void)reuseport;
332 #endif /* defined(SO_REUSEPORT) */
333 #ifdef IP_TRANSPARENT
334 		if (transparent &&
335 		    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
336 		    (socklen_t)sizeof(on)) < 0) {
337 			log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
338 			strerror(errno));
339 		}
340 #elif defined(IP_BINDANY)
341 		if (transparent &&
342 		    setsockopt(s, (family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
343 		    (family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
344 		    (void*)&on, (socklen_t)sizeof(on)) < 0) {
345 			log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
346 			(family==AF_INET6?"V6":""), strerror(errno));
347 		}
348 #elif defined(SO_BINDANY)
349 		if (transparent &&
350 		    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on,
351 		    (socklen_t)sizeof(on)) < 0) {
352 			log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
353 			strerror(errno));
354 		}
355 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
356 	}
357 #ifdef IP_FREEBIND
358 	if(freebind &&
359 	    setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
360 	    (socklen_t)sizeof(on)) < 0) {
361 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
362 		strerror(errno));
363 	}
364 #endif /* IP_FREEBIND */
365 	if(rcv) {
366 #ifdef SO_RCVBUF
367 		int got;
368 		socklen_t slen = (socklen_t)sizeof(got);
369 #  ifdef SO_RCVBUFFORCE
370 		/* Linux specific: try to use root permission to override
371 		 * system limits on rcvbuf. The limit is stored in
372 		 * /proc/sys/net/core/rmem_max or sysctl net.core.rmem_max */
373 		if(setsockopt(s, SOL_SOCKET, SO_RCVBUFFORCE, (void*)&rcv,
374 			(socklen_t)sizeof(rcv)) < 0) {
375 			if(errno != EPERM) {
376 				log_err("setsockopt(..., SO_RCVBUFFORCE, "
377 					"...) failed: %s", sock_strerror(errno));
378 				sock_close(s);
379 				*noproto = 0;
380 				*inuse = 0;
381 				return -1;
382 			}
383 #  endif /* SO_RCVBUFFORCE */
384 			if(setsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&rcv,
385 				(socklen_t)sizeof(rcv)) < 0) {
386 				log_err("setsockopt(..., SO_RCVBUF, "
387 					"...) failed: %s", sock_strerror(errno));
388 				sock_close(s);
389 				*noproto = 0;
390 				*inuse = 0;
391 				return -1;
392 			}
393 			/* check if we got the right thing or if system
394 			 * reduced to some system max.  Warn if so */
395 			if(getsockopt(s, SOL_SOCKET, SO_RCVBUF, (void*)&got,
396 				&slen) >= 0 && got < rcv/2) {
397 				log_warn("so-rcvbuf %u was not granted. "
398 					"Got %u. To fix: start with "
399 					"root permissions(linux) or sysctl "
400 					"bigger net.core.rmem_max(linux) or "
401 					"kern.ipc.maxsockbuf(bsd) values.",
402 					(unsigned)rcv, (unsigned)got);
403 			}
404 #  ifdef SO_RCVBUFFORCE
405 		}
406 #  endif
407 #endif /* SO_RCVBUF */
408 	}
409 	/* first do RCVBUF as the receive buffer is more important */
410 	if(snd) {
411 #ifdef SO_SNDBUF
412 		int got;
413 		socklen_t slen = (socklen_t)sizeof(got);
414 #  ifdef SO_SNDBUFFORCE
415 		/* Linux specific: try to use root permission to override
416 		 * system limits on sndbuf. The limit is stored in
417 		 * /proc/sys/net/core/wmem_max or sysctl net.core.wmem_max */
418 		if(setsockopt(s, SOL_SOCKET, SO_SNDBUFFORCE, (void*)&snd,
419 			(socklen_t)sizeof(snd)) < 0) {
420 			if(errno != EPERM) {
421 				log_err("setsockopt(..., SO_SNDBUFFORCE, "
422 					"...) failed: %s", sock_strerror(errno));
423 				sock_close(s);
424 				*noproto = 0;
425 				*inuse = 0;
426 				return -1;
427 			}
428 #  endif /* SO_SNDBUFFORCE */
429 			if(setsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&snd,
430 				(socklen_t)sizeof(snd)) < 0) {
431 				log_err("setsockopt(..., SO_SNDBUF, "
432 					"...) failed: %s", sock_strerror(errno));
433 				sock_close(s);
434 				*noproto = 0;
435 				*inuse = 0;
436 				return -1;
437 			}
438 			/* check if we got the right thing or if system
439 			 * reduced to some system max.  Warn if so */
440 			if(getsockopt(s, SOL_SOCKET, SO_SNDBUF, (void*)&got,
441 				&slen) >= 0 && got < snd/2) {
442 				log_warn("so-sndbuf %u was not granted. "
443 					"Got %u. To fix: start with "
444 					"root permissions(linux) or sysctl "
445 					"bigger net.core.wmem_max(linux) or "
446 					"kern.ipc.maxsockbuf(bsd) values.",
447 					(unsigned)snd, (unsigned)got);
448 			}
449 #  ifdef SO_SNDBUFFORCE
450 		}
451 #  endif
452 #endif /* SO_SNDBUF */
453 	}
454 	err = set_ip_dscp(s, family, dscp);
455 	if(err != NULL)
456 		log_warn("error setting IP DiffServ codepoint %d on UDP socket: %s", dscp, err);
457 	if(family == AF_INET6) {
458 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
459 		int omit6_set = 0;
460 		int action;
461 # endif
462 # if defined(IPV6_V6ONLY)
463 		if(v6only
464 #   ifdef HAVE_SYSTEMD
465 			/* Systemd wants to control if the socket is v6 only
466 			 * or both, with BindIPv6Only=default, ipv6-only or
467 			 * both in systemd.socket, so it is not set here. */
468 			&& !got_fd_from_systemd
469 #   endif
470 			) {
471 			int val=(v6only==2)?0:1;
472 			if (setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
473 				(void*)&val, (socklen_t)sizeof(val)) < 0) {
474 				log_err("setsockopt(..., IPV6_V6ONLY"
475 					", ...) failed: %s", sock_strerror(errno));
476 				sock_close(s);
477 				*noproto = 0;
478 				*inuse = 0;
479 				return -1;
480 			}
481 		}
482 # endif
483 # if defined(IPV6_USE_MIN_MTU)
484 		/*
485 		 * There is no fragmentation of IPv6 datagrams
486 		 * during forwarding in the network. Therefore
487 		 * we do not send UDP datagrams larger than
488 		 * the minimum IPv6 MTU of 1280 octets. The
489 		 * EDNS0 message length can be larger if the
490 		 * network stack supports IPV6_USE_MIN_MTU.
491 		 */
492 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USE_MIN_MTU,
493 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
494 			log_err("setsockopt(..., IPV6_USE_MIN_MTU, "
495 				"...) failed: %s", sock_strerror(errno));
496 			sock_close(s);
497 			*noproto = 0;
498 			*inuse = 0;
499 			return -1;
500 		}
501 # elif defined(IPV6_MTU)
502 #   ifndef USE_WINSOCK
503 		/*
504 		 * On Linux, to send no larger than 1280, the PMTUD is
505 		 * disabled by default for datagrams anyway, so we set
506 		 * the MTU to use.
507 		 */
508 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU,
509 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
510 			log_err("setsockopt(..., IPV6_MTU, ...) failed: %s",
511 				sock_strerror(errno));
512 			sock_close(s);
513 			*noproto = 0;
514 			*inuse = 0;
515 			return -1;
516 		}
517 #   elif defined(IPV6_USER_MTU)
518 		/* As later versions of the mingw crosscompiler define
519 		 * IPV6_MTU, do the same for windows but use IPV6_USER_MTU
520 		 * instead which is writable; IPV6_MTU is readonly there. */
521 		if (setsockopt(s, IPPROTO_IPV6, IPV6_USER_MTU,
522 			(void*)&mtu, (socklen_t)sizeof(mtu)) < 0) {
523 			if (WSAGetLastError() != WSAENOPROTOOPT) {
524 				log_err("setsockopt(..., IPV6_USER_MTU, ...) failed: %s",
525 					wsa_strerror(WSAGetLastError()));
526 				sock_close(s);
527 				*noproto = 0;
528 				*inuse = 0;
529 				return -1;
530 			}
531 		}
532 #   endif /* USE_WINSOCK */
533 # endif /* IPv6 MTU */
534 # if defined(IPV6_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
535 #  if defined(IP_PMTUDISC_OMIT)
536 		action = IP_PMTUDISC_OMIT;
537 		if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
538 			&action, (socklen_t)sizeof(action)) < 0) {
539 
540 			if (errno != EINVAL) {
541 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
542 					strerror(errno));
543 				sock_close(s);
544 				*noproto = 0;
545 				*inuse = 0;
546 				return -1;
547 			}
548 		}
549 		else
550 		{
551 		    omit6_set = 1;
552 		}
553 #  endif
554 		if (omit6_set == 0) {
555 			action = IP_PMTUDISC_DONT;
556 			if (setsockopt(s, IPPROTO_IPV6, IPV6_MTU_DISCOVER,
557 				&action, (socklen_t)sizeof(action)) < 0) {
558 				log_err("setsockopt(..., IPV6_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
559 					strerror(errno));
560 				sock_close(s);
561 				*noproto = 0;
562 				*inuse = 0;
563 				return -1;
564 			}
565 		}
566 # endif /* IPV6_MTU_DISCOVER */
567 	} else if(family == AF_INET) {
568 #  if defined(IP_MTU_DISCOVER) && defined(IP_PMTUDISC_DONT)
569 /* linux 3.15 has IP_PMTUDISC_OMIT, Hannes Frederic Sowa made it so that
570  * PMTU information is not accepted, but fragmentation is allowed
571  * if and only if the packet size exceeds the outgoing interface MTU
572  * (and also uses the interface mtu to determine the size of the packets).
573  * So there won't be any EMSGSIZE error.  Against DNS fragmentation attacks.
574  * FreeBSD already has same semantics without setting the option. */
575 		int omit_set = 0;
576 		int action;
577 #   if defined(IP_PMTUDISC_OMIT)
578 		action = IP_PMTUDISC_OMIT;
579 		if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
580 			&action, (socklen_t)sizeof(action)) < 0) {
581 
582 			if (errno != EINVAL) {
583 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_OMIT...) failed: %s",
584 					strerror(errno));
585 				sock_close(s);
586 				*noproto = 0;
587 				*inuse = 0;
588 				return -1;
589 			}
590 		}
591 		else
592 		{
593 		    omit_set = 1;
594 		}
595 #   endif
596 		if (omit_set == 0) {
597    			action = IP_PMTUDISC_DONT;
598 			if (setsockopt(s, IPPROTO_IP, IP_MTU_DISCOVER,
599 				&action, (socklen_t)sizeof(action)) < 0) {
600 				log_err("setsockopt(..., IP_MTU_DISCOVER, IP_PMTUDISC_DONT...) failed: %s",
601 					strerror(errno));
602 				sock_close(s);
603 				*noproto = 0;
604 				*inuse = 0;
605 				return -1;
606 			}
607 		}
608 #  elif defined(IP_DONTFRAG) && !defined(__APPLE__)
609 		/* the IP_DONTFRAG option if defined in the 11.0 OSX headers,
610 		 * but does not work on that version, so we exclude it */
611 		int off = 0;
612 		if (setsockopt(s, IPPROTO_IP, IP_DONTFRAG,
613 			&off, (socklen_t)sizeof(off)) < 0) {
614 			log_err("setsockopt(..., IP_DONTFRAG, ...) failed: %s",
615 				strerror(errno));
616 			sock_close(s);
617 			*noproto = 0;
618 			*inuse = 0;
619 			return -1;
620 		}
621 #  endif /* IPv4 MTU */
622 	}
623 	if(
624 #ifdef HAVE_SYSTEMD
625 		!got_fd_from_systemd &&
626 #endif
627 		bind(s, (struct sockaddr*)addr, addrlen) != 0) {
628 		*noproto = 0;
629 		*inuse = 0;
630 #ifndef USE_WINSOCK
631 #ifdef EADDRINUSE
632 		*inuse = (errno == EADDRINUSE);
633 		/* detect freebsd jail with no ipv6 permission */
634 		if(family==AF_INET6 && errno==EINVAL)
635 			*noproto = 1;
636 		else if(errno != EADDRINUSE &&
637 			!(errno == EACCES && verbosity < 4 && !listen)
638 #ifdef EADDRNOTAVAIL
639 			&& !(errno == EADDRNOTAVAIL && verbosity < 4 && !listen)
640 #endif
641 			) {
642 			log_err_addr("can't bind socket", strerror(errno),
643 				(struct sockaddr_storage*)addr, addrlen);
644 		}
645 #endif /* EADDRINUSE */
646 #else /* USE_WINSOCK */
647 		if(WSAGetLastError() != WSAEADDRINUSE &&
648 			WSAGetLastError() != WSAEADDRNOTAVAIL &&
649 			!(WSAGetLastError() == WSAEACCES && verbosity < 4 && !listen)) {
650 			log_err_addr("can't bind socket",
651 				wsa_strerror(WSAGetLastError()),
652 				(struct sockaddr_storage*)addr, addrlen);
653 		}
654 #endif /* USE_WINSOCK */
655 		sock_close(s);
656 		return -1;
657 	}
658 	if(!fd_set_nonblock(s)) {
659 		*noproto = 0;
660 		*inuse = 0;
661 		sock_close(s);
662 		return -1;
663 	}
664 	return s;
665 }
666 
667 int
668 create_tcp_accept_sock(struct addrinfo *addr, int v6only, int* noproto,
669 	int* reuseport, int transparent, int mss, int nodelay, int freebind,
670 	int use_systemd, int dscp)
671 {
672 	int s;
673 	char* err;
674 #if defined(SO_REUSEADDR) || defined(SO_REUSEPORT) || defined(IPV6_V6ONLY) || defined(IP_TRANSPARENT) || defined(IP_BINDANY) || defined(IP_FREEBIND) || defined(SO_BINDANY)
675 	int on = 1;
676 #endif
677 #ifdef HAVE_SYSTEMD
678 	int got_fd_from_systemd = 0;
679 #endif
680 #ifdef USE_TCP_FASTOPEN
681 	int qlen;
682 #endif
683 #if !defined(IP_TRANSPARENT) && !defined(IP_BINDANY) && !defined(SO_BINDANY)
684 	(void)transparent;
685 #endif
686 #if !defined(IP_FREEBIND)
687 	(void)freebind;
688 #endif
689 	verbose_print_addr(addr);
690 	*noproto = 0;
691 #ifdef HAVE_SYSTEMD
692 	if (!use_systemd ||
693 	    (use_systemd
694 	     && (s = systemd_get_activated(addr->ai_family, addr->ai_socktype, 1,
695 					   addr->ai_addr, addr->ai_addrlen,
696 					   NULL)) == -1)) {
697 #else
698 	(void)use_systemd;
699 #endif
700 	if((s = socket(addr->ai_family, addr->ai_socktype, 0)) == -1) {
701 #ifndef USE_WINSOCK
702 		if(errno == EAFNOSUPPORT || errno == EPROTONOSUPPORT) {
703 			*noproto = 1;
704 			return -1;
705 		}
706 #else
707 		if(WSAGetLastError() == WSAEAFNOSUPPORT ||
708 			WSAGetLastError() == WSAEPROTONOSUPPORT) {
709 			*noproto = 1;
710 			return -1;
711 		}
712 #endif
713 		log_err("can't create socket: %s", sock_strerror(errno));
714 		return -1;
715 	}
716 	if(nodelay) {
717 #if defined(IPPROTO_TCP) && defined(TCP_NODELAY)
718 		if(setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (void*)&on,
719 			(socklen_t)sizeof(on)) < 0) {
720 			#ifndef USE_WINSOCK
721 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
722 				strerror(errno));
723 			#else
724 			log_err(" setsockopt(.. TCP_NODELAY ..) failed: %s",
725 				wsa_strerror(WSAGetLastError()));
726 			#endif
727 		}
728 #else
729 		log_warn(" setsockopt(TCP_NODELAY) unsupported");
730 #endif /* defined(IPPROTO_TCP) && defined(TCP_NODELAY) */
731 	}
732 	if (mss > 0) {
733 #if defined(IPPROTO_TCP) && defined(TCP_MAXSEG)
734 		if(setsockopt(s, IPPROTO_TCP, TCP_MAXSEG, (void*)&mss,
735 			(socklen_t)sizeof(mss)) < 0) {
736 			log_err(" setsockopt(.. TCP_MAXSEG ..) failed: %s",
737 				sock_strerror(errno));
738 		} else {
739 			verbose(VERB_ALGO,
740 				" tcp socket mss set to %d", mss);
741 		}
742 #else
743 		log_warn(" setsockopt(TCP_MAXSEG) unsupported");
744 #endif /* defined(IPPROTO_TCP) && defined(TCP_MAXSEG) */
745 	}
746 #ifdef HAVE_SYSTEMD
747 	} else {
748 		got_fd_from_systemd = 1;
749     }
750 #endif
751 #ifdef SO_REUSEADDR
752 	if(setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (void*)&on,
753 		(socklen_t)sizeof(on)) < 0) {
754 		log_err("setsockopt(.. SO_REUSEADDR ..) failed: %s",
755 			sock_strerror(errno));
756 		sock_close(s);
757 		return -1;
758 	}
759 #endif /* SO_REUSEADDR */
760 #ifdef IP_FREEBIND
761 	if (freebind && setsockopt(s, IPPROTO_IP, IP_FREEBIND, (void*)&on,
762 	    (socklen_t)sizeof(on)) < 0) {
763 		log_warn("setsockopt(.. IP_FREEBIND ..) failed: %s",
764 		strerror(errno));
765 	}
766 #endif /* IP_FREEBIND */
767 #ifdef SO_REUSEPORT
768 	/* try to set SO_REUSEPORT so that incoming
769 	 * connections are distributed evenly among the receiving threads.
770 	 * Each thread must have its own socket bound to the same port,
771 	 * with SO_REUSEPORT set on each socket.
772 	 */
773 	if (reuseport && *reuseport &&
774 		setsockopt(s, SOL_SOCKET, SO_REUSEPORT, (void*)&on,
775 		(socklen_t)sizeof(on)) < 0) {
776 #ifdef ENOPROTOOPT
777 		if(errno != ENOPROTOOPT || verbosity >= 3)
778 			log_warn("setsockopt(.. SO_REUSEPORT ..) failed: %s",
779 				strerror(errno));
780 #endif
781 		/* this option is not essential, we can continue */
782 		*reuseport = 0;
783 	}
784 #else
785 	(void)reuseport;
786 #endif /* defined(SO_REUSEPORT) */
787 #if defined(IPV6_V6ONLY)
788 	if(addr->ai_family == AF_INET6 && v6only
789 #  ifdef HAVE_SYSTEMD
790 		/* Systemd wants to control if the socket is v6 only
791 		 * or both, with BindIPv6Only=default, ipv6-only or
792 		 * both in systemd.socket, so it is not set here. */
793 		&& !got_fd_from_systemd
794 #  endif
795 		) {
796 		if(setsockopt(s, IPPROTO_IPV6, IPV6_V6ONLY,
797 			(void*)&on, (socklen_t)sizeof(on)) < 0) {
798 			log_err("setsockopt(..., IPV6_V6ONLY, ...) failed: %s",
799 				sock_strerror(errno));
800 			sock_close(s);
801 			return -1;
802 		}
803 	}
804 #else
805 	(void)v6only;
806 #endif /* IPV6_V6ONLY */
807 #ifdef IP_TRANSPARENT
808 	if (transparent &&
809 	    setsockopt(s, IPPROTO_IP, IP_TRANSPARENT, (void*)&on,
810 	    (socklen_t)sizeof(on)) < 0) {
811 		log_warn("setsockopt(.. IP_TRANSPARENT ..) failed: %s",
812 			strerror(errno));
813 	}
814 #elif defined(IP_BINDANY)
815 	if (transparent &&
816 	    setsockopt(s, (addr->ai_family==AF_INET6? IPPROTO_IPV6:IPPROTO_IP),
817 	    (addr->ai_family == AF_INET6? IPV6_BINDANY:IP_BINDANY),
818 	    (void*)&on, (socklen_t)sizeof(on)) < 0) {
819 		log_warn("setsockopt(.. IP%s_BINDANY ..) failed: %s",
820 		(addr->ai_family==AF_INET6?"V6":""), strerror(errno));
821 	}
822 #elif defined(SO_BINDANY)
823 	if (transparent &&
824 	    setsockopt(s, SOL_SOCKET, SO_BINDANY, (void*)&on, (socklen_t)
825 	    sizeof(on)) < 0) {
826 		log_warn("setsockopt(.. SO_BINDANY ..) failed: %s",
827 		strerror(errno));
828 	}
829 #endif /* IP_TRANSPARENT || IP_BINDANY || SO_BINDANY */
830 	err = set_ip_dscp(s, addr->ai_family, dscp);
831 	if(err != NULL)
832 		log_warn("error setting IP DiffServ codepoint %d on TCP socket: %s", dscp, err);
833 	if(
834 #ifdef HAVE_SYSTEMD
835 		!got_fd_from_systemd &&
836 #endif
837         bind(s, addr->ai_addr, addr->ai_addrlen) != 0) {
838 #ifndef USE_WINSOCK
839 		/* detect freebsd jail with no ipv6 permission */
840 		if(addr->ai_family==AF_INET6 && errno==EINVAL)
841 			*noproto = 1;
842 		else {
843 			log_err_addr("can't bind socket", strerror(errno),
844 				(struct sockaddr_storage*)addr->ai_addr,
845 				addr->ai_addrlen);
846 		}
847 #else
848 		log_err_addr("can't bind socket",
849 			wsa_strerror(WSAGetLastError()),
850 			(struct sockaddr_storage*)addr->ai_addr,
851 			addr->ai_addrlen);
852 #endif
853 		sock_close(s);
854 		return -1;
855 	}
856 	if(!fd_set_nonblock(s)) {
857 		sock_close(s);
858 		return -1;
859 	}
860 	if(listen(s, TCP_BACKLOG) == -1) {
861 		log_err("can't listen: %s", sock_strerror(errno));
862 		sock_close(s);
863 		return -1;
864 	}
865 #ifdef USE_TCP_FASTOPEN
866 	/* qlen specifies how many outstanding TFO requests to allow. Limit is a defense
867 	   against IP spoofing attacks as suggested in RFC7413 */
868 #ifdef __APPLE__
869 	/* OS X implementation only supports qlen of 1 via this call. Actual
870 	   value is configured by the net.inet.tcp.fastopen_backlog kernel parm. */
871 	qlen = 1;
872 #else
873 	/* 5 is recommended on linux */
874 	qlen = 5;
875 #endif
876 	if ((setsockopt(s, IPPROTO_TCP, TCP_FASTOPEN, &qlen,
877 		  sizeof(qlen))) == -1 ) {
878 #ifdef ENOPROTOOPT
879 		/* squelch ENOPROTOOPT: freebsd server mode with kernel support
880 		   disabled, except when verbosity enabled for debugging */
881 		if(errno != ENOPROTOOPT || verbosity >= 3) {
882 #endif
883 		  if(errno == EPERM) {
884 		  	log_warn("Setting TCP Fast Open as server failed: %s ; this could likely be because sysctl net.inet.tcp.fastopen.enabled, net.inet.tcp.fastopen.server_enable, or net.ipv4.tcp_fastopen is disabled", strerror(errno));
885 		  } else {
886 		  	log_err("Setting TCP Fast Open as server failed: %s", strerror(errno));
887 		  }
888 #ifdef ENOPROTOOPT
889 		}
890 #endif
891 	}
892 #endif
893 	return s;
894 }
895 
/**
 * Set the DSCP (DiffServ codepoint) on a socket.
 * The codepoint is shifted left by two bits and set with IPV6_TCLASS for
 * AF_INET6 sockets and with IP_TOS for every other address family.
 * @param socket: the socket to set the option on.
 * @param addrfamily: address family of the socket.
 * @param dscp: the codepoint; with 0 nothing is set.
 * @return NULL on success (or if dscp is 0), otherwise a string that
 *	describes the error.
 */
char*
set_ip_dscp(int socket, int addrfamily, int dscp)
{
	int tos;

	/* nothing was requested */
	if(dscp == 0)
		return NULL;

	/* the codepoint is put above the two low bits of the field */
	tos = dscp << 2;

	if(addrfamily == AF_INET6) {
#ifdef IPV6_TCLASS
		if(setsockopt(socket, IPPROTO_IPV6, IPV6_TCLASS, (void*)&tos,
			sizeof(tos)) < 0)
			return sock_strerror(errno);
		return NULL;
#else
		return "IPV6_TCLASS not defined on this system";
#endif
	}

	if(setsockopt(socket, IPPROTO_IP, IP_TOS, (void*)&tos,
		sizeof(tos)) < 0)
		return sock_strerror(errno);
	return NULL;
}
921 
/**
 * Create a local (unix domain) stream socket that listens on a path.
 * When systemd support is compiled in and use_systemd is set, a socket
 * passed by systemd is returned if one matches.  Otherwise a previous
 * socket file at the path is removed, and a new socket is created, bound
 * to the path, made nonblocking and put in listen mode.
 * @param path: filesystem path of the socket.  It must fit in the
 *	sun_path member of struct sockaddr_un, otherwise the call fails.
 * @param noproto: set to 1 if local sockets are not supported on this
 *	system; not touched otherwise.
 * @param use_systemd: if nonzero, try to get the socket from systemd.
 * @return the socket, or -1 on failure.
 */
int
create_local_accept_sock(const char *path, int* noproto, int use_systemd)
{
#ifdef HAVE_SYS_UN_H
	int s;
	struct sockaddr_un usock;
#endif
#ifdef HAVE_SYSTEMD
	if(use_systemd) {
		int ret = systemd_get_activated(AF_LOCAL, SOCK_STREAM, 1,
			NULL, 0, path);
		if(ret != -1)
			return ret;
		/* systemd has no socket for us, create it ourselves */
	}
#else
	(void)use_systemd;
#endif

#ifdef HAVE_SYS_UN_H
	verbose(VERB_ALGO, "creating unix socket %s", path);
	/* start from a zeroed address, all of it is passed to bind */
	memset(&usock, 0, sizeof(usock));
#ifdef HAVE_STRUCT_SOCKADDR_UN_SUN_LEN
	/* this member exists on BSDs, not Linux */
	usock.sun_len = (unsigned)sizeof(usock);
#endif
	usock.sun_family = AF_LOCAL;
	/* length is 92-108, 104 on FreeBSD */
	if(strlcpy(usock.sun_path, path, sizeof(usock.sun_path)) >=
		sizeof(usock.sun_path)) {
		/* a truncated path would bind a different file than the
		 * one that is unlinked below */
		log_err("Cannot create local socket %s (path too long)",
			path);
		return -1;
	}

	if ((s = socket(AF_LOCAL, SOCK_STREAM, 0)) == -1) {
		log_err("Cannot create local socket %s (%s)",
			path, strerror(errno));
		return -1;
	}

	if (unlink(path) && errno != ENOENT) {
		/* The socket already exists and cannot be removed */
		log_err("Cannot remove old local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (bind(s, (struct sockaddr *)&usock,
		(socklen_t)sizeof(struct sockaddr_un)) == -1) {
		log_err("Cannot bind local socket %s (%s)",
			path, strerror(errno));
		goto err;
	}

	if (!fd_set_nonblock(s)) {
		log_err("Cannot set non-blocking mode");
		goto err;
	}

	if (listen(s, TCP_BACKLOG) == -1) {
		log_err("can't listen: %s", strerror(errno));
		goto err;
	}

	(void)noproto; /*unused*/
	return s;

err:
	sock_close(s);
	return -1;
#else
	(void)path;
	log_err("Local sockets are not supported");
	*noproto = 1;
	return -1;
#endif
}
996 
997 
998 /**
999  * Create socket from getaddrinfo results
1000  */
1001 static int
1002 make_sock(int stype, const char* ifname, const char* port,
1003 	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
1004 	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
1005 	int use_systemd, int dscp, struct unbound_socket* ub_sock)
1006 {
1007 	struct addrinfo *res = NULL;
1008 	int r, s, inuse, noproto;
1009 	hints->ai_socktype = stype;
1010 	*noip6 = 0;
1011 	if((r=getaddrinfo(ifname, port, hints, &res)) != 0 || !res) {
1012 #ifdef USE_WINSOCK
1013 		if(r == EAI_NONAME && hints->ai_family == AF_INET6){
1014 			*noip6 = 1; /* 'Host not found' for IP6 on winXP */
1015 			return -1;
1016 		}
1017 #endif
1018 		log_err("node %s:%s getaddrinfo: %s %s",
1019 			ifname?ifname:"default", port, gai_strerror(r),
1020 #ifdef EAI_SYSTEM
1021 			r==EAI_SYSTEM?(char*)strerror(errno):""
1022 #else
1023 			""
1024 #endif
1025 		);
1026 		return -1;
1027 	}
1028 	if(stype == SOCK_DGRAM) {
1029 		verbose_print_addr(res);
1030 		s = create_udp_sock(res->ai_family, res->ai_socktype,
1031 			(struct sockaddr*)res->ai_addr, res->ai_addrlen,
1032 			v6only, &inuse, &noproto, (int)rcv, (int)snd, 1,
1033 			reuseport, transparent, freebind, use_systemd, dscp);
1034 		if(s == -1 && inuse) {
1035 			log_err("bind: address already in use");
1036 		} else if(s == -1 && noproto && hints->ai_family == AF_INET6){
1037 			*noip6 = 1;
1038 		}
1039 	} else	{
1040 		s = create_tcp_accept_sock(res, v6only, &noproto, reuseport,
1041 			transparent, tcp_mss, nodelay, freebind, use_systemd,
1042 			dscp);
1043 		if(s == -1 && noproto && hints->ai_family == AF_INET6){
1044 			*noip6 = 1;
1045 		}
1046 	}
1047 
1048 	ub_sock->addr = res;
1049 	ub_sock->s = s;
1050 	ub_sock->fam = hints->ai_family;
1051 	ub_sock->acl = NULL;
1052 
1053 	return s;
1054 }
1055 
/**
 * Make a socket, honouring an "address@port" interface specification.
 * Without '@' in ifname this is a plain make_sock call with the given port.
 * With '@', the part before it is used as the interface and the part after
 * it replaces port. Parts that do not fit the local buffers are rejected
 * with -1 and *noip6 set to 0.
 */
static int
make_sock_port(int stype, const char* ifname, const char* port,
	struct addrinfo *hints, int v6only, int* noip6, size_t rcv, size_t snd,
	int* reuseport, int transparent, int tcp_mss, int nodelay, int freebind,
	int use_systemd, int dscp, struct unbound_socket* ub_sock)
{
	char newif[128];
	char p[16];
	size_t iflen, portlen;
	const char* at = strchr(ifname, '@');

	if(at == NULL) {
		return make_sock(stype, ifname, port, hints, v6only, noip6,
			rcv, snd, reuseport, transparent, tcp_mss, nodelay,
			freebind, use_systemd, dscp, ub_sock);
	}

	/* override port with ifspec@port */
	iflen = (size_t)(at - ifname);
	if(iflen >= sizeof(newif)) {
		log_err("ifname too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	portlen = strlen(at+1);
	if(portlen >= sizeof(p)) {
		log_err("portnumber too long: %s", ifname);
		*noip6 = 0;
		return -1;
	}
	/* both lengths were checked above, so the copies fit with the 0 */
	memcpy(newif, ifname, iflen);
	newif[iflen] = 0;
	memcpy(p, at+1, portlen);
	p[portlen] = 0;

	return make_sock(stype, newif, p, hints, v6only, noip6, rcv,
		snd, reuseport, transparent, tcp_mss, nodelay, freebind,
		use_systemd, dscp, ub_sock);
}
1090 
1091 /**
1092  * Add port to open ports list.
1093  * @param list: list head. changed.
1094  * @param s: fd.
1095  * @param ftype: if fd is UDP.
1096  * @param pp2_enabled: if PROXYv2 is enabled for this port.
1097  * @param ub_sock: socket with address.
1098  * @return false on failure. list in unchanged then.
1099  */
1100 static int
1101 port_insert(struct listen_port** list, int s, enum listen_type ftype,
1102 	int pp2_enabled, struct unbound_socket* ub_sock)
1103 {
1104 	struct listen_port* item = (struct listen_port*)malloc(
1105 		sizeof(struct listen_port));
1106 	if(!item)
1107 		return 0;
1108 	item->next = *list;
1109 	item->fd = s;
1110 	item->ftype = ftype;
1111 	item->pp2_enabled = pp2_enabled;
1112 	item->socket = ub_sock;
1113 	*list = item;
1114 	return 1;
1115 }
1116 
/**
 * Set fd to receive source address packet info.
 * Which socket option is used is chosen at compile time:
 * for AF_INET6, IPV6_RECVPKTINFO if defined, otherwise IPV6_PKTINFO;
 * for AF_INET, IP_PKTINFO if defined, otherwise IP_RECVDSTADDR (only when
 * IP_SENDSRCADDR is defined as well).
 * If no usable option was compiled in for the family, an error is logged.
 * Families other than AF_INET6 and AF_INET are accepted without setting
 * any option.
 * @param s: the socket fd.
 * @param family: AF_INET6 or AF_INET.
 * @return false if the option could not be set or is not available.
 */
static int
set_recvpktinfo(int s, int family)
{
#if defined(IPV6_RECVPKTINFO) || defined(IPV6_PKTINFO) || (defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)) || defined(IP_PKTINFO)
	int on = 1;
#else
	/* no option compiled in at all; s is not used then */
	(void)s;
#endif
	if(family == AF_INET6) {
#           ifdef IPV6_RECVPKTINFO
		if(setsockopt(s, IPPROTO_IPV6, IPV6_RECVPKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_RECVPKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           elif defined(IPV6_PKTINFO)
		if(setsockopt(s, IPPROTO_IPV6, IPV6_PKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IPV6_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           else
		log_err("no IPV6_RECVPKTINFO and IPV6_PKTINFO options, please "
			"disable interface-automatic or do-ip6 in config");
		return 0;
#           endif /* defined IPV6_RECVPKTINFO */

	} else if(family == AF_INET) {
#           ifdef IP_PKTINFO
		if(setsockopt(s, IPPROTO_IP, IP_PKTINFO,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IP_PKTINFO, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           elif defined(IP_RECVDSTADDR) && defined(IP_SENDSRCADDR)
		if(setsockopt(s, IPPROTO_IP, IP_RECVDSTADDR,
			(void*)&on, (socklen_t)sizeof(on)) < 0) {
			log_err("setsockopt(..., IP_RECVDSTADDR, ...) failed: %s",
				strerror(errno));
			return 0;
		}
#           else
		log_err("no IP_SENDSRCADDR or IP_PKTINFO option, please disable "
			"interface-automatic or do-ip4 in config");
		return 0;
#           endif /* IP_PKTINFO */

	}
	return 1;
}
1171 
1172 /** see if interface is ssl, its port number == the ssl port number */
1173 static int
1174 if_is_ssl(const char* ifname, const char* port, int ssl_port,
1175 	struct config_strlist* tls_additional_port)
1176 {
1177 	struct config_strlist* s;
1178 	char* p = strchr(ifname, '@');
1179 	if(!p && atoi(port) == ssl_port)
1180 		return 1;
1181 	if(p && atoi(p+1) == ssl_port)
1182 		return 1;
1183 	for(s = tls_additional_port; s; s = s->next) {
1184 		if(p && atoi(p+1) == atoi(s->str))
1185 			return 1;
1186 		if(!p && atoi(port) == atoi(s->str))
1187 			return 1;
1188 	}
1189 	return 0;
1190 }
1191 
1192 /**
1193  * Helper for ports_open. Creates one interface (or NULL for default).
1194  * @param ifname: The interface ip address.
1195  * @param do_auto: use automatic interface detection.
1196  * 	If enabled, then ifname must be the wildcard name.
1197  * @param do_udp: if udp should be used.
1198  * @param do_tcp: if tcp should be used.
1199  * @param hints: for getaddrinfo. family and flags have to be set by caller.
1200  * @param port: Port number to use (as string).
1201  * @param list: list of open ports, appended to, changed to point to list head.
1202  * @param rcv: receive buffer size for UDP
1203  * @param snd: send buffer size for UDP
1204  * @param ssl_port: ssl service port number
1205  * @param tls_additional_port: list of additional ssl service port numbers.
1206  * @param https_port: DoH service port number
1207  * @param proxy_protocol_port: list of PROXYv2 port numbers.
1208  * @param reuseport: try to set SO_REUSEPORT if nonNULL and true.
1209  * 	set to false on exit if reuseport failed due to no kernel support.
1210  * @param transparent: set IP_TRANSPARENT socket option.
1211  * @param tcp_mss: maximum segment size of tcp socket. default if zero.
1212  * @param freebind: set IP_FREEBIND socket option.
1213  * @param http2_nodelay: set TCP_NODELAY on HTTP/2 connection
1214  * @param use_systemd: if true, fetch sockets from systemd.
1215  * @param dnscrypt_port: dnscrypt service port number
1216  * @param dscp: DSCP to use.
1217  * @return: returns false on error.
1218  */
1219 static int
1220 ports_create_if(const char* ifname, int do_auto, int do_udp, int do_tcp,
1221 	struct addrinfo *hints, const char* port, struct listen_port** list,
1222 	size_t rcv, size_t snd, int ssl_port,
1223 	struct config_strlist* tls_additional_port, int https_port,
1224 	struct config_strlist* proxy_protocol_port,
1225 	int* reuseport, int transparent, int tcp_mss, int freebind,
1226 	int http2_nodelay, int use_systemd, int dnscrypt_port, int dscp)
1227 {
1228 	int s, noip6=0;
1229 	int is_https = if_is_https(ifname, port, https_port);
1230 	int is_dnscrypt = if_is_dnscrypt(ifname, port, dnscrypt_port);
1231 	int is_pp2 = if_is_pp2(ifname, port, proxy_protocol_port);
1232 	int nodelay = is_https && http2_nodelay;
1233 	struct unbound_socket* ub_sock;
1234 
1235 	if(!do_udp && !do_tcp)
1236 		return 0;
1237 
1238 	if(is_pp2) {
1239 		if(is_dnscrypt) {
1240 			fatal_exit("PROXYv2 and DNSCrypt combination not "
1241 				"supported!");
1242 		} else if(is_https) {
1243 			fatal_exit("PROXYv2 and DoH combination not "
1244 				"supported!");
1245 		}
1246 	}
1247 
1248 	if(do_auto) {
1249 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1250 		if(!ub_sock)
1251 			return 0;
1252 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1253 			&noip6, rcv, snd, reuseport, transparent,
1254 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1255 			freeaddrinfo(ub_sock->addr);
1256 			free(ub_sock);
1257 			if(noip6) {
1258 				log_warn("IPv6 protocol not available");
1259 				return 1;
1260 			}
1261 			return 0;
1262 		}
1263 		/* getting source addr packet info is highly non-portable */
1264 		if(!set_recvpktinfo(s, hints->ai_family)) {
1265 			sock_close(s);
1266 			freeaddrinfo(ub_sock->addr);
1267 			free(ub_sock);
1268 			return 0;
1269 		}
1270 		if(!port_insert(list, s, is_dnscrypt
1271 			?listen_type_udpancil_dnscrypt:listen_type_udpancil,
1272 			is_pp2, ub_sock)) {
1273 			sock_close(s);
1274 			freeaddrinfo(ub_sock->addr);
1275 			free(ub_sock);
1276 			return 0;
1277 		}
1278 	} else if(do_udp) {
1279 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1280 		if(!ub_sock)
1281 			return 0;
1282 		/* regular udp socket */
1283 		if((s = make_sock_port(SOCK_DGRAM, ifname, port, hints, 1,
1284 			&noip6, rcv, snd, reuseport, transparent,
1285 			tcp_mss, nodelay, freebind, use_systemd, dscp, ub_sock)) == -1) {
1286 			freeaddrinfo(ub_sock->addr);
1287 			free(ub_sock);
1288 			if(noip6) {
1289 				log_warn("IPv6 protocol not available");
1290 				return 1;
1291 			}
1292 			return 0;
1293 		}
1294 		if(!port_insert(list, s, is_dnscrypt
1295 			?listen_type_udp_dnscrypt:listen_type_udp,
1296 			is_pp2, ub_sock)) {
1297 			sock_close(s);
1298 			freeaddrinfo(ub_sock->addr);
1299 			free(ub_sock);
1300 			return 0;
1301 		}
1302 	}
1303 	if(do_tcp) {
1304 		int is_ssl = if_is_ssl(ifname, port, ssl_port,
1305 			tls_additional_port);
1306 		enum listen_type port_type;
1307 		ub_sock = calloc(1, sizeof(struct unbound_socket));
1308 		if(!ub_sock)
1309 			return 0;
1310 		if(is_ssl)
1311 			port_type = listen_type_ssl;
1312 		else if(is_https)
1313 			port_type = listen_type_http;
1314 		else if(is_dnscrypt)
1315 			port_type = listen_type_tcp_dnscrypt;
1316 		else
1317 			port_type = listen_type_tcp;
1318 		if((s = make_sock_port(SOCK_STREAM, ifname, port, hints, 1,
1319 			&noip6, 0, 0, reuseport, transparent, tcp_mss, nodelay,
1320 			freebind, use_systemd, dscp, ub_sock)) == -1) {
1321 			freeaddrinfo(ub_sock->addr);
1322 			free(ub_sock);
1323 			if(noip6) {
1324 				/*log_warn("IPv6 protocol not available");*/
1325 				return 1;
1326 			}
1327 			return 0;
1328 		}
1329 		if(is_ssl)
1330 			verbose(VERB_ALGO, "setup TCP for SSL service");
1331 		if(!port_insert(list, s, port_type, is_pp2, ub_sock)) {
1332 			sock_close(s);
1333 			freeaddrinfo(ub_sock->addr);
1334 			free(ub_sock);
1335 			return 0;
1336 		}
1337 	}
1338 	return 1;
1339 }
1340 
1341 /**
1342  * Add items to commpoint list in front.
1343  * @param c: commpoint to add.
1344  * @param front: listen struct.
1345  * @return: false on failure.
1346  */
1347 static int
1348 listen_cp_insert(struct comm_point* c, struct listen_dnsport* front)
1349 {
1350 	struct listen_list* item = (struct listen_list*)malloc(
1351 		sizeof(struct listen_list));
1352 	if(!item)
1353 		return 0;
1354 	item->com = c;
1355 	item->next = front->cps;
1356 	front->cps = item;
1357 	return 1;
1358 }
1359 
1360 void listen_setup_locks(void)
1361 {
1362 	if(!stream_wait_lock_inited) {
1363 		lock_basic_init(&stream_wait_count_lock);
1364 		stream_wait_lock_inited = 1;
1365 	}
1366 	if(!http2_query_buffer_lock_inited) {
1367 		lock_basic_init(&http2_query_buffer_count_lock);
1368 		http2_query_buffer_lock_inited = 1;
1369 	}
1370 	if(!http2_response_buffer_lock_inited) {
1371 		lock_basic_init(&http2_response_buffer_count_lock);
1372 		http2_response_buffer_lock_inited = 1;
1373 	}
1374 }
1375 
1376 void listen_desetup_locks(void)
1377 {
1378 	if(stream_wait_lock_inited) {
1379 		stream_wait_lock_inited = 0;
1380 		lock_basic_destroy(&stream_wait_count_lock);
1381 	}
1382 	if(http2_query_buffer_lock_inited) {
1383 		http2_query_buffer_lock_inited = 0;
1384 		lock_basic_destroy(&http2_query_buffer_count_lock);
1385 	}
1386 	if(http2_response_buffer_lock_inited) {
1387 		http2_response_buffer_lock_inited = 0;
1388 		lock_basic_destroy(&http2_response_buffer_count_lock);
1389 	}
1390 }
1391 
/**
 * Create the listen structure: allocate the shared udp buffer and create
 * one commpoint for every entry of the ports list, by listen type.
 * The commpoints are marked do_not_close and get dtenv; the ssl context
 * is attached to ssl ports and to http ports unless http_notls is set.
 * On any failure everything created so far is deleted with listen_delete.
 * @param base: the comm base the commpoints are created on.
 * @param ports: list of open ports to create commpoints for.
 * @param bufsize: size of the udp buffer, also passed to the tcp
 *	commpoints and used for the dnscrypt buffers.
 * @param tcp_accept_count: passed to comm_point_create_tcp.
 * @param tcp_idle_timeout: passed to comm_point_create_tcp.
 * @param harden_large_queries: passed to comm_point_create_tcp.
 * @param http_max_streams: passed on for ssl and http ports; 0 is passed
 *	for plain tcp ports.
 * @param http_endpoint: passed on for ssl and http ports; NULL is passed
 *	for plain tcp ports.
 * @param http_notls: if true, http ports do not get the ssl context.
 * @param tcp_conn_limit: passed to comm_point_create_tcp.
 * @param sslctx: ssl context for ssl (and http) ports, can be NULL.
 * @param dtenv: stored in every commpoint.
 * @param cb: callback for the commpoints.
 * @param cb_arg: user argument for the callback.
 * @return the listen structure, or NULL on failure or if the ports list
 *	did not result in any commpoint.
 */
struct listen_dnsport*
listen_create(struct comm_base* base, struct listen_port* ports,
	size_t bufsize, int tcp_accept_count, int tcp_idle_timeout,
	int harden_large_queries, uint32_t http_max_streams,
	char* http_endpoint, int http_notls, struct tcl_list* tcp_conn_limit,
	void* sslctx, struct dt_env* dtenv, comm_point_callback_type* cb,
	void *cb_arg)
{
	struct listen_dnsport* front = (struct listen_dnsport*)
		malloc(sizeof(struct listen_dnsport));
	if(!front)
		return NULL;
	front->cps = NULL;
	front->udp_buff = sldns_buffer_new(bufsize);
#ifdef USE_DNSCRYPT
	/* stays NULL unless a dnscrypt port is found in the list below */
	front->dnscrypt_udp_buff = NULL;
#endif
	if(!front->udp_buff) {
		free(front);
		return NULL;
	}

	/* create comm points as needed */
	while(ports) {
		struct comm_point* cp = NULL;
		if(ports->ftype == listen_type_udp ||
		   ports->ftype == listen_type_udp_dnscrypt) {
			cp = comm_point_create_udp(base, ports->fd,
				front->udp_buff, ports->pp2_enabled, cb,
				cb_arg, ports->socket);
		} else if(ports->ftype == listen_type_tcp ||
				ports->ftype == listen_type_tcp_dnscrypt) {
			/* plain tcp: no http stream limit and no endpoint */
			cp = comm_point_create_tcp(base, ports->fd,
				tcp_accept_count, tcp_idle_timeout,
				harden_large_queries, 0, NULL,
				tcp_conn_limit, bufsize, front->udp_buff,
				ports->ftype, ports->pp2_enabled, cb, cb_arg,
				ports->socket);
		} else if(ports->ftype == listen_type_ssl ||
			ports->ftype == listen_type_http) {
			cp = comm_point_create_tcp(base, ports->fd,
				tcp_accept_count, tcp_idle_timeout,
				harden_large_queries,
				http_max_streams, http_endpoint,
				tcp_conn_limit, bufsize, front->udp_buff,
				ports->ftype, ports->pp2_enabled, cb, cb_arg,
				ports->socket);
			/* the checks below only warn; the port is still set up */
			if(ports->ftype == listen_type_http) {
				if(!sslctx && !http_notls) {
					log_warn("HTTPS port configured, but "
						"no TLS tls-service-key or "
						"tls-service-pem set");
				}
#ifndef HAVE_SSL_CTX_SET_ALPN_SELECT_CB
				if(!http_notls) {
					log_warn("Unbound is not compiled "
						"with an OpenSSL version "
						"supporting ALPN "
						"(OpenSSL >= 1.0.2). This "
						"is required to use "
						"DNS-over-HTTPS");
				}
#endif
#ifndef HAVE_NGHTTP2_NGHTTP2_H
				log_warn("Unbound is not compiled with "
					"nghttp2. This is required to use "
					"DNS-over-HTTPS.");
#endif
			}
		} else if(ports->ftype == listen_type_udpancil ||
				  ports->ftype == listen_type_udpancil_dnscrypt) {
			cp = comm_point_create_udp_ancil(base, ports->fd,
				front->udp_buff, ports->pp2_enabled, cb,
				cb_arg, ports->socket);
		}
		/* cp is also NULL here for a listen type not handled above */
		if(!cp) {
			log_err("can't create commpoint");
			listen_delete(front);
			return NULL;
		}
		/* only ssl ports, and http ports that use tls, get the
		 * ssl context */
		if((http_notls && ports->ftype == listen_type_http) ||
			(ports->ftype == listen_type_tcp) ||
			(ports->ftype == listen_type_udp) ||
			(ports->ftype == listen_type_udpancil) ||
			(ports->ftype == listen_type_tcp_dnscrypt) ||
			(ports->ftype == listen_type_udp_dnscrypt) ||
			(ports->ftype == listen_type_udpancil_dnscrypt))
			cp->ssl = NULL;
		else
			cp->ssl = sslctx;
		cp->dtenv = dtenv;
		cp->do_not_close = 1;
#ifdef USE_DNSCRYPT
		if (ports->ftype == listen_type_udp_dnscrypt ||
			ports->ftype == listen_type_tcp_dnscrypt ||
			ports->ftype == listen_type_udpancil_dnscrypt) {
			cp->dnscrypt = 1;
			cp->dnscrypt_buffer = sldns_buffer_new(bufsize);
			if(!cp->dnscrypt_buffer) {
				log_err("can't alloc dnscrypt_buffer");
				/* cp is not in the list yet, delete it here */
				comm_point_delete(cp);
				listen_delete(front);
				return NULL;
			}
			/* NOTE(review): every dnscrypt port allocates its own
			 * buffer, and this pointer ends up at the buffer of
			 * the last dnscrypt port in the list. Confirm who
			 * frees the buffers of the other ports. */
			front->dnscrypt_udp_buff = cp->dnscrypt_buffer;
		}
#endif
		if(!listen_cp_insert(cp, front)) {
			log_err("malloc failed");
			comm_point_delete(cp);
			listen_delete(front);
			return NULL;
		}
		ports = ports->next;
	}
	/* an empty ports list ends up here without any commpoint */
	if(!front->cps) {
		log_err("Could not open sockets to accept queries.");
		listen_delete(front);
		return NULL;
	}

	return front;
}
1515 
1516 void
1517 listen_list_delete(struct listen_list* list)
1518 {
1519 	struct listen_list *p = list, *pn;
1520 	while(p) {
1521 		pn = p->next;
1522 		comm_point_delete(p->com);
1523 		free(p);
1524 		p = pn;
1525 	}
1526 }
1527 
1528 void
1529 listen_delete(struct listen_dnsport* front)
1530 {
1531 	if(!front)
1532 		return;
1533 	listen_list_delete(front->cps);
1534 #ifdef USE_DNSCRYPT
1535 	if(front->dnscrypt_udp_buff &&
1536 		front->udp_buff != front->dnscrypt_udp_buff) {
1537 		sldns_buffer_free(front->dnscrypt_udp_buff);
1538 	}
1539 #endif
1540 	sldns_buffer_free(front->udp_buff);
1541 	free(front);
1542 }
1543 
#ifdef HAVE_GETIFADDRS
/**
 * Append a copy of str to the array of strings.
 * @param ip_addresses: the array, reallocated to hold one more element.
 * @param ip_addresses_size: number of elements, incremented on success.
 * @param str: string to duplicate into the new element.
 * @return false on allocation failure (logged). The count is not
 *	incremented then, but the array may already have been reallocated.
 */
static int
resolve_ifa_append(char ***ip_addresses, int *ip_addresses_size,
	const char *str)
{
	void *grown = realloc(*ip_addresses,
		sizeof(char *) * (*ip_addresses_size + 1));
	if(!grown) {
		log_err("realloc failed: out of memory");
		return 0;
	}
	*ip_addresses = grown;
	(*ip_addresses)[*ip_addresses_size] = strdup(str);
	if(!(*ip_addresses)[*ip_addresses_size]) {
		log_err("strdup failed: out of memory");
		return 0;
	}
	(*ip_addresses_size)++;
	return 1;
}

/**
 * Resolve an interface name, with optional "@port" suffix, to the IP
 * addresses of that interface and append them to the array, each with
 * the same "@port" suffix. IPv6 addresses get a "%scope" part if the
 * scope id maps to an interface name. If no interface address matches,
 * search_ifa itself is appended unchanged.
 * @param ifas: list from getifaddrs.
 * @param search_ifa: the name to look for.
 * @param ip_addresses: array of strings that is appended to.
 * @param ip_addresses_size: number of elements in the array, updated.
 * @return false on failure.
 */
static int
resolve_ifa_name(struct ifaddrs *ifas, const char *search_ifa, char ***ip_addresses, int *ip_addresses_size)
{
	struct ifaddrs *ifa;
	const int size_before = *ip_addresses_size;
	/* the suffix starts at the last '@'; the name is what precedes it,
	 * or the whole string if there is no '@' */
	const char* suffix = strrchr(search_ifa, '@');
	size_t namelen;

	if(suffix != NULL) {
		namelen = (size_t)(suffix - search_ifa);
	} else {
		namelen = strlen(search_ifa);
		suffix = "";
	}

	for(ifa = ifas; ifa != NULL; ifa = ifa->ifa_next) {
		sa_family_t family;
#ifdef INET6      /* |   address ip    | % |  ifa name  | @ |  port  | nul */
		char addr_buf[INET6_ADDRSTRLEN + 1 + IF_NAMESIZE + 1 + 16 + 1];
#else
		char addr_buf[INET_ADDRSTRLEN + 1 + 16 + 1];
#endif

		/* the interface name has to match the name part exactly */
		if(strlen(ifa->ifa_name) != namelen ||
			strncmp(ifa->ifa_name, search_ifa, namelen) != 0)
			continue;
		if(ifa->ifa_addr == NULL)
			continue;

		family = ifa->ifa_addr->sa_family;
		if(family == AF_INET) {
			char a4[INET_ADDRSTRLEN + 1];
			struct sockaddr_in *in4 = (struct sockaddr_in *)
				ifa->ifa_addr;
			if(!inet_ntop(family, &in4->sin_addr, a4, sizeof(a4))) {
				log_err("inet_ntop failed");
				return 0;
			}
			snprintf(addr_buf, sizeof(addr_buf), "%s%s",
				a4, suffix);
		}
#ifdef INET6
		else if(family == AF_INET6) {
			char a6[INET6_ADDRSTRLEN + 1];
			char scope_name[IF_NAMESIZE + 1];
			struct sockaddr_in6 *in6 = (struct sockaddr_in6 *)
				ifa->ifa_addr;
			scope_name[0] = 0;
			if(!inet_ntop(family, &in6->sin6_addr, a6, sizeof(a6))) {
				log_err("inet_ntop failed");
				return 0;
			}
			/* the result is not checked: scope_name stays empty
			 * if nothing is written to it */
			(void)if_indextoname(in6->sin6_scope_id,
				(char *)scope_name);
			if(scope_name[0] != 0) {
				snprintf(addr_buf, sizeof(addr_buf),
					"%s%%%s%s", a6, scope_name, suffix);
			} else {
				snprintf(addr_buf, sizeof(addr_buf), "%s%s",
					a6, suffix);
			}
		}
#endif
		else {
			continue;
		}
		verbose(4, "interface %s has address %s", search_ifa, addr_buf);

		if(!resolve_ifa_append(ip_addresses, ip_addresses_size,
			addr_buf))
			return 0;
	}

	/* nothing was added: keep the original string in the result */
	if(*ip_addresses_size == size_before)
		return resolve_ifa_append(ip_addresses, ip_addresses_size,
			search_ifa);
	return 1;
}
#endif /* HAVE_GETIFADDRS */
1647 
1648 int resolve_interface_names(char** ifs, int num_ifs,
1649 	struct config_strlist* list, char*** resif, int* num_resif)
1650 {
1651 #ifdef HAVE_GETIFADDRS
1652 	struct ifaddrs *addrs = NULL;
1653 	if(num_ifs == 0 && list == NULL) {
1654 		*resif = NULL;
1655 		*num_resif = 0;
1656 		return 1;
1657 	}
1658 	if(getifaddrs(&addrs) == -1) {
1659 		log_err("failed to list interfaces: getifaddrs: %s",
1660 			strerror(errno));
1661 		freeifaddrs(addrs);
1662 		return 0;
1663 	}
1664 	if(ifs) {
1665 		int i;
1666 		for(i=0; i<num_ifs; i++) {
1667 			if(!resolve_ifa_name(addrs, ifs[i], resif, num_resif)) {
1668 				freeifaddrs(addrs);
1669 				config_del_strarray(*resif, *num_resif);
1670 				*resif = NULL;
1671 				*num_resif = 0;
1672 				return 0;
1673 			}
1674 		}
1675 	}
1676 	if(list) {
1677 		struct config_strlist* p;
1678 		for(p = list; p; p = p->next) {
1679 			if(!resolve_ifa_name(addrs, p->str, resif, num_resif)) {
1680 				freeifaddrs(addrs);
1681 				config_del_strarray(*resif, *num_resif);
1682 				*resif = NULL;
1683 				*num_resif = 0;
1684 				return 0;
1685 			}
1686 }
1687 	}
1688 	freeifaddrs(addrs);
1689 	return 1;
1690 #else
1691 	struct config_strlist* p;
1692 	if(num_ifs == 0 && list == NULL) {
1693 		*resif = NULL;
1694 		*num_resif = 0;
1695 		return 1;
1696 	}
1697 	*num_resif = num_ifs;
1698 	for(p = list; p; p = p->next) {
1699 		(*num_resif)++;
1700 	}
1701 	*resif = calloc(*num_resif, sizeof(**resif));
1702 	if(!*resif) {
1703 		log_err("out of memory");
1704 		return 0;
1705 	}
1706 	if(ifs) {
1707 		int i;
1708 		for(i=0; i<num_ifs; i++) {
1709 			(*resif)[i] = strdup(ifs[i]);
1710 			if(!((*resif)[i])) {
1711 				log_err("out of memory");
1712 				config_del_strarray(*resif, *num_resif);
1713 				*resif = NULL;
1714 				*num_resif = 0;
1715 				return 0;
1716 			}
1717 		}
1718 	}
1719 	if(list) {
1720 		int idx = num_ifs;
1721 		for(p = list; p; p = p->next) {
1722 			(*resif)[idx] = strdup(p->str);
1723 			if(!((*resif)[idx])) {
1724 				log_err("out of memory");
1725 				config_del_strarray(*resif, *num_resif);
1726 				*resif = NULL;
1727 				*num_resif = 0;
1728 				return 0;
1729 			}
1730 			idx++;
1731 		}
1732 	}
1733 	return 1;
1734 #endif /* HAVE_GETIFADDRS */
1735 }
1736 
1737 struct listen_port*
1738 listening_ports_open(struct config_file* cfg, char** ifs, int num_ifs,
1739 	int* reuseport)
1740 {
1741 	struct listen_port* list = NULL;
1742 	struct addrinfo hints;
1743 	int i, do_ip4, do_ip6;
1744 	int do_tcp, do_auto;
1745 	char portbuf[32];
1746 	snprintf(portbuf, sizeof(portbuf), "%d", cfg->port);
1747 	do_ip4 = cfg->do_ip4;
1748 	do_ip6 = cfg->do_ip6;
1749 	do_tcp = cfg->do_tcp;
1750 	do_auto = cfg->if_automatic && cfg->do_udp;
1751 	if(cfg->incoming_num_tcp == 0)
1752 		do_tcp = 0;
1753 
1754 	/* getaddrinfo */
1755 	memset(&hints, 0, sizeof(hints));
1756 	hints.ai_flags = AI_PASSIVE;
1757 	/* no name lookups on our listening ports */
1758 	if(num_ifs > 0)
1759 		hints.ai_flags |= AI_NUMERICHOST;
1760 	hints.ai_family = AF_UNSPEC;
1761 #ifndef INET6
1762 	do_ip6 = 0;
1763 #endif
1764 	if(!do_ip4 && !do_ip6) {
1765 		return NULL;
1766 	}
1767 	/* create ip4 and ip6 ports so that return addresses are nice. */
1768 	if(do_auto || num_ifs == 0) {
1769 		if(do_auto && cfg->if_automatic_ports &&
1770 			cfg->if_automatic_ports[0]!=0) {
1771 			char* now = cfg->if_automatic_ports;
1772 			while(now && *now) {
1773 				char* after;
1774 				int extraport;
1775 				while(isspace((unsigned char)*now))
1776 					now++;
1777 				if(!*now)
1778 					break;
1779 				after = now;
1780 				extraport = (int)strtol(now, &after, 10);
1781 				if(extraport < 0 || extraport > 65535) {
1782 					log_err("interface-automatic-ports port number out of range, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1783 					listening_ports_free(list);
1784 					return NULL;
1785 				}
1786 				if(extraport == 0 && now == after) {
1787 					log_err("interface-automatic-ports could not be parsed, at position %d of '%s'", (int)(now-cfg->if_automatic_ports)+1, cfg->if_automatic_ports);
1788 					listening_ports_free(list);
1789 					return NULL;
1790 				}
1791 				now = after;
1792 				snprintf(portbuf, sizeof(portbuf), "%d", extraport);
1793 				if(do_ip6) {
1794 					hints.ai_family = AF_INET6;
1795 					if(!ports_create_if("::0",
1796 						do_auto, cfg->do_udp, do_tcp,
1797 						&hints, portbuf, &list,
1798 						cfg->so_rcvbuf, cfg->so_sndbuf,
1799 						cfg->ssl_port, cfg->tls_additional_port,
1800 						cfg->https_port,
1801 						cfg->proxy_protocol_port,
1802 						reuseport, cfg->ip_transparent,
1803 						cfg->tcp_mss, cfg->ip_freebind,
1804 						cfg->http_nodelay, cfg->use_systemd,
1805 						cfg->dnscrypt_port, cfg->ip_dscp)) {
1806 						listening_ports_free(list);
1807 						return NULL;
1808 					}
1809 				}
1810 				if(do_ip4) {
1811 					hints.ai_family = AF_INET;
1812 					if(!ports_create_if("0.0.0.0",
1813 						do_auto, cfg->do_udp, do_tcp,
1814 						&hints, portbuf, &list,
1815 						cfg->so_rcvbuf, cfg->so_sndbuf,
1816 						cfg->ssl_port, cfg->tls_additional_port,
1817 						cfg->https_port,
1818 						cfg->proxy_protocol_port,
1819 						reuseport, cfg->ip_transparent,
1820 						cfg->tcp_mss, cfg->ip_freebind,
1821 						cfg->http_nodelay, cfg->use_systemd,
1822 						cfg->dnscrypt_port, cfg->ip_dscp)) {
1823 						listening_ports_free(list);
1824 						return NULL;
1825 					}
1826 				}
1827 			}
1828 			return list;
1829 		}
1830 		if(do_ip6) {
1831 			hints.ai_family = AF_INET6;
1832 			if(!ports_create_if(do_auto?"::0":"::1",
1833 				do_auto, cfg->do_udp, do_tcp,
1834 				&hints, portbuf, &list,
1835 				cfg->so_rcvbuf, cfg->so_sndbuf,
1836 				cfg->ssl_port, cfg->tls_additional_port,
1837 				cfg->https_port, cfg->proxy_protocol_port,
1838 				reuseport, cfg->ip_transparent,
1839 				cfg->tcp_mss, cfg->ip_freebind,
1840 				cfg->http_nodelay, cfg->use_systemd,
1841 				cfg->dnscrypt_port, cfg->ip_dscp)) {
1842 				listening_ports_free(list);
1843 				return NULL;
1844 			}
1845 		}
1846 		if(do_ip4) {
1847 			hints.ai_family = AF_INET;
1848 			if(!ports_create_if(do_auto?"0.0.0.0":"127.0.0.1",
1849 				do_auto, cfg->do_udp, do_tcp,
1850 				&hints, portbuf, &list,
1851 				cfg->so_rcvbuf, cfg->so_sndbuf,
1852 				cfg->ssl_port, cfg->tls_additional_port,
1853 				cfg->https_port, cfg->proxy_protocol_port,
1854 				reuseport, cfg->ip_transparent,
1855 				cfg->tcp_mss, cfg->ip_freebind,
1856 				cfg->http_nodelay, cfg->use_systemd,
1857 				cfg->dnscrypt_port, cfg->ip_dscp)) {
1858 				listening_ports_free(list);
1859 				return NULL;
1860 			}
1861 		}
1862 	} else for(i = 0; i<num_ifs; i++) {
1863 		if(str_is_ip6(ifs[i])) {
1864 			if(!do_ip6)
1865 				continue;
1866 			hints.ai_family = AF_INET6;
1867 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1868 				do_tcp, &hints, portbuf, &list,
1869 				cfg->so_rcvbuf, cfg->so_sndbuf,
1870 				cfg->ssl_port, cfg->tls_additional_port,
1871 				cfg->https_port, cfg->proxy_protocol_port,
1872 				reuseport, cfg->ip_transparent,
1873 				cfg->tcp_mss, cfg->ip_freebind,
1874 				cfg->http_nodelay, cfg->use_systemd,
1875 				cfg->dnscrypt_port, cfg->ip_dscp)) {
1876 				listening_ports_free(list);
1877 				return NULL;
1878 			}
1879 		} else {
1880 			if(!do_ip4)
1881 				continue;
1882 			hints.ai_family = AF_INET;
1883 			if(!ports_create_if(ifs[i], 0, cfg->do_udp,
1884 				do_tcp, &hints, portbuf, &list,
1885 				cfg->so_rcvbuf, cfg->so_sndbuf,
1886 				cfg->ssl_port, cfg->tls_additional_port,
1887 				cfg->https_port, cfg->proxy_protocol_port,
1888 				reuseport, cfg->ip_transparent,
1889 				cfg->tcp_mss, cfg->ip_freebind,
1890 				cfg->http_nodelay, cfg->use_systemd,
1891 				cfg->dnscrypt_port, cfg->ip_dscp)) {
1892 				listening_ports_free(list);
1893 				return NULL;
1894 			}
1895 		}
1896 	}
1897 
1898 	return list;
1899 }
1900 
1901 void listening_ports_free(struct listen_port* list)
1902 {
1903 	struct listen_port* nx;
1904 	while(list) {
1905 		nx = list->next;
1906 		if(list->fd != -1) {
1907 			sock_close(list->fd);
1908 		}
1909 		/* rc_ports don't have ub_socket */
1910 		if(list->socket) {
1911 			freeaddrinfo(list->socket->addr);
1912 			free(list->socket);
1913 		}
1914 		free(list);
1915 		list = nx;
1916 	}
1917 }
1918 
1919 size_t listen_get_mem(struct listen_dnsport* listen)
1920 {
1921 	struct listen_list* p;
1922 	size_t s = sizeof(*listen) + sizeof(*listen->base) +
1923 		sizeof(*listen->udp_buff) +
1924 		sldns_buffer_capacity(listen->udp_buff);
1925 #ifdef USE_DNSCRYPT
1926 	s += sizeof(*listen->dnscrypt_udp_buff);
1927 	if(listen->udp_buff != listen->dnscrypt_udp_buff){
1928 		s += sldns_buffer_capacity(listen->dnscrypt_udp_buff);
1929 	}
1930 #endif
1931 	for(p = listen->cps; p; p = p->next) {
1932 		s += sizeof(*p);
1933 		s += comm_point_get_mem(p->com);
1934 	}
1935 	return s;
1936 }
1937 
1938 void listen_stop_accept(struct listen_dnsport* listen)
1939 {
1940 	/* do not stop the ones that have no tcp_free list
1941 	 * (they have already stopped listening) */
1942 	struct listen_list* p;
1943 	for(p=listen->cps; p; p=p->next) {
1944 		if(p->com->type == comm_tcp_accept &&
1945 			p->com->tcp_free != NULL) {
1946 			comm_point_stop_listening(p->com);
1947 		}
1948 	}
1949 }
1950 
1951 void listen_start_accept(struct listen_dnsport* listen)
1952 {
1953 	/* do not start the ones that have no tcp_free list, it is no
1954 	 * use to listen to them because they have no free tcp handlers */
1955 	struct listen_list* p;
1956 	for(p=listen->cps; p; p=p->next) {
1957 		if(p->com->type == comm_tcp_accept &&
1958 			p->com->tcp_free != NULL) {
1959 			comm_point_start_listening(p->com, -1, -1);
1960 		}
1961 	}
1962 }
1963 
1964 struct tcp_req_info*
1965 tcp_req_info_create(struct sldns_buffer* spoolbuf)
1966 {
1967 	struct tcp_req_info* req = (struct tcp_req_info*)malloc(sizeof(*req));
1968 	if(!req) {
1969 		log_err("malloc failure for new stream outoforder processing structure");
1970 		return NULL;
1971 	}
1972 	memset(req, 0, sizeof(*req));
1973 	req->spool_buffer = spoolbuf;
1974 	return req;
1975 }
1976 
/**
 * Delete a tcp_req_info: clear its lists and free the structure.
 * The cp member points back to the commpoint that owns this struct and
 * called delete on us; the spool_buffer is a shared udp buffer. Neither is
 * deleted here.
 * @param req: structure to delete, may be NULL.
 */
void
tcp_req_info_delete(struct tcp_req_info* req)
{
	if(req != NULL) {
		tcp_req_info_clear(req);
		free(req);
	}
}
1987 
1988 void tcp_req_info_clear(struct tcp_req_info* req)
1989 {
1990 	struct tcp_req_open_item* open, *nopen;
1991 	struct tcp_req_done_item* item, *nitem;
1992 	if(!req) return;
1993 
1994 	/* free outstanding request mesh reply entries */
1995 	open = req->open_req_list;
1996 	while(open) {
1997 		nopen = open->next;
1998 		mesh_state_remove_reply(open->mesh, open->mesh_state, req->cp);
1999 		free(open);
2000 		open = nopen;
2001 	}
2002 	req->open_req_list = NULL;
2003 	req->num_open_req = 0;
2004 
2005 	/* free pending writable result packets */
2006 	item = req->done_req_list;
2007 	while(item) {
2008 		nitem = item->next;
2009 		lock_basic_lock(&stream_wait_count_lock);
2010 		stream_wait_count -= (sizeof(struct tcp_req_done_item)
2011 			+item->len);
2012 		lock_basic_unlock(&stream_wait_count_lock);
2013 		free(item->buf);
2014 		free(item);
2015 		item = nitem;
2016 	}
2017 	req->done_req_list = NULL;
2018 	req->num_done_req = 0;
2019 	req->read_is_closed = 0;
2020 }
2021 
2022 void
2023 tcp_req_info_remove_mesh_state(struct tcp_req_info* req, struct mesh_state* m)
2024 {
2025 	struct tcp_req_open_item* open, *prev = NULL;
2026 	if(!req || !m) return;
2027 	open = req->open_req_list;
2028 	while(open) {
2029 		if(open->mesh_state == m) {
2030 			struct tcp_req_open_item* next;
2031 			if(prev) prev->next = open->next;
2032 			else req->open_req_list = open->next;
2033 			/* caller has to manage the mesh state reply entry */
2034 			next = open->next;
2035 			free(open);
2036 			req->num_open_req --;
2037 
2038 			/* prev = prev; */
2039 			open = next;
2040 			continue;
2041 		}
2042 		prev = open;
2043 		open = open->next;
2044 	}
2045 }
2046 
2047 /** setup listening for read or write */
2048 static void
2049 tcp_req_info_setup_listen(struct tcp_req_info* req)
2050 {
2051 	int wr = 0;
2052 	int rd = 0;
2053 
2054 	if(req->cp->tcp_byte_count != 0) {
2055 		/* cannot change, halfway through */
2056 		return;
2057 	}
2058 
2059 	if(!req->cp->tcp_is_reading)
2060 		wr = 1;
2061 	if(!req->read_is_closed)
2062 		rd = 1;
2063 
2064 	if(wr) {
2065 		req->cp->tcp_is_reading = 0;
2066 		comm_point_stop_listening(req->cp);
2067 		comm_point_start_listening(req->cp, -1,
2068 			adjusted_tcp_timeout(req->cp));
2069 	} else if(rd) {
2070 		req->cp->tcp_is_reading = 1;
2071 		comm_point_stop_listening(req->cp);
2072 		comm_point_start_listening(req->cp, -1,
2073 			adjusted_tcp_timeout(req->cp));
2074 		/* and also read it (from SSL stack buffers), so
2075 		 * no event read event is expected since the remainder of
2076 		 * the TLS frame is sitting in the buffers. */
2077 		req->read_again = 1;
2078 	} else {
2079 		comm_point_stop_listening(req->cp);
2080 		comm_point_start_listening(req->cp, -1,
2081 			adjusted_tcp_timeout(req->cp));
2082 		comm_point_listen_for_rw(req->cp, 0, 0);
2083 	}
2084 }
2085 
2086 /** remove first item from list of pending results */
2087 static struct tcp_req_done_item*
2088 tcp_req_info_pop_done(struct tcp_req_info* req)
2089 {
2090 	struct tcp_req_done_item* item;
2091 	log_assert(req->num_done_req > 0 && req->done_req_list);
2092 	item = req->done_req_list;
2093 	lock_basic_lock(&stream_wait_count_lock);
2094 	stream_wait_count -= (sizeof(struct tcp_req_done_item)+item->len);
2095 	lock_basic_unlock(&stream_wait_count_lock);
2096 	req->done_req_list = req->done_req_list->next;
2097 	req->num_done_req --;
2098 	return item;
2099 }
2100 
2101 /** Send given buffer and setup to write */
2102 static void
2103 tcp_req_info_start_write_buf(struct tcp_req_info* req, uint8_t* buf,
2104 	size_t len)
2105 {
2106 	sldns_buffer_clear(req->cp->buffer);
2107 	sldns_buffer_write(req->cp->buffer, buf, len);
2108 	sldns_buffer_flip(req->cp->buffer);
2109 
2110 	req->cp->tcp_is_reading = 0; /* we are now writing */
2111 }
2112 
2113 /** pick up the next result and start writing it to the channel */
2114 static void
2115 tcp_req_pickup_next_result(struct tcp_req_info* req)
2116 {
2117 	if(req->num_done_req > 0) {
2118 		/* unlist the done item from the list of pending results */
2119 		struct tcp_req_done_item* item = tcp_req_info_pop_done(req);
2120 		tcp_req_info_start_write_buf(req, item->buf, item->len);
2121 		free(item->buf);
2122 		free(item);
2123 	}
2124 }
2125 
2126 /** the read channel has closed */
2127 int
2128 tcp_req_info_handle_read_close(struct tcp_req_info* req)
2129 {
2130 	verbose(VERB_ALGO, "tcp channel read side closed %d", req->cp->fd);
2131 	/* reset byte count for (potential) partial read */
2132 	req->cp->tcp_byte_count = 0;
2133 	/* if we still have results to write, pick up next and write it */
2134 	if(req->num_done_req != 0) {
2135 		tcp_req_pickup_next_result(req);
2136 		tcp_req_info_setup_listen(req);
2137 		return 1;
2138 	}
2139 	/* if nothing to do, this closes the connection */
2140 	if(req->num_open_req == 0 && req->num_done_req == 0)
2141 		return 0;
2142 	/* otherwise, we must be waiting for dns resolve, wait with timeout */
2143 	req->read_is_closed = 1;
2144 	tcp_req_info_setup_listen(req);
2145 	return 1;
2146 }
2147 
2148 void
2149 tcp_req_info_handle_writedone(struct tcp_req_info* req)
2150 {
2151 	/* back to reading state, we finished this write event */
2152 	sldns_buffer_clear(req->cp->buffer);
2153 	if(req->num_done_req == 0 && req->read_is_closed) {
2154 		/* no more to write and nothing to read, close it */
2155 		comm_point_drop_reply(&req->cp->repinfo);
2156 		return;
2157 	}
2158 	req->cp->tcp_is_reading = 1;
2159 	/* see if another result needs writing */
2160 	tcp_req_pickup_next_result(req);
2161 
2162 	/* see if there is more to write, if not stop_listening for writing */
2163 	/* see if new requests are allowed, if so, start_listening
2164 	 * for reading */
2165 	tcp_req_info_setup_listen(req);
2166 }
2167 
2168 void
2169 tcp_req_info_handle_readdone(struct tcp_req_info* req)
2170 {
2171 	struct comm_point* c = req->cp;
2172 
2173 	/* we want to read up several requests, unless there are
2174 	 * pending answers */
2175 
2176 	req->is_drop = 0;
2177 	req->is_reply = 0;
2178 	req->in_worker_handle = 1;
2179 	sldns_buffer_set_limit(req->spool_buffer, 0);
2180 	/* handle the current request */
2181 	/* this calls the worker handle request routine that could give
2182 	 * a cache response, or localdata response, or drop the reply,
2183 	 * or schedule a mesh entry for later */
2184 	fptr_ok(fptr_whitelist_comm_point(c->callback));
2185 	if( (*c->callback)(c, c->cb_arg, NETEVENT_NOERROR, &c->repinfo) ) {
2186 		req->in_worker_handle = 0;
2187 		/* there is an answer, put it up.  It is already in the
2188 		 * c->buffer, just send it. */
2189 		/* since we were just reading a query, the channel is
2190 		 * clear to write to */
2191 	send_it:
2192 		c->tcp_is_reading = 0;
2193 		comm_point_stop_listening(c);
2194 		comm_point_start_listening(c, -1, adjusted_tcp_timeout(c));
2195 		return;
2196 	}
2197 	req->in_worker_handle = 0;
2198 	/* it should be waiting in the mesh for recursion.
2199 	 * If mesh failed to add a new entry and called commpoint_drop_reply.
2200 	 * Then the mesh state has been cleared. */
2201 	if(req->is_drop) {
2202 		/* the reply has been dropped, stream has been closed. */
2203 		return;
2204 	}
2205 	/* If mesh failed(mallocfail) and called commpoint_send_reply with
2206 	 * something like servfail then we pick up that reply below. */
2207 	if(req->is_reply) {
2208 		goto send_it;
2209 	}
2210 
2211 	sldns_buffer_clear(c->buffer);
2212 	/* if pending answers, pick up an answer and start sending it */
2213 	tcp_req_pickup_next_result(req);
2214 
2215 	/* if answers pending, start sending answers */
2216 	/* read more requests if we can have more requests */
2217 	tcp_req_info_setup_listen(req);
2218 }
2219 
2220 int
2221 tcp_req_info_add_meshstate(struct tcp_req_info* req,
2222 	struct mesh_area* mesh, struct mesh_state* m)
2223 {
2224 	struct tcp_req_open_item* item;
2225 	log_assert(req && mesh && m);
2226 	item = (struct tcp_req_open_item*)malloc(sizeof(*item));
2227 	if(!item) return 0;
2228 	item->next = req->open_req_list;
2229 	item->mesh = mesh;
2230 	item->mesh_state = m;
2231 	req->open_req_list = item;
2232 	req->num_open_req++;
2233 	return 1;
2234 }
2235 
2236 /** Add a result to the result list.  At the end. */
2237 static int
2238 tcp_req_info_add_result(struct tcp_req_info* req, uint8_t* buf, size_t len)
2239 {
2240 	struct tcp_req_done_item* last = NULL;
2241 	struct tcp_req_done_item* item;
2242 	size_t space;
2243 
2244 	/* see if we have space */
2245 	space = sizeof(struct tcp_req_done_item) + len;
2246 	lock_basic_lock(&stream_wait_count_lock);
2247 	if(stream_wait_count + space > stream_wait_max) {
2248 		lock_basic_unlock(&stream_wait_count_lock);
2249 		verbose(VERB_ALGO, "drop stream reply, no space left, in stream-wait-size");
2250 		return 0;
2251 	}
2252 	stream_wait_count += space;
2253 	lock_basic_unlock(&stream_wait_count_lock);
2254 
2255 	/* find last element */
2256 	last = req->done_req_list;
2257 	while(last && last->next)
2258 		last = last->next;
2259 
2260 	/* create new element */
2261 	item = (struct tcp_req_done_item*)malloc(sizeof(*item));
2262 	if(!item) {
2263 		log_err("malloc failure, for stream result list");
2264 		return 0;
2265 	}
2266 	item->next = NULL;
2267 	item->len = len;
2268 	item->buf = memdup(buf, len);
2269 	if(!item->buf) {
2270 		free(item);
2271 		log_err("malloc failure, adding reply to stream result list");
2272 		return 0;
2273 	}
2274 
2275 	/* link in */
2276 	if(last) last->next = item;
2277 	else req->done_req_list = item;
2278 	req->num_done_req++;
2279 	return 1;
2280 }
2281 
2282 void
2283 tcp_req_info_send_reply(struct tcp_req_info* req)
2284 {
2285 	if(req->in_worker_handle) {
2286 		/* reply from mesh is in the spool_buffer */
2287 		/* copy now, so that the spool buffer is free for other tasks
2288 		 * before the callback is done */
2289 		sldns_buffer_clear(req->cp->buffer);
2290 		sldns_buffer_write(req->cp->buffer,
2291 			sldns_buffer_begin(req->spool_buffer),
2292 			sldns_buffer_limit(req->spool_buffer));
2293 		sldns_buffer_flip(req->cp->buffer);
2294 		req->is_reply = 1;
2295 		return;
2296 	}
2297 	/* now that the query has been handled, that mesh_reply entry
2298 	 * should be removed, from the tcp_req_info list,
2299 	 * the mesh state cleanup removes then with region_cleanup and
2300 	 * replies_sent true. */
2301 	/* see if we can send it straight away (we are not doing
2302 	 * anything else).  If so, copy to buffer and start */
2303 	if(req->cp->tcp_is_reading && req->cp->tcp_byte_count == 0) {
2304 		/* buffer is free, and was ready to read new query into,
2305 		 * but we are now going to use it to send this answer */
2306 		tcp_req_info_start_write_buf(req,
2307 			sldns_buffer_begin(req->spool_buffer),
2308 			sldns_buffer_limit(req->spool_buffer));
2309 		/* switch to listen to write events */
2310 		comm_point_stop_listening(req->cp);
2311 		comm_point_start_listening(req->cp, -1,
2312 			adjusted_tcp_timeout(req->cp));
2313 		return;
2314 	}
2315 	/* queue up the answer behind the others already pending */
2316 	if(!tcp_req_info_add_result(req, sldns_buffer_begin(req->spool_buffer),
2317 		sldns_buffer_limit(req->spool_buffer))) {
2318 		/* drop the connection, we are out of resources */
2319 		comm_point_drop_reply(&req->cp->repinfo);
2320 	}
2321 }
2322 
2323 size_t tcp_req_info_get_stream_buffer_size(void)
2324 {
2325 	size_t s;
2326 	if(!stream_wait_lock_inited)
2327 		return stream_wait_count;
2328 	lock_basic_lock(&stream_wait_count_lock);
2329 	s = stream_wait_count;
2330 	lock_basic_unlock(&stream_wait_count_lock);
2331 	return s;
2332 }
2333 
2334 size_t http2_get_query_buffer_size(void)
2335 {
2336 	size_t s;
2337 	if(!http2_query_buffer_lock_inited)
2338 		return http2_query_buffer_count;
2339 	lock_basic_lock(&http2_query_buffer_count_lock);
2340 	s = http2_query_buffer_count;
2341 	lock_basic_unlock(&http2_query_buffer_count_lock);
2342 	return s;
2343 }
2344 
2345 size_t http2_get_response_buffer_size(void)
2346 {
2347 	size_t s;
2348 	if(!http2_response_buffer_lock_inited)
2349 		return http2_response_buffer_count;
2350 	lock_basic_lock(&http2_response_buffer_count_lock);
2351 	s = http2_response_buffer_count;
2352 	lock_basic_unlock(&http2_response_buffer_count_lock);
2353 	return s;
2354 }
2355 
2356 #ifdef HAVE_NGHTTP2
2357 /** nghttp2 callback. Used to copy response from rbuffer to nghttp2 session */
2358 static ssize_t http2_submit_response_read_callback(
2359 	nghttp2_session* ATTR_UNUSED(session),
2360 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2361 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2362 {
2363 	struct http2_stream* h2_stream;
2364 	struct http2_session* h2_session = source->ptr;
2365 	size_t copylen = length;
2366 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2367 		h2_session->session, stream_id))) {
2368 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2369 			"stream");
2370 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2371 	}
2372 	if(!h2_stream->rbuffer ||
2373 		sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2374 		verbose(VERB_QUERY, "http2: cannot submit buffer. No data "
2375 			"available in rbuffer");
2376 		/* rbuffer will be free'd in frame close cb */
2377 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2378 	}
2379 
2380 	if(copylen > sldns_buffer_remaining(h2_stream->rbuffer))
2381 		copylen = sldns_buffer_remaining(h2_stream->rbuffer);
2382 	if(copylen > SSIZE_MAX)
2383 		copylen = SSIZE_MAX; /* will probably never happen */
2384 
2385 	memcpy(buf, sldns_buffer_current(h2_stream->rbuffer), copylen);
2386 	sldns_buffer_skip(h2_stream->rbuffer, copylen);
2387 
2388 	if(sldns_buffer_remaining(h2_stream->rbuffer) == 0) {
2389 		*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2390 		lock_basic_lock(&http2_response_buffer_count_lock);
2391 		http2_response_buffer_count -=
2392 			sldns_buffer_capacity(h2_stream->rbuffer);
2393 		lock_basic_unlock(&http2_response_buffer_count_lock);
2394 		sldns_buffer_free(h2_stream->rbuffer);
2395 		h2_stream->rbuffer = NULL;
2396 	}
2397 
2398 	return copylen;
2399 }
2400 
2401 /**
2402  * Send RST_STREAM frame for stream.
2403  * @param h2_session: http2 session to submit frame to
2404  * @param h2_stream: http2 stream containing frame ID to use in RST_STREAM
2405  * @return 0 on error, 1 otherwise
2406  */
2407 static int http2_submit_rst_stream(struct http2_session* h2_session,
2408 		struct http2_stream* h2_stream)
2409 {
2410 	int ret = nghttp2_submit_rst_stream(h2_session->session,
2411 		NGHTTP2_FLAG_NONE, h2_stream->stream_id,
2412 		NGHTTP2_INTERNAL_ERROR);
2413 	if(ret) {
2414 		verbose(VERB_QUERY, "http2: nghttp2_submit_rst_stream failed, "
2415 			"error: %s", nghttp2_strerror(ret));
2416 		return 0;
2417 	}
2418 	return 1;
2419 }
2420 
2421 /**
2422  * DNS response ready to be submitted to nghttp2, to be prepared for sending
2423  * out. Response is stored in c->buffer. Copy to rbuffer because the c->buffer
2424  * might be used before this will be sent out.
2425  * @param h2_session: http2 session, containing c->buffer which contains answer
2426  * @return 0 on error, 1 otherwise
2427  */
2428 int http2_submit_dns_response(struct http2_session* h2_session)
2429 {
2430 	int ret;
2431 	nghttp2_data_provider data_prd;
2432 	char status[4];
2433 	nghttp2_nv headers[3];
2434 	struct http2_stream* h2_stream = h2_session->c->h2_stream;
2435 	size_t rlen;
2436 	char rlen_str[32];
2437 
2438 	if(h2_stream->rbuffer) {
2439 		log_err("http2 submit response error: rbuffer already "
2440 			"exists");
2441 		return 0;
2442 	}
2443 	if(sldns_buffer_remaining(h2_session->c->buffer) == 0) {
2444 		log_err("http2 submit response error: c->buffer not complete");
2445 		return 0;
2446 	}
2447 
2448 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2449 		verbose(VERB_QUERY, "http2: submit response error: "
2450 			"invalid status");
2451 		return 0;
2452 	}
2453 
2454 	rlen = sldns_buffer_remaining(h2_session->c->buffer);
2455 	snprintf(rlen_str, sizeof(rlen_str), "%u", (unsigned)rlen);
2456 
2457 	lock_basic_lock(&http2_response_buffer_count_lock);
2458 	if(http2_response_buffer_count + rlen > http2_response_buffer_max) {
2459 		lock_basic_unlock(&http2_response_buffer_count_lock);
2460 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2461 			"in https-response-buffer-size");
2462 		return http2_submit_rst_stream(h2_session, h2_stream);
2463 	}
2464 	http2_response_buffer_count += rlen;
2465 	lock_basic_unlock(&http2_response_buffer_count_lock);
2466 
2467 	if(!(h2_stream->rbuffer = sldns_buffer_new(rlen))) {
2468 		lock_basic_lock(&http2_response_buffer_count_lock);
2469 		http2_response_buffer_count -= rlen;
2470 		lock_basic_unlock(&http2_response_buffer_count_lock);
2471 		log_err("http2 submit response error: malloc failure");
2472 		return 0;
2473 	}
2474 
2475 	headers[0].name = (uint8_t*)":status";
2476 	headers[0].namelen = 7;
2477 	headers[0].value = (uint8_t*)status;
2478 	headers[0].valuelen = 3;
2479 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2480 
2481 	headers[1].name = (uint8_t*)"content-type";
2482 	headers[1].namelen = 12;
2483 	headers[1].value = (uint8_t*)"application/dns-message";
2484 	headers[1].valuelen = 23;
2485 	headers[1].flags = NGHTTP2_NV_FLAG_NONE;
2486 
2487 	headers[2].name = (uint8_t*)"content-length";
2488 	headers[2].namelen = 14;
2489 	headers[2].value = (uint8_t*)rlen_str;
2490 	headers[2].valuelen = strlen(rlen_str);
2491 	headers[2].flags = NGHTTP2_NV_FLAG_NONE;
2492 
2493 	sldns_buffer_write(h2_stream->rbuffer,
2494 		sldns_buffer_current(h2_session->c->buffer),
2495 		sldns_buffer_remaining(h2_session->c->buffer));
2496 	sldns_buffer_flip(h2_stream->rbuffer);
2497 
2498 	data_prd.source.ptr = h2_session;
2499 	data_prd.read_callback = http2_submit_response_read_callback;
2500 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2501 		headers, 3, &data_prd);
2502 	if(ret) {
2503 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2504 			"error: %s", nghttp2_strerror(ret));
2505 		return 0;
2506 	}
2507 	return 1;
2508 }
2509 #else
/** Variant compiled when HAVE_NGHTTP2 is not defined: the argument is
 * ignored and 0 is returned. */
int http2_submit_dns_response(void* ATTR_UNUSED(v))
{
	return 0;
}
2514 #endif
2515 
2516 #ifdef HAVE_NGHTTP2
2517 /** HTTP status to descriptive string */
2518 static char* http_status_to_str(enum http_status s)
2519 {
2520 	switch(s) {
2521 		case HTTP_STATUS_OK:
2522 			return "OK";
2523 		case HTTP_STATUS_BAD_REQUEST:
2524 			return "Bad Request";
2525 		case HTTP_STATUS_NOT_FOUND:
2526 			return "Not Found";
2527 		case HTTP_STATUS_PAYLOAD_TOO_LARGE:
2528 			return "Payload Too Large";
2529 		case HTTP_STATUS_URI_TOO_LONG:
2530 			return "URI Too Long";
2531 		case HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE:
2532 			return "Unsupported Media Type";
2533 		case HTTP_STATUS_NOT_IMPLEMENTED:
2534 			return "Not Implemented";
2535 	}
2536 	return "Status Unknown";
2537 }
2538 
2539 /** nghttp2 callback. Used to copy error message to nghttp2 session */
2540 static ssize_t http2_submit_error_read_callback(
2541 	nghttp2_session* ATTR_UNUSED(session),
2542 	int32_t stream_id, uint8_t* buf, size_t length, uint32_t* data_flags,
2543 	nghttp2_data_source* source, void* ATTR_UNUSED(cb_arg))
2544 {
2545 	struct http2_stream* h2_stream;
2546 	struct http2_session* h2_session = source->ptr;
2547 	char* msg;
2548 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2549 		h2_session->session, stream_id))) {
2550 		verbose(VERB_QUERY, "http2: cannot get stream data, closing "
2551 			"stream");
2552 		return NGHTTP2_ERR_TEMPORAL_CALLBACK_FAILURE;
2553 	}
2554 	*data_flags |= NGHTTP2_DATA_FLAG_EOF;
2555 	msg = http_status_to_str(h2_stream->status);
2556 	if(length < strlen(msg))
2557 		return 0; /* not worth trying over multiple frames */
2558 	memcpy(buf, msg, strlen(msg));
2559 	return strlen(msg);
2560 
2561 }
2562 
2563 /**
2564  * HTTP error response ready to be submitted to nghttp2, to be prepared for
2565  * sending out. Message body will contain descriptive string for HTTP status.
2566  * @param h2_session: http2 session to submit to
2567  * @param h2_stream: http2 stream containing HTTP status to use for error
2568  * @return 0 on error, 1 otherwise
2569  */
2570 static int http2_submit_error(struct http2_session* h2_session,
2571 	struct http2_stream* h2_stream)
2572 {
2573 	int ret;
2574 	char status[4];
2575 	nghttp2_data_provider data_prd;
2576 	nghttp2_nv headers[1]; /* will be copied by nghttp */
2577 	if(snprintf(status, 4, "%d", h2_stream->status) != 3) {
2578 		verbose(VERB_QUERY, "http2: submit error failed, "
2579 			"invalid status");
2580 		return 0;
2581 	}
2582 	headers[0].name = (uint8_t*)":status";
2583 	headers[0].namelen = 7;
2584 	headers[0].value = (uint8_t*)status;
2585 	headers[0].valuelen = 3;
2586 	headers[0].flags = NGHTTP2_NV_FLAG_NONE;
2587 
2588 	data_prd.source.ptr = h2_session;
2589 	data_prd.read_callback = http2_submit_error_read_callback;
2590 
2591 	ret = nghttp2_submit_response(h2_session->session, h2_stream->stream_id,
2592 		headers, 1, &data_prd);
2593 	if(ret) {
2594 		verbose(VERB_QUERY, "http2: submit error failed, "
2595 			"error: %s", nghttp2_strerror(ret));
2596 		return 0;
2597 	}
2598 	return 1;
2599 }
2600 
2601 /**
2602  * Start query handling. Query is stored in the stream, and will be free'd here.
2603  * @param h2_session: http2 session, containing comm point
2604  * @param h2_stream: stream containing buffered query
2605  * @return: -1 on error, 1 if answer is stored in c->buffer, 0 if there is no
2606  * reply available (yet).
2607  */
2608 static int http2_query_read_done(struct http2_session* h2_session,
2609 	struct http2_stream* h2_stream)
2610 {
2611 	log_assert(h2_stream->qbuffer);
2612 
2613 	if(h2_session->c->h2_stream) {
2614 		verbose(VERB_ALGO, "http2_query_read_done failure: shared "
2615 			"buffer already assigned to stream");
2616 		return -1;
2617 	}
2618 
2619     /* the c->buffer might be used by mesh_send_reply and no be cleard
2620 	 * need to be cleared before use */
2621 	sldns_buffer_clear(h2_session->c->buffer);
2622 	if(sldns_buffer_remaining(h2_session->c->buffer) <
2623 		sldns_buffer_remaining(h2_stream->qbuffer)) {
2624 		/* qbuffer will be free'd in frame close cb */
2625 		sldns_buffer_clear(h2_session->c->buffer);
2626 		verbose(VERB_ALGO, "http2_query_read_done failure: can't fit "
2627 			"qbuffer in c->buffer");
2628 		return -1;
2629 	}
2630 
2631 	sldns_buffer_write(h2_session->c->buffer,
2632 		sldns_buffer_current(h2_stream->qbuffer),
2633 		sldns_buffer_remaining(h2_stream->qbuffer));
2634 
2635 	lock_basic_lock(&http2_query_buffer_count_lock);
2636 	http2_query_buffer_count -= sldns_buffer_capacity(h2_stream->qbuffer);
2637 	lock_basic_unlock(&http2_query_buffer_count_lock);
2638 	sldns_buffer_free(h2_stream->qbuffer);
2639 	h2_stream->qbuffer = NULL;
2640 
2641 	sldns_buffer_flip(h2_session->c->buffer);
2642 	h2_session->c->h2_stream = h2_stream;
2643 	fptr_ok(fptr_whitelist_comm_point(h2_session->c->callback));
2644 	if((*h2_session->c->callback)(h2_session->c, h2_session->c->cb_arg,
2645 		NETEVENT_NOERROR, &h2_session->c->repinfo)) {
2646 		return 1; /* answer in c->buffer */
2647 	}
2648 	sldns_buffer_clear(h2_session->c->buffer);
2649 	h2_session->c->h2_stream = NULL;
2650 	return 0; /* mesh state added, or dropped */
2651 }
2652 
/** nghttp2 callback. Used to check if the received frame indicates the end of a
 * stream. Gather collected request data and start query handling.
 * Frames other than DATA or HEADERS, frames without the END_STREAM flag and
 * frames for a stream without user data are ignored (return 0).
 * For an invalid request an HTTP error response is submitted, with the
 * status selected by the checks below.
 * @param session: nghttp2 session, used to look up the stream user data.
 * @param frame: the received frame.
 * @param cb_arg: the http2 session.
 * @return 0 on success, or NGHTTP2_ERR_CALLBACK_FAILURE if a response or
 *	error could not be submitted, query handling failed, or the query was
 *	dropped in the worker callback. */
static int http2_req_frame_recv_cb(nghttp2_session* session,
	const nghttp2_frame* frame, void* cb_arg)
{
	struct http2_session* h2_session = (struct http2_session*)cb_arg;
	struct http2_stream* h2_stream;
	int query_read_done;

	/* only act on a DATA or HEADERS frame that ends the stream */
	if((frame->hd.type != NGHTTP2_DATA &&
		frame->hd.type != NGHTTP2_HEADERS) ||
		!(frame->hd.flags & NGHTTP2_FLAG_END_STREAM)) {
			return 0;
	}

	if(!(h2_stream = nghttp2_session_get_stream_user_data(
		session, frame->hd.stream_id)))
		return 0;

	/* the checks below pick the HTTP status for an invalid request,
	 * the first one that matches wins */
	if(h2_stream->invalid_endpoint) {
		h2_stream->status = HTTP_STATUS_NOT_FOUND;
		goto submit_http_error;
	}

	if(h2_stream->invalid_content_type) {
		h2_stream->status = HTTP_STATUS_UNSUPPORTED_MEDIA_TYPE;
		goto submit_http_error;
	}

	if(h2_stream->http_method != HTTP_METHOD_GET &&
		h2_stream->http_method != HTTP_METHOD_POST) {
		h2_stream->status = HTTP_STATUS_NOT_IMPLEMENTED;
		goto submit_http_error;
	}

	if(h2_stream->query_too_large) {
		if(h2_stream->http_method == HTTP_METHOD_POST)
			h2_stream->status = HTTP_STATUS_PAYLOAD_TOO_LARGE;
		else
			h2_stream->status = HTTP_STATUS_URI_TOO_LONG;
		goto submit_http_error;
	}

	/* no query was buffered for this stream */
	if(!h2_stream->qbuffer) {
		h2_stream->status = HTTP_STATUS_BAD_REQUEST;
		goto submit_http_error;
	}

	/* a status that was already set on the stream is also treated as
	 * an error; the gotos above jump into this body */
	if(h2_stream->status) {
submit_http_error:
		verbose(VERB_QUERY, "http2 request invalid, returning :status="
			"%d", h2_stream->status);
		if(!http2_submit_error(h2_session, h2_stream)) {
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		}
		return 0;
	}
	h2_stream->status = HTTP_STATUS_OK;

	/* make the buffered query readable, and flag the session with
	 * postpone_drop for the duration of the query handling */
	sldns_buffer_flip(h2_stream->qbuffer);
	h2_session->postpone_drop = 1;
	query_read_done = http2_query_read_done(h2_session, h2_stream);
	if(query_read_done < 0)
		return NGHTTP2_ERR_CALLBACK_FAILURE;
	else if(!query_read_done) {
		if(h2_session->is_drop) {
			/* connection needs to be closed. Return failure to make
			 * sure no other action are taken anymore on comm point.
			 * failure will result in reclaiming (and closing)
			 * of comm point. */
			verbose(VERB_QUERY, "http2 query dropped in worker cb");
			h2_session->postpone_drop = 0;
			return NGHTTP2_ERR_CALLBACK_FAILURE;
		}
		/* nothing to submit right now, query added to mesh. */
		h2_session->postpone_drop = 0;
		return 0;
	}
	/* the answer is in c->buffer; submit it, and in both outcomes
	 * clear the shared buffer and unassign it from the stream */
	if(!http2_submit_dns_response(h2_session)) {
		sldns_buffer_clear(h2_session->c->buffer);
		h2_session->c->h2_stream = NULL;
		return NGHTTP2_ERR_CALLBACK_FAILURE;
	}
	verbose(VERB_QUERY, "http2 query submitted to session");
	sldns_buffer_clear(h2_session->c->buffer);
	h2_session->c->h2_stream = NULL;
	return 0;
}
2741 
2742 /** nghttp2 callback. Used to detect start of new streams. */
2743 static int http2_req_begin_headers_cb(nghttp2_session* session,
2744 	const nghttp2_frame* frame, void* cb_arg)
2745 {
2746 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2747 	struct http2_stream* h2_stream;
2748 	int ret;
2749 	if(frame->hd.type != NGHTTP2_HEADERS ||
2750 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2751 		/* only interested in request headers */
2752 		return 0;
2753 	}
2754 	if(!(h2_stream = http2_stream_create(frame->hd.stream_id))) {
2755 		log_err("malloc failure while creating http2 stream");
2756 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2757 	}
2758 	http2_session_add_stream(h2_session, h2_stream);
2759 	ret = nghttp2_session_set_stream_user_data(session,
2760 		frame->hd.stream_id, h2_stream);
2761 	if(ret) {
2762 		/* stream does not exist */
2763 		verbose(VERB_QUERY, "http2: set_stream_user_data failed, "
2764 			"error: %s", nghttp2_strerror(ret));
2765 		return NGHTTP2_ERR_CALLBACK_FAILURE;
2766 	}
2767 
2768 	return 0;
2769 }
2770 
2771 /**
2772  * base64url decode, store in qbuffer
2773  * @param h2_session: http2 session
2774  * @param h2_stream: http2 stream
2775  * @param start: start of the base64 string
2776  * @param length: length of the base64 string
2777  * @return: 0 on error, 1 otherwise. query will be stored in h2_stream->qbuffer,
2778  * buffer will be NULL is unparseble.
2779  */
2780 static int http2_buffer_uri_query(struct http2_session* h2_session,
2781 	struct http2_stream* h2_stream, const uint8_t* start, size_t length)
2782 {
2783 	size_t expectb64len;
2784 	int b64len;
2785 	if(h2_stream->http_method == HTTP_METHOD_POST)
2786 		return 1;
2787 	if(length == 0)
2788 		return 1;
2789 	if(h2_stream->qbuffer) {
2790 		verbose(VERB_ALGO, "http2_req_header fail, "
2791 			"qbuffer already set");
2792 		return 0;
2793 	}
2794 
2795 	/* calculate size, might be a bit bigger than the real
2796 	 * decoded buffer size */
2797 	expectb64len = sldns_b64_pton_calculate_size(length);
2798 	log_assert(expectb64len > 0);
2799 	if(expectb64len >
2800 		h2_session->c->http2_stream_max_qbuffer_size) {
2801 		h2_stream->query_too_large = 1;
2802 		return 1;
2803 	}
2804 
2805 	lock_basic_lock(&http2_query_buffer_count_lock);
2806 	if(http2_query_buffer_count + expectb64len > http2_query_buffer_max) {
2807 		lock_basic_unlock(&http2_query_buffer_count_lock);
2808 		verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
2809 			"in http2-query-buffer-size");
2810 		return http2_submit_rst_stream(h2_session, h2_stream);
2811 	}
2812 	http2_query_buffer_count += expectb64len;
2813 	lock_basic_unlock(&http2_query_buffer_count_lock);
2814 	if(!(h2_stream->qbuffer = sldns_buffer_new(expectb64len))) {
2815 		lock_basic_lock(&http2_query_buffer_count_lock);
2816 		http2_query_buffer_count -= expectb64len;
2817 		lock_basic_unlock(&http2_query_buffer_count_lock);
2818 		log_err("http2_req_header fail, qbuffer "
2819 			"malloc failure");
2820 		return 0;
2821 	}
2822 
2823 	if(sldns_b64_contains_nonurl((char const*)start, length)) {
2824 		char buf[65536+4];
2825 		verbose(VERB_ALGO, "HTTP2 stream contains wrong b64 encoding");
2826 		/* copy to the scratch buffer temporarily to terminate the
2827 		 * string with a zero */
2828 		if(length+1 > sizeof(buf)) {
2829 			/* too long */
2830 			lock_basic_lock(&http2_query_buffer_count_lock);
2831 			http2_query_buffer_count -= expectb64len;
2832 			lock_basic_unlock(&http2_query_buffer_count_lock);
2833 			sldns_buffer_free(h2_stream->qbuffer);
2834 			h2_stream->qbuffer = NULL;
2835 			return 1;
2836 		}
2837 		memmove(buf, start, length);
2838 		buf[length] = 0;
2839 		if(!(b64len = sldns_b64_pton(buf, sldns_buffer_current(
2840 			h2_stream->qbuffer), expectb64len)) || b64len < 0) {
2841 			lock_basic_lock(&http2_query_buffer_count_lock);
2842 			http2_query_buffer_count -= expectb64len;
2843 			lock_basic_unlock(&http2_query_buffer_count_lock);
2844 			sldns_buffer_free(h2_stream->qbuffer);
2845 			h2_stream->qbuffer = NULL;
2846 			return 1;
2847 		}
2848 	} else {
2849 		if(!(b64len = sldns_b64url_pton(
2850 			(char const *)start, length,
2851 			sldns_buffer_current(h2_stream->qbuffer),
2852 			expectb64len)) || b64len < 0) {
2853 			lock_basic_lock(&http2_query_buffer_count_lock);
2854 			http2_query_buffer_count -= expectb64len;
2855 			lock_basic_unlock(&http2_query_buffer_count_lock);
2856 			sldns_buffer_free(h2_stream->qbuffer);
2857 			h2_stream->qbuffer = NULL;
2858 			/* return without error, method can be an
2859 			 * unknown POST */
2860 			return 1;
2861 		}
2862 	}
2863 	sldns_buffer_skip(h2_stream->qbuffer, (size_t)b64len);
2864 	return 1;
2865 }
2866 
2867 /** nghttp2 callback. Used to parse headers from HEADER frames. */
2868 static int http2_req_header_cb(nghttp2_session* session,
2869 	const nghttp2_frame* frame, const uint8_t* name, size_t namelen,
2870 	const uint8_t* value, size_t valuelen, uint8_t ATTR_UNUSED(flags),
2871 	void* cb_arg)
2872 {
2873 	struct http2_stream* h2_stream = NULL;
2874 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2875 	/* nghttp2 deals with CONTINUATION frames and provides them as part of
2876 	 * the HEADER */
2877 	if(frame->hd.type != NGHTTP2_HEADERS ||
2878 		frame->headers.cat != NGHTTP2_HCAT_REQUEST) {
2879 		/* only interested in request headers */
2880 		return 0;
2881 	}
2882 	if(!(h2_stream = nghttp2_session_get_stream_user_data(session,
2883 		frame->hd.stream_id)))
2884 		return 0;
2885 
2886 	/* earlier checks already indicate we can stop handling this query */
2887 	if(h2_stream->http_method == HTTP_METHOD_UNSUPPORTED ||
2888 		h2_stream->invalid_content_type ||
2889 		h2_stream->invalid_endpoint)
2890 		return 0;
2891 
2892 
2893 	/* nghttp2 performs some sanity checks in the headers, including:
2894 	 * name and value are guaranteed to be null terminated
2895 	 * name is guaranteed to be lowercase
2896 	 * content-length value is guaranteed to contain digits
2897 	 */
2898 
2899 	if(!h2_stream->http_method && namelen == 7 &&
2900 		memcmp(":method", name, namelen) == 0) {
2901 		/* Case insensitive check on :method value to be on the safe
2902 		 * side. I failed to find text about case sensitivity in specs.
2903 		 */
2904 		if(valuelen == 3 && strcasecmp("GET", (const char*)value) == 0)
2905 			h2_stream->http_method = HTTP_METHOD_GET;
2906 		else if(valuelen == 4 &&
2907 			strcasecmp("POST", (const char*)value) == 0) {
2908 			h2_stream->http_method = HTTP_METHOD_POST;
2909 			if(h2_stream->qbuffer) {
2910 				/* POST method uses query from DATA frames */
2911 				lock_basic_lock(&http2_query_buffer_count_lock);
2912 				http2_query_buffer_count -=
2913 					sldns_buffer_capacity(h2_stream->qbuffer);
2914 				lock_basic_unlock(&http2_query_buffer_count_lock);
2915 				sldns_buffer_free(h2_stream->qbuffer);
2916 				h2_stream->qbuffer = NULL;
2917 			}
2918 		} else
2919 			h2_stream->http_method = HTTP_METHOD_UNSUPPORTED;
2920 		return 0;
2921 	}
2922 	if(namelen == 5 && memcmp(":path", name, namelen) == 0) {
2923 		/* :path may contain DNS query, depending on method. Method might
2924 		 * not be known yet here, so check after finishing receiving
2925 		 * stream. */
2926 #define	HTTP_QUERY_PARAM "?dns="
2927 		size_t el = strlen(h2_session->c->http_endpoint);
2928 		size_t qpl = strlen(HTTP_QUERY_PARAM);
2929 
2930 		if(valuelen < el || memcmp(h2_session->c->http_endpoint,
2931 			value, el) != 0) {
2932 			h2_stream->invalid_endpoint = 1;
2933 			return 0;
2934 		}
2935 		/* larger than endpoint only allowed if it is for the query
2936 		 * parameter */
2937 		if(valuelen <= el+qpl ||
2938 			memcmp(HTTP_QUERY_PARAM, value+el, qpl) != 0) {
2939 			if(valuelen != el)
2940 				h2_stream->invalid_endpoint = 1;
2941 			return 0;
2942 		}
2943 
2944 		if(!http2_buffer_uri_query(h2_session, h2_stream,
2945 			value+(el+qpl), valuelen-(el+qpl))) {
2946 			return NGHTTP2_ERR_CALLBACK_FAILURE;
2947 		}
2948 		return 0;
2949 	}
2950 	/* Content type is a SHOULD (rfc7231#section-3.1.1.5) when using POST,
2951 	 * and not needed when using GET. Don't enfore.
2952 	 * If set only allow lowercase "application/dns-message".
2953 	 *
2954 	 * Clients SHOULD (rfc8484#section-4.1) set an accept header, but MUST
2955 	 * be able to handle "application/dns-message". Since that is the only
2956 	 * content-type supported we can ignore the accept header.
2957 	 */
2958 	if((namelen == 12 && memcmp("content-type", name, namelen) == 0)) {
2959 		if(valuelen != 23 || memcmp("application/dns-message", value,
2960 			valuelen) != 0) {
2961 			h2_stream->invalid_content_type = 1;
2962 		}
2963 	}
2964 
2965 	/* Only interested in content-lentg for POST (on not yet known) method.
2966 	 */
2967 	if((!h2_stream->http_method ||
2968 		h2_stream->http_method == HTTP_METHOD_POST) &&
2969 		!h2_stream->content_length && namelen  == 14 &&
2970 		memcmp("content-length", name, namelen) == 0) {
2971 		if(valuelen > 5) {
2972 			h2_stream->query_too_large = 1;
2973 			return 0;
2974 		}
2975 		/* guaranteed to only contain digits and be null terminated */
2976 		h2_stream->content_length = atoi((const char*)value);
2977 		if(h2_stream->content_length >
2978 			h2_session->c->http2_stream_max_qbuffer_size) {
2979 			h2_stream->query_too_large = 1;
2980 			return 0;
2981 		}
2982 	}
2983 	return 0;
2984 }
2985 
2986 /** nghttp2 callback. Used to get data from DATA frames, which can contain
2987  * queries in POST requests. */
2988 static int http2_req_data_chunk_recv_cb(nghttp2_session* ATTR_UNUSED(session),
2989 	uint8_t ATTR_UNUSED(flags), int32_t stream_id, const uint8_t* data,
2990 	size_t len, void* cb_arg)
2991 {
2992 	struct http2_session* h2_session = (struct http2_session*)cb_arg;
2993 	struct http2_stream* h2_stream;
2994 	size_t qlen = 0;
2995 
2996 	if(!(h2_stream = nghttp2_session_get_stream_user_data(
2997 		h2_session->session, stream_id))) {
2998 		return 0;
2999 	}
3000 
3001 	if(h2_stream->query_too_large)
3002 		return 0;
3003 
3004 	if(!h2_stream->qbuffer) {
3005 		if(h2_stream->content_length) {
3006 			if(h2_stream->content_length < len)
3007 				/* getting more data in DATA frame than
3008 				 * advertised in content-length header. */
3009 				return NGHTTP2_ERR_CALLBACK_FAILURE;
3010 			qlen = h2_stream->content_length;
3011 		} else if(len <= h2_session->c->http2_stream_max_qbuffer_size) {
3012 			/* setting this to msg-buffer-size can result in a lot
3013 			 * of memory consuption. Most queries should fit in a
3014 			 * single DATA frame, and most POST queries will
3015 			 * contain content-length which does not impose this
3016 			 * limit. */
3017 			qlen = len;
3018 		}
3019 	}
3020 	if(!h2_stream->qbuffer && qlen) {
3021 		lock_basic_lock(&http2_query_buffer_count_lock);
3022 		if(http2_query_buffer_count + qlen > http2_query_buffer_max) {
3023 			lock_basic_unlock(&http2_query_buffer_count_lock);
3024 			verbose(VERB_ALGO, "reset HTTP2 stream, no space left, "
3025 				"in http2-query-buffer-size");
3026 			return http2_submit_rst_stream(h2_session, h2_stream);
3027 		}
3028 		http2_query_buffer_count += qlen;
3029 		lock_basic_unlock(&http2_query_buffer_count_lock);
3030 		if(!(h2_stream->qbuffer = sldns_buffer_new(qlen))) {
3031 			lock_basic_lock(&http2_query_buffer_count_lock);
3032 			http2_query_buffer_count -= qlen;
3033 			lock_basic_unlock(&http2_query_buffer_count_lock);
3034 		}
3035 	}
3036 
3037 	if(!h2_stream->qbuffer ||
3038 		sldns_buffer_remaining(h2_stream->qbuffer) < len) {
3039 		verbose(VERB_ALGO, "http2 data_chunck_recv failed. Not enough "
3040 			"buffer space for POST query. Can happen on multi "
3041 			"frame requests without content-length header");
3042 		h2_stream->query_too_large = 1;
3043 		return 0;
3044 	}
3045 
3046 	sldns_buffer_write(h2_stream->qbuffer, data, len);
3047 
3048 	return 0;
3049 }
3050 
3051 void http2_req_stream_clear(struct http2_stream* h2_stream)
3052 {
3053 	if(h2_stream->qbuffer) {
3054 		lock_basic_lock(&http2_query_buffer_count_lock);
3055 		http2_query_buffer_count -=
3056 			sldns_buffer_capacity(h2_stream->qbuffer);
3057 		lock_basic_unlock(&http2_query_buffer_count_lock);
3058 		sldns_buffer_free(h2_stream->qbuffer);
3059 		h2_stream->qbuffer = NULL;
3060 	}
3061 	if(h2_stream->rbuffer) {
3062 		lock_basic_lock(&http2_response_buffer_count_lock);
3063 		http2_response_buffer_count -=
3064 			sldns_buffer_capacity(h2_stream->rbuffer);
3065 		lock_basic_unlock(&http2_response_buffer_count_lock);
3066 		sldns_buffer_free(h2_stream->rbuffer);
3067 		h2_stream->rbuffer = NULL;
3068 	}
3069 }
3070 
3071 nghttp2_session_callbacks* http2_req_callbacks_create(void)
3072 {
3073 	nghttp2_session_callbacks *callbacks;
3074 	if(nghttp2_session_callbacks_new(&callbacks) == NGHTTP2_ERR_NOMEM) {
3075 		log_err("failed to initialize nghttp2 callback");
3076 		return NULL;
3077 	}
3078 	/* reception of header block started, used to create h2_stream */
3079 	nghttp2_session_callbacks_set_on_begin_headers_callback(callbacks,
3080 		http2_req_begin_headers_cb);
3081 	/* complete frame received, used to get data from stream if frame
3082 	 * has end stream flag, and start processing query */
3083 	nghttp2_session_callbacks_set_on_frame_recv_callback(callbacks,
3084 		http2_req_frame_recv_cb);
3085 	/* get request info from headers */
3086 	nghttp2_session_callbacks_set_on_header_callback(callbacks,
3087 		http2_req_header_cb);
3088 	/* get data from DATA frames, containing POST query */
3089 	nghttp2_session_callbacks_set_on_data_chunk_recv_callback(callbacks,
3090 		http2_req_data_chunk_recv_cb);
3091 
3092 	/* generic HTTP2 callbacks */
3093 	nghttp2_session_callbacks_set_recv_callback(callbacks, http2_recv_cb);
3094 	nghttp2_session_callbacks_set_send_callback(callbacks, http2_send_cb);
3095 	nghttp2_session_callbacks_set_on_stream_close_callback(callbacks,
3096 		http2_stream_close_cb);
3097 
3098 	return callbacks;
3099 }
3100 #endif /* HAVE_NGHTTP2 */
3101