xref: /netbsd/external/bsd/nsd/dist/rrl.c (revision d20bac77)
1 
2 /* rrl.c - Response Rate Limiting for NSD.
3  * By W.C.A. Wijngaards
4  * Copyright 2012, NLnet Labs.
5  * BSD, see LICENSE.
6  */
7 #include "config.h"
8 #include <errno.h>
9 #include "rrl.h"
10 #include "util.h"
11 #include "lookup3.h"
12 #include "options.h"
13 
14 #ifdef RATELIMIT
15 
16 #ifdef HAVE_MMAP
17 #include <sys/mman.h>
18 #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
19 #define MAP_ANONYMOUS   MAP_ANON
20 #endif
21 #endif /* HAVE_MMAP */
22 
23 
24 /**
25  * The rate limiting data structure bucket, this represents one rate of
26  * packets from a single source.
27  * Smoothed average rates.
28  */
29 struct rrl_bucket {
30 	/* the source netmask */
31 	uint64_t source;
32 	/* rate, in queries per second, which due to rate=r(t)+r(t-1)/2 is
33 	 * equal to double the queries per second */
34 	uint32_t rate;
35 	/* the full hash */
36 	uint32_t hash;
37 	/* counter for queries arrived in this second */
38 	uint32_t counter;
39 	/* timestamp, which time is the time of the counter, the rate is from
40 	 * one timestep before that. */
41 	int32_t stamp;
42 	/* flags for the source mask and type */
43 	uint16_t flags;
44 };
45 
/* the (global) array of RRL buckets; points at this child's table,
 * either heap allocated or one of the mmap regions (see rrl_init) */
static struct rrl_bucket* rrl_array = NULL;
/* number of buckets in the hashtable */
static size_t rrl_array_size = RRL_BUCKETS;
rrl_ratelimit = RRL_LIMIT; /* 2x qps */
static uint32_t rrl_ratelimit = RRL_LIMIT; /* 2x qps */
/* slip 1 out of this many rate-limited responses (see rrl_slip) */
static uint8_t rrl_slip_ratio = RRL_SLIP;
/* prefix lengths that group source addresses into netblocks */
static uint8_t rrl_ipv4_prefixlen = RRL_IPV4_PREFIX_LENGTH;
static uint8_t rrl_ipv6_prefixlen = RRL_IPV6_PREFIX_LENGTH;
static uint64_t rrl_ipv6_mask; /* max prefixlen 64 */
static uint32_t rrl_whitelist_ratelimit = RRL_WLIST_LIMIT; /* 2x qps */

/* the array of mmaps for the children (saved between reloads) */
static void** rrl_maps = NULL;
static size_t rrl_maps_num = 0;
59 
rrl_mmap_init(int numch,size_t numbuck,size_t lm,size_t wlm,size_t sm,size_t plf,size_t pls)60 void rrl_mmap_init(int numch, size_t numbuck, size_t lm, size_t wlm, size_t sm,
61 	size_t plf, size_t pls)
62 {
63 #ifdef HAVE_MMAP
64 	size_t i;
65 #endif
66 	if(numbuck != 0)
67 		rrl_array_size = numbuck;
68 	rrl_ratelimit = lm*2;
69 	rrl_slip_ratio = sm;
70 	rrl_ipv4_prefixlen = plf;
71 	rrl_ipv6_prefixlen = pls;
72 	if (pls <= 32) {
73 		rrl_ipv6_mask = ((uint64_t) htonl(0xffffffff << (32-pls))) << 32;
74 	} else {
75 		rrl_ipv6_mask =  ((uint64_t) htonl(0xffffffff << (64-pls))) |
76 			(((uint64_t)0xffffffff)<<32);
77 	}
78 	rrl_whitelist_ratelimit = wlm*2;
79 #ifdef HAVE_MMAP
80 	/* allocate the ratelimit hashtable in a memory map so it is
81 	 * preserved across reforks (every child its own table) */
82 	rrl_maps_num = (size_t)numch;
83 	rrl_maps = (void**)xmallocarray(rrl_maps_num, sizeof(void*));
84 	for(i=0; i<rrl_maps_num; i++) {
85 		rrl_maps[i] = mmap(NULL,
86 			sizeof(struct rrl_bucket)*rrl_array_size,
87 			PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
88 		if(rrl_maps[i] == MAP_FAILED) {
89 			log_msg(LOG_ERR, "rrl: mmap failed: %s",
90 				strerror(errno));
91 			exit(1);
92 		}
93 		memset(rrl_maps[i], 0,
94 			sizeof(struct rrl_bucket)*rrl_array_size);
95 	}
96 #else
97 	(void)numch;
98 	rrl_maps_num = 0;
99 	rrl_maps = NULL;
100 #endif
101 }
102 
/** unmap every per-child bucket table and release the pointer array */
void rrl_mmap_deinit(void)
{
#ifdef HAVE_MMAP
	size_t idx;
	const size_t map_len = sizeof(struct rrl_bucket)*rrl_array_size;
	for(idx=0; idx<rrl_maps_num; idx++) {
		munmap(rrl_maps[idx], map_len);
		rrl_maps[idx] = NULL;
	}
	free(rrl_maps);
	rrl_maps = NULL;
#endif
}
115 
/** free the pointer array but keep the mmap regions themselves mapped,
 * so the tables are saved between reloads (see rrl_maps) */
void rrl_mmap_deinit_keep_mmap(void)
{
#ifdef HAVE_MMAP
	free(rrl_maps);
	rrl_maps = NULL;
#endif
}
123 
rrl_set_limit(size_t lm,size_t wlm,size_t sm)124 void rrl_set_limit(size_t lm, size_t wlm, size_t sm)
125 {
126 	rrl_ratelimit = lm*2;
127 	rrl_whitelist_ratelimit = wlm*2;
128 	rrl_slip_ratio = sm;
129 }
130 
/** attach this child (index ch) to its bucket table.  Without a
 * preallocated mmap region (no mmap support, or index out of range) a
 * zeroed heap table is used instead. */
void rrl_init(size_t ch)
{
	if(!rrl_maps || ch >= rrl_maps_num)
	    rrl_array = xalloc_array_zero(sizeof(struct rrl_bucket),
	    	rrl_array_size);
#ifdef HAVE_MMAP
	/* shared map: contents survive reforks of this child */
	else rrl_array = (struct rrl_bucket*)rrl_maps[ch];
#endif
}
140 
rrl_deinit(size_t ch)141 void rrl_deinit(size_t ch)
142 {
143 	if(!rrl_maps || ch >= rrl_maps_num)
144 		free(rrl_array);
145 	rrl_array = NULL;
146 }
147 
/** return the source netblock of the query, this is the genuine source
 * for genuine queries and the target for reflected packets */
static uint64_t rrl_get_source(query_type* query, uint16_t* c2)
{
	/* note there is an IPv6 subnet, that maps
	 * to the same buckets as IPv4 space, but there is a flag in c2
	 * that makes the hash different */
#ifdef INET6
	if( ((struct sockaddr_in*)&query->addr)->sin_family == AF_INET) {
		/* IPv4: the address in network byte order, masked down to
		 * the configured prefix length.
		 * NOTE(review): rrl_ipv4_prefixlen==0 would make this a
		 * shift by 32 (undefined); presumably option parsing
		 * enforces >= 1 — verify */
		*c2 = 0;
		return ((struct sockaddr_in*)&query->addr)->
			sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
	} else {
		/* IPv6: raw first 64 bits of the address, masked to the
		 * configured prefix (rrl_ipv6_mask, max /64) */
		uint64_t s;
		*c2 = rrl_ip6;
		memmove(&s, &((struct sockaddr_in6*)&query->addr)->sin6_addr,
			sizeof(s));
		return s & rrl_ipv6_mask;
	}
#else
	*c2 = 0;
	return query->addr.sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
#endif
}
172 
173 /** debug source to string */
rrlsource2str(uint64_t s,uint16_t c2)174 static const char* rrlsource2str(uint64_t s, uint16_t c2)
175 {
176 	static char buf[64];
177 	struct in_addr a4;
178 #ifdef INET6
179 	if(c2) {
180 		/* IPv6 */
181 		struct in6_addr a6;
182 		memset(&a6, 0, sizeof(a6));
183 		memmove(&a6, &s, sizeof(s));
184 		if(!inet_ntop(AF_INET6, &a6, buf, sizeof(buf)))
185 			strlcpy(buf, "[ip6 ntop failed]", sizeof(buf));
186 		else {
187 			static char prefix[5];
188 			snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv6_prefixlen);
189 			strlcat(buf, &prefix[0], sizeof(buf));
190 		}
191 		return buf;
192 	}
193 #else
194 	(void)c2;
195 #endif
196 	/* ipv4 */
197 	a4.s_addr = (uint32_t)s;
198 	if(!inet_ntop(AF_INET, &a4, buf, sizeof(buf)))
199 		strlcpy(buf, "[ip4 ntop failed]", sizeof(buf));
200 	else {
201 		static char prefix[5];
202 		snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv4_prefixlen);
203 		strlcat(buf, &prefix[0], sizeof(buf));
204 	}
205 	return buf;
206 }
207 
rrlstr2type(const char * s)208 enum rrl_type rrlstr2type(const char* s)
209 {
210 	if(strcmp(s, "nxdomain")==0) return rrl_type_nxdomain;
211 	else if(strcmp(s, "error")==0) return rrl_type_error;
212 	else if(strcmp(s, "referral")==0) return rrl_type_referral;
213 	else if(strcmp(s, "any")==0) return rrl_type_any;
214 	else if(strcmp(s, "wildcard")==0) return rrl_type_wildcard;
215 	else if(strcmp(s, "nodata")==0) return rrl_type_nodata;
216 	else if(strcmp(s, "dnskey")==0) return rrl_type_dnskey;
217 	else if(strcmp(s, "positive")==0) return rrl_type_positive;
218 	else if(strcmp(s, "rrsig")==0) return rrl_type_rrsig;
219 	else if(strcmp(s, "all")==0) return rrl_type_all;
220 	return 0; /* unknown */
221 }
222 
rrltype2str(enum rrl_type c)223 const char* rrltype2str(enum rrl_type c)
224 {
225 	switch(c & 0x0fff) {
226 		case rrl_type_nxdomain: return "nxdomain";
227 		case rrl_type_error: return "error";
228 		case rrl_type_referral: return "referral";
229 		case rrl_type_any: return "any";
230 		case rrl_type_wildcard: return "wildcard";
231 		case rrl_type_nodata: return "nodata";
232 		case rrl_type_dnskey: return "dnskey";
233 		case rrl_type_positive: return "positive";
234 		case rrl_type_rrsig: return "rrsig";
235 		case rrl_type_all: return "all";
236 	}
237 	return "unknown";
238 }
239 
240 /** classify the query in a number of different types, each has separate
241  * ratelimiting, so that positive queries are not impeded by others */
rrl_classify(query_type * query,const uint8_t ** d,size_t * d_len)242 static uint16_t rrl_classify(query_type* query, const uint8_t** d,
243 	size_t* d_len)
244 {
245 	if(RCODE(query->packet) == RCODE_NXDOMAIN) {
246 		if(query->zone && query->zone->apex) {
247 			*d = dname_name(domain_dname(query->zone->apex));
248 			*d_len = domain_dname(query->zone->apex)->name_size;
249 		}
250 		return rrl_type_nxdomain;
251 	}
252 	if(RCODE(query->packet) != RCODE_OK) {
253 		if(query->zone && query->zone->apex) {
254 			*d = dname_name(domain_dname(query->zone->apex));
255 			*d_len = domain_dname(query->zone->apex)->name_size;
256 		}
257 		return rrl_type_error;
258 	}
259 	if(query->delegation_domain) {
260 		*d = dname_name(domain_dname(query->delegation_domain));
261 		*d_len = domain_dname(query->delegation_domain)->name_size;
262 		return rrl_type_referral;
263 	}
264 	if(query->qtype == TYPE_ANY) {
265 		if(query->qname) {
266 			*d = dname_name(query->qname);
267 			*d_len = query->qname->name_size;
268 		}
269 		return rrl_type_any;
270 	}
271 	if(query->qtype == TYPE_RRSIG) {
272 		if(query->qname) {
273 			*d = dname_name(query->qname);
274 			*d_len = query->qname->name_size;
275 		}
276 		return rrl_type_rrsig;
277 	}
278 	if(query->wildcard_domain) {
279 		*d = dname_name(domain_dname(query->wildcard_domain));
280 		*d_len = domain_dname(query->wildcard_domain)->name_size;
281 		return rrl_type_wildcard;
282 	}
283 	if(ANCOUNT(query->packet) == 0) {
284 		if(query->zone && query->zone->apex) {
285 			*d = dname_name(domain_dname(query->zone->apex));
286 			*d_len = domain_dname(query->zone->apex)->name_size;
287 		}
288 		return rrl_type_nodata;
289 	}
290 	if(query->qtype == TYPE_DNSKEY) {
291 		if(query->qname) {
292 			*d = dname_name(query->qname);
293 			*d_len = query->qname->name_size;
294 		}
295 		return rrl_type_dnskey;
296 	}
297 	/* positive */
298 	if(query->qname) {
299 		*d = dname_name(query->qname);
300 		*d_len = query->qname->name_size;
301 	}
302 	return rrl_type_positive;
303 }
304 
/** Examine the query and return hash and source of netblock.
 * On return *lm holds the applicable limit (whitelist limit for
 * whitelisted zone/class combinations); when *lm is 0 the function
 * returns early and *hash/*flags are NOT set — callers must check *lm
 * first. */
static void examine_query(query_type* query, uint32_t* hash, uint64_t* source,
	uint16_t* flags, uint32_t* lm)
{
	/* compile a binary string representing the query */
	uint16_t c, c2;
	/* size with 16 bytes to spare */
	uint8_t buf[MAXDOMAINLEN + sizeof(*source) + sizeof(c) + 16];
	const uint8_t* dname = NULL; size_t dname_len = 0;
	uint32_t r = 0x267fcd16;	/* fixed hash seed */

	*source = rrl_get_source(query, &c2);
	c = rrl_classify(query, &dname, &dname_len);
	/* whitelisted zone+class: use the whitelist ratelimit instead */
	if(query->zone && query->zone->opts &&
		(query->zone->opts->pattern->rrl_whitelist & c))
		*lm = rrl_whitelist_ratelimit;
	if(*lm == 0) return;
	c |= c2;	/* fold in the ipv6 flag so v4/v6 hash differently */
	*flags = c;
	/* hash input layout: [source netblock][class+flags][dname] */
	memmove(buf, source, sizeof(*source));
	memmove(buf+sizeof(*source), &c, sizeof(c));

	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "rrl_examine type %s name %s", rrltype2str(c), dname?wiredname2str(dname):"NULL"));

	/* and hash it */
	if(dname && dname_len <= MAXDOMAINLEN) {
		memmove(buf+sizeof(*source)+sizeof(c), dname, dname_len);
		*hash = hashlittle(buf, sizeof(*source)+sizeof(c)+dname_len, r);
	} else
		*hash = hashlittle(buf, sizeof(*source)+sizeof(c), r);
}
336 
337 /* age the bucket because elapsed time steps have gone by */
rrl_attenuate_bucket(struct rrl_bucket * b,int32_t elapsed)338 static void rrl_attenuate_bucket(struct rrl_bucket* b, int32_t elapsed)
339 {
340 	if(elapsed > 16) {
341 		b->rate = 0;
342 	} else {
343 		/* divide rate /2 for every elapsed time step, because
344 		 * the counters in the inbetween steps were 0 */
345 		/* r(t) = 0 + 0/2 + 0/4 + .. + oldrate/2^dt */
346 		b->rate >>= elapsed;
347 		/* we know that elapsed >= 2 */
348 		b->rate += (b->counter>>(elapsed-1));
349 	}
350 }
351 
352 /** log a message about ratelimits */
353 static void
rrl_msg(query_type * query,const char * str)354 rrl_msg(query_type* query, const char* str)
355 {
356 	uint16_t c, c2, wl = 0;
357 	const uint8_t* d = NULL;
358 	size_t d_len;
359 	uint64_t s;
360 	char address[128];
361 	if(verbosity < 1) return;
362 	addr2str(&query->addr, address, sizeof(address));
363 	s = rrl_get_source(query, &c2);
364 	c = rrl_classify(query, &d, &d_len) | c2;
365 	if(query->zone && query->zone->opts &&
366 		(query->zone->opts->pattern->rrl_whitelist & c))
367 		wl = 1;
368 	log_msg(LOG_INFO, "ratelimit %s %s type %s%s target %s query %s %s",
369 		str, d?wiredname2str(d):"", rrltype2str(c),
370 		wl?"(whitelisted)":"", rrlsource2str(s, c2),
371 		address, rrtype_to_string(query->qtype));
372 }
373 
/** true if the query used to be blocked by the ratelimit: either the
 * smoothed rate itself, or the rate projected from the current counter
 * (counter + rate/2), reaches the limit */
static int
used_to_block(uint32_t rate, uint32_t counter, uint32_t lm)
{
	if(rate >= lm)
		return 1;
	return counter + rate/2 >= lm;
}
380 
/** update the rate in a ratelimit bucket, return actual rate.
 * now is a circular int32 timestamp in seconds; lm the applicable
 * limit (2x qps). */
uint32_t rrl_update(query_type* query, uint32_t hash, uint64_t source,
	uint16_t flags, int32_t now, uint32_t lm)
{
	/* one bucket per slot; a colliding key simply takes the slot over */
	struct rrl_bucket* b = &rrl_array[hash % rrl_array_size];

	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "source %llx hash %x oldrate %d oldcount %d stamp %d",
		(long long unsigned)source, hash, b->rate, b->counter, b->stamp));

	/* check if different source */
	if(b->source != source || b->flags != flags || b->hash != hash) {
		/* initialise */
		/* potentially the wrong limit here, used lower nonwhitelim */
		if(verbosity >= 1 &&
			used_to_block(b->rate, b->counter, rrl_ratelimit)) {
			/* the evicted key was being blocked; log that it is
			 * implicitly unblocked by the collision */
			char address[128];
			addr2str(&query->addr, address, sizeof(address));
			log_msg(LOG_INFO, "ratelimit unblock ~ type %s target %s query %s %s (%s collision)",
				rrltype2str(b->flags),
				rrlsource2str(b->source, b->flags),
				address, rrtype_to_string(query->qtype),
				(b->hash!=hash?"bucket":"hash"));
		}
		/* take over the bucket for the new key; this query counts */
		b->hash = hash;
		b->source = source;
		b->flags = flags;
		b->counter = 1;
		b->rate = 0;
		b->stamp = now;
		return 1;
	}
	/* this is the same source */

	/* check if old, zero or smooth it */
	/* circular arith for time */
	if(now - b->stamp == 1) {
		/* very busy bucket and time just stepped one step */
		int oldblock = used_to_block(b->rate, b->counter, lm);
		/* smoothed average: r(t) = c(t) + r(t-1)/2 */
		b->rate = b->rate/2 + b->counter;
		if(oldblock && b->rate < lm)
			rrl_msg(query, "unblock");
		b->counter = 1;
		b->stamp = now;
	} else if(now - b->stamp > 0) {
		/* older bucket: decay the rate over the idle steps */
		int olderblock = used_to_block(b->rate, b->counter, lm);
		rrl_attenuate_bucket(b, now - b->stamp);
		if(olderblock && b->rate < lm)
			rrl_msg(query, "unblock");
		b->counter = 1;
		b->stamp = now;
	} else if(now != b->stamp) {
		/* robust, timestamp from the future */
		if(used_to_block(b->rate, b->counter, lm))
			rrl_msg(query, "unblock");
		b->rate = 0;
		b->counter = 1;
		b->stamp = now;
	} else {
		/* bucket is from the current timestep, update counter */
		b->counter ++;

		/* log what is blocked for operational debugging */
		if(b->counter + b->rate/2 == lm && b->rate < lm)
			rrl_msg(query, "block");
	}

	/* return max from current rate and projected next-value for rate */
	/* so that if the rate increases suddenly very high, it is
	 * stopped halfway into the time step */
	if(b->counter > b->rate/2)
		return b->counter + b->rate/2;
	return b->rate;
}
455 
/** account this query in the ratelimit buckets; returns nonzero when
 * the applicable limit is reached */
int rrl_process_query(query_type* query)
{
	uint64_t source;
	uint32_t hash;
	uint32_t lm = rrl_ratelimit;
	uint16_t flags;
	/* we can use circular arithmetic here, so int32 works after 2038 */
	int32_t now = (int32_t)time(NULL);

	/* both limits zero: rate limiting is disabled entirely */
	if(rrl_ratelimit == 0 && rrl_whitelist_ratelimit == 0)
		return 0;

	/* examine query */
	examine_query(query, &hash, &source, &flags, &lm);
	if(lm == 0)
		return 0; /* no limit for this */

	/* update rate */
	return rrl_update(query, hash, source, flags, now, lm) >= lm;
}
476 
/** for a rate-limited response, randomly either "slip" a truncated
 * reply (TC set, empty sections) or discard the packet entirely.
 * Slip ratio 0 never slips, 1 always slips, n slips 1 out of n. */
query_state_type rrl_slip(query_type* query)
{
	/* discard number the packets, randomly */
#ifdef HAVE_ARC4RANDOM_UNIFORM
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random_uniform(rrl_slip_ratio)) == 0))) {
#elif HAVE_ARC4RANDOM
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random() % rrl_slip_ratio) == 0))) {
#else
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((random() % rrl_slip_ratio) == 0))) {
#endif
		/* set TC on the rest */
		TC_SET(query->packet);
		ANCOUNT_SET(query->packet, 0);
		NSCOUNT_SET(query->packet, 0);
		ARCOUNT_SET(query->packet, 0);
		if(query->qname)
			/* truncate the packet after the question section:
			 * header, type, class, qname */
			buffer_set_position(query->packet,
				QHEADERSZ+4+query->qname->name_size);
		else 	buffer_set_position(query->packet, QHEADERSZ);
		return QUERY_PROCESSED;
	}
	return QUERY_DISCARDED;
}
501 
502 #endif /* RATELIMIT */
503