1
2 /* rrl.c - Response Rate Limiting for NSD.
3 * By W.C.A. Wijngaards
4 * Copyright 2012, NLnet Labs.
5 * BSD, see LICENSE.
6 */
7 #include "config.h"
8 #include <errno.h>
9 #include "rrl.h"
10 #include "util.h"
11 #include "lookup3.h"
12 #include "options.h"
13
14 #ifdef RATELIMIT
15
16 #ifdef HAVE_MMAP
17 #include <sys/mman.h>
18 #if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
19 #define MAP_ANONYMOUS MAP_ANON
20 #endif
21 #endif /* HAVE_MMAP */
22
23
/**
 * The rate limiting data structure bucket, this represents one rate of
 * packets from a single source.
 * Smoothed average rates.
 * Note: buckets live in an mmap'ed table that is preserved across reloads
 * (see rrl_mmap_init), so the field layout must stay stable.
 */
struct rrl_bucket {
	/* the source netmask (the masked address bits identifying the
	 * netblock, as produced by rrl_get_source) */
	uint64_t source;
	/* rate, in queries per second, which due to rate=r(t)+r(t-1)/2 is
	 * equal to double the queries per second */
	uint32_t rate;
	/* the full hash (kept to tell apart keys that collide on the
	 * same bucket index) */
	uint32_t hash;
	/* counter for queries arrived in this second */
	uint32_t counter;
	/* timestamp, which time is the time of the counter, the rate is from
	 * one timestep before that. */
	int32_t stamp;
	/* flags for the source mask and type (classification | ip6 flag) */
	uint16_t flags;
};
45
/* the (global) array of RRL buckets (per-process view; may point into an
 * inherited mmap, see rrl_init) */
static struct rrl_bucket* rrl_array = NULL;
/* number of buckets in the hash table */
static size_t rrl_array_size = RRL_BUCKETS;
static uint32_t rrl_ratelimit = RRL_LIMIT; /* 2x qps */
/* 1/slip_ratio of the blocked replies are sent truncated, see rrl_slip */
static uint8_t rrl_slip_ratio = RRL_SLIP;
/* prefix lengths used to aggregate sources into netblocks */
static uint8_t rrl_ipv4_prefixlen = RRL_IPV4_PREFIX_LENGTH;
static uint8_t rrl_ipv6_prefixlen = RRL_IPV6_PREFIX_LENGTH;
static uint64_t rrl_ipv6_mask; /* max prefixlen 64 */
static uint32_t rrl_whitelist_ratelimit = RRL_WLIST_LIMIT; /* 2x qps */

/* the array of mmaps for the children (saved between reloads) */
static void** rrl_maps = NULL;
static size_t rrl_maps_num = 0;
59
/** Initialise rate limiting at server startup (before the children fork).
 * numch: number of child servers, each gets its own bucket table.
 * numbuck: number of hash buckets (0 keeps the compiled-in default).
 * lm, wlm: ratelimit and whitelist-ratelimit in qps; stored doubled
 *	because rate = r(t) + r(t-1)/2 counts double (see struct rrl_bucket).
 * sm: slip ratio, plf/pls: IPv4/IPv6 aggregation prefix lengths. */
void rrl_mmap_init(int numch, size_t numbuck, size_t lm, size_t wlm, size_t sm,
	size_t plf, size_t pls)
{
#ifdef HAVE_MMAP
	size_t i;
#endif
	if(numbuck != 0)
		rrl_array_size = numbuck;
	rrl_ratelimit = lm*2;
	rrl_slip_ratio = sm;
	rrl_ipv4_prefixlen = plf;
	rrl_ipv6_prefixlen = pls;
	/* precompute the IPv6 source mask covering the first 64 address
	 * bits, built in network byte order one 32-bit half at a time.
	 * NOTE(review): pls == 0 would shift a 32-bit value by 32, which is
	 * undefined behavior; presumably the option parser rejects 0 —
	 * confirm against options.c. */
	if (pls <= 32) {
		rrl_ipv6_mask = ((uint64_t) htonl(0xffffffff << (32-pls))) << 32;
	} else {
		rrl_ipv6_mask = ((uint64_t) htonl(0xffffffff << (64-pls))) |
			(((uint64_t)0xffffffff)<<32);
	}
	rrl_whitelist_ratelimit = wlm*2;
#ifdef HAVE_MMAP
	/* allocate the ratelimit hashtable in a memory map so it is
	 * preserved across reforks (every child its own table) */
	rrl_maps_num = (size_t)numch;
	rrl_maps = (void**)xmallocarray(rrl_maps_num, sizeof(void*));
	for(i=0; i<rrl_maps_num; i++) {
		rrl_maps[i] = mmap(NULL,
			sizeof(struct rrl_bucket)*rrl_array_size,
			PROT_READ|PROT_WRITE, MAP_SHARED|MAP_ANONYMOUS, -1, 0);
		if(rrl_maps[i] == MAP_FAILED) {
			log_msg(LOG_ERR, "rrl: mmap failed: %s",
				strerror(errno));
			exit(1);
		}
		/* zeroed table: all buckets empty */
		memset(rrl_maps[i], 0,
			sizeof(struct rrl_bucket)*rrl_array_size);
	}
#else
	/* without mmap the tables are allocated per-child in rrl_init */
	(void)numch;
	rrl_maps_num = 0;
	rrl_maps = NULL;
#endif
}
102
/** unmap every child's bucket table and release the pointer array */
void rrl_mmap_deinit(void)
{
#ifdef HAVE_MMAP
	const size_t region_len = sizeof(struct rrl_bucket)*rrl_array_size;
	size_t idx = 0;
	while(idx < rrl_maps_num) {
		munmap(rrl_maps[idx], region_len);
		rrl_maps[idx] = NULL;
		idx++;
	}
	free(rrl_maps);
	rrl_maps = NULL;
#endif
}
115
/** release only the pointer array; the mapped regions themselves stay
 * alive so that re-forked children keep their inherited tables */
void rrl_mmap_deinit_keep_mmap(void)
{
#ifdef HAVE_MMAP
	free(rrl_maps);
	rrl_maps = NULL;
#endif
}
123
rrl_set_limit(size_t lm,size_t wlm,size_t sm)124 void rrl_set_limit(size_t lm, size_t wlm, size_t sm)
125 {
126 rrl_ratelimit = lm*2;
127 rrl_whitelist_ratelimit = wlm*2;
128 rrl_slip_ratio = sm;
129 }
130
rrl_init(size_t ch)131 void rrl_init(size_t ch)
132 {
133 if(!rrl_maps || ch >= rrl_maps_num)
134 rrl_array = xalloc_array_zero(sizeof(struct rrl_bucket),
135 rrl_array_size);
136 #ifdef HAVE_MMAP
137 else rrl_array = (struct rrl_bucket*)rrl_maps[ch];
138 #endif
139 }
140
rrl_deinit(size_t ch)141 void rrl_deinit(size_t ch)
142 {
143 if(!rrl_maps || ch >= rrl_maps_num)
144 free(rrl_array);
145 rrl_array = NULL;
146 }
147
/** return the source netblock of the query, this is the genuine source
 * for genuine queries and the target for reflected packets.
 * IPv4 yields the masked 32-bit address (network byte order) in the low
 * half; IPv6 yields the masked first 64 bits of the address.  *c2 gets
 * the rrl_ip6 flag for IPv6 so that an IPv6 subnet that maps to the same
 * 64-bit value as IPv4 space still hashes differently. */
static uint64_t rrl_get_source(query_type* query, uint16_t* c2)
{
	/* note there is an IPv6 subnet, that maps
	 * to the same buckets as IPv4 space, but there is a flag in c2
	 * that makes the hash different */
#ifdef INET6
	if( ((struct sockaddr_in*)&query->addr)->sin_family == AF_INET) {
		*c2 = 0;
		/* NOTE(review): rrl_ipv4_prefixlen == 0 would shift by 32,
		 * which is undefined; presumably option parsing enforces
		 * 1..32 — confirm. */
		return ((struct sockaddr_in*)&query->addr)->
			sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
	} else {
		uint64_t s;
		*c2 = rrl_ip6;
		/* copy the leading 8 bytes of the IPv6 address, then mask
		 * down to the configured prefix */
		memmove(&s, &((struct sockaddr_in6*)&query->addr)->sin6_addr,
			sizeof(s));
		return s & rrl_ipv6_mask;
	}
#else
	*c2 = 0;
	return query->addr.sin_addr.s_addr & htonl(0xffffffff << (32-rrl_ipv4_prefixlen));
#endif
}
172
173 /** debug source to string */
rrlsource2str(uint64_t s,uint16_t c2)174 static const char* rrlsource2str(uint64_t s, uint16_t c2)
175 {
176 static char buf[64];
177 struct in_addr a4;
178 #ifdef INET6
179 if(c2) {
180 /* IPv6 */
181 struct in6_addr a6;
182 memset(&a6, 0, sizeof(a6));
183 memmove(&a6, &s, sizeof(s));
184 if(!inet_ntop(AF_INET6, &a6, buf, sizeof(buf)))
185 strlcpy(buf, "[ip6 ntop failed]", sizeof(buf));
186 else {
187 static char prefix[5];
188 snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv6_prefixlen);
189 strlcat(buf, &prefix[0], sizeof(buf));
190 }
191 return buf;
192 }
193 #else
194 (void)c2;
195 #endif
196 /* ipv4 */
197 a4.s_addr = (uint32_t)s;
198 if(!inet_ntop(AF_INET, &a4, buf, sizeof(buf)))
199 strlcpy(buf, "[ip4 ntop failed]", sizeof(buf));
200 else {
201 static char prefix[5];
202 snprintf(prefix, sizeof(prefix), "/%d", rrl_ipv4_prefixlen);
203 strlcat(buf, &prefix[0], sizeof(buf));
204 }
205 return buf;
206 }
207
rrlstr2type(const char * s)208 enum rrl_type rrlstr2type(const char* s)
209 {
210 if(strcmp(s, "nxdomain")==0) return rrl_type_nxdomain;
211 else if(strcmp(s, "error")==0) return rrl_type_error;
212 else if(strcmp(s, "referral")==0) return rrl_type_referral;
213 else if(strcmp(s, "any")==0) return rrl_type_any;
214 else if(strcmp(s, "wildcard")==0) return rrl_type_wildcard;
215 else if(strcmp(s, "nodata")==0) return rrl_type_nodata;
216 else if(strcmp(s, "dnskey")==0) return rrl_type_dnskey;
217 else if(strcmp(s, "positive")==0) return rrl_type_positive;
218 else if(strcmp(s, "rrsig")==0) return rrl_type_rrsig;
219 else if(strcmp(s, "all")==0) return rrl_type_all;
220 return 0; /* unknown */
221 }
222
rrltype2str(enum rrl_type c)223 const char* rrltype2str(enum rrl_type c)
224 {
225 switch(c & 0x0fff) {
226 case rrl_type_nxdomain: return "nxdomain";
227 case rrl_type_error: return "error";
228 case rrl_type_referral: return "referral";
229 case rrl_type_any: return "any";
230 case rrl_type_wildcard: return "wildcard";
231 case rrl_type_nodata: return "nodata";
232 case rrl_type_dnskey: return "dnskey";
233 case rrl_type_positive: return "positive";
234 case rrl_type_rrsig: return "rrsig";
235 case rrl_type_all: return "all";
236 }
237 return "unknown";
238 }
239
240 /** classify the query in a number of different types, each has separate
241 * ratelimiting, so that positive queries are not impeded by others */
rrl_classify(query_type * query,const uint8_t ** d,size_t * d_len)242 static uint16_t rrl_classify(query_type* query, const uint8_t** d,
243 size_t* d_len)
244 {
245 if(RCODE(query->packet) == RCODE_NXDOMAIN) {
246 if(query->zone && query->zone->apex) {
247 *d = dname_name(domain_dname(query->zone->apex));
248 *d_len = domain_dname(query->zone->apex)->name_size;
249 }
250 return rrl_type_nxdomain;
251 }
252 if(RCODE(query->packet) != RCODE_OK) {
253 if(query->zone && query->zone->apex) {
254 *d = dname_name(domain_dname(query->zone->apex));
255 *d_len = domain_dname(query->zone->apex)->name_size;
256 }
257 return rrl_type_error;
258 }
259 if(query->delegation_domain) {
260 *d = dname_name(domain_dname(query->delegation_domain));
261 *d_len = domain_dname(query->delegation_domain)->name_size;
262 return rrl_type_referral;
263 }
264 if(query->qtype == TYPE_ANY) {
265 if(query->qname) {
266 *d = dname_name(query->qname);
267 *d_len = query->qname->name_size;
268 }
269 return rrl_type_any;
270 }
271 if(query->qtype == TYPE_RRSIG) {
272 if(query->qname) {
273 *d = dname_name(query->qname);
274 *d_len = query->qname->name_size;
275 }
276 return rrl_type_rrsig;
277 }
278 if(query->wildcard_domain) {
279 *d = dname_name(domain_dname(query->wildcard_domain));
280 *d_len = domain_dname(query->wildcard_domain)->name_size;
281 return rrl_type_wildcard;
282 }
283 if(ANCOUNT(query->packet) == 0) {
284 if(query->zone && query->zone->apex) {
285 *d = dname_name(domain_dname(query->zone->apex));
286 *d_len = domain_dname(query->zone->apex)->name_size;
287 }
288 return rrl_type_nodata;
289 }
290 if(query->qtype == TYPE_DNSKEY) {
291 if(query->qname) {
292 *d = dname_name(query->qname);
293 *d_len = query->qname->name_size;
294 }
295 return rrl_type_dnskey;
296 }
297 /* positive */
298 if(query->qname) {
299 *d = dname_name(query->qname);
300 *d_len = query->qname->name_size;
301 }
302 return rrl_type_positive;
303 }
304
/** Examine the query and return hash and source of netblock.
 * On return *source holds the masked source netblock, *flags the
 * classification (type bits | ip6 flag) and *hash the hash over
 * source+flags+name.  When the zone whitelists this query type, *lm is
 * replaced with the whitelist ratelimit; if *lm ends up 0 the function
 * returns early (no limiting applies) and *flags/*hash are left unset. */
static void examine_query(query_type* query, uint32_t* hash, uint64_t* source,
	uint16_t* flags, uint32_t* lm)
{
	/* compile a binary string representing the query */
	uint16_t c, c2;
	/* size with 16 bytes to spare */
	uint8_t buf[MAXDOMAINLEN + sizeof(*source) + sizeof(c) + 16];
	const uint8_t* dname = NULL; size_t dname_len = 0;
	/* fixed initial value for the hash */
	uint32_t r = 0x267fcd16;

	*source = rrl_get_source(query, &c2);
	c = rrl_classify(query, &dname, &dname_len);
	if(query->zone && query->zone->opts &&
		(query->zone->opts->pattern->rrl_whitelist & c))
		*lm = rrl_whitelist_ratelimit;
	if(*lm == 0) return;
	c |= c2;
	*flags = c;
	/* buf layout: [source bytes][classification bytes][dname bytes] */
	memmove(buf, source, sizeof(*source));
	memmove(buf+sizeof(*source), &c, sizeof(c));

	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "rrl_examine type %s name %s", rrltype2str(c), dname?wiredname2str(dname):"NULL"));

	/* and hash it */
	if(dname && dname_len <= MAXDOMAINLEN) {
		memmove(buf+sizeof(*source)+sizeof(c), dname, dname_len);
		*hash = hashlittle(buf, sizeof(*source)+sizeof(c)+dname_len, r);
	} else
		*hash = hashlittle(buf, sizeof(*source)+sizeof(c), r);
}
336
337 /* age the bucket because elapsed time steps have gone by */
rrl_attenuate_bucket(struct rrl_bucket * b,int32_t elapsed)338 static void rrl_attenuate_bucket(struct rrl_bucket* b, int32_t elapsed)
339 {
340 if(elapsed > 16) {
341 b->rate = 0;
342 } else {
343 /* divide rate /2 for every elapsed time step, because
344 * the counters in the inbetween steps were 0 */
345 /* r(t) = 0 + 0/2 + 0/4 + .. + oldrate/2^dt */
346 b->rate >>= elapsed;
347 /* we know that elapsed >= 2 */
348 b->rate += (b->counter>>(elapsed-1));
349 }
350 }
351
352 /** log a message about ratelimits */
353 static void
rrl_msg(query_type * query,const char * str)354 rrl_msg(query_type* query, const char* str)
355 {
356 uint16_t c, c2, wl = 0;
357 const uint8_t* d = NULL;
358 size_t d_len;
359 uint64_t s;
360 char address[128];
361 if(verbosity < 1) return;
362 addr2str(&query->addr, address, sizeof(address));
363 s = rrl_get_source(query, &c2);
364 c = rrl_classify(query, &d, &d_len) | c2;
365 if(query->zone && query->zone->opts &&
366 (query->zone->opts->pattern->rrl_whitelist & c))
367 wl = 1;
368 log_msg(LOG_INFO, "ratelimit %s %s type %s%s target %s query %s %s",
369 str, d?wiredname2str(d):"", rrltype2str(c),
370 wl?"(whitelisted)":"", rrlsource2str(s, c2),
371 address, rrtype_to_string(query->qtype));
372 }
373
/** true if the query used to be blocked by the ratelimit:
 * either the smoothed rate already reaches lm, or the projected rate
 * for the current step (counter + rate/2) does */
static int
used_to_block(uint32_t rate, uint32_t counter, uint32_t lm)
{
	uint32_t projected = counter + rate/2;
	if(rate >= lm)
		return 1;
	return projected >= lm ? 1 : 0;
}
380
/** update the rate in a ratelimit bucket, return actual rate.
 * hash selects the bucket; source/flags/hash together identify the key.
 * now is the current second (circular int32 arithmetic, works past 2038);
 * lm is the applicable limit (2x qps), used for block/unblock logging. */
uint32_t rrl_update(query_type* query, uint32_t hash, uint64_t source,
	uint16_t flags, int32_t now, uint32_t lm)
{
	struct rrl_bucket* b = &rrl_array[hash % rrl_array_size];

	DEBUG(DEBUG_QUERY, 1, (LOG_INFO, "source %llx hash %x oldrate %d oldcount %d stamp %d",
		(long long unsigned)source, hash, b->rate, b->counter, b->stamp));

	/* check if different source */
	if(b->source != source || b->flags != flags || b->hash != hash) {
		/* initialise: another key claims this bucket (first use or a
		 * collision); the previous occupant's state is discarded */
		/* potentially the wrong limit here, used lower nonwhitelim */
		if(verbosity >= 1 &&
			used_to_block(b->rate, b->counter, rrl_ratelimit)) {
			char address[128];
			addr2str(&query->addr, address, sizeof(address));
			log_msg(LOG_INFO, "ratelimit unblock ~ type %s target %s query %s %s (%s collision)",
				rrltype2str(b->flags),
				rrlsource2str(b->source, b->flags),
				address, rrtype_to_string(query->qtype),
				(b->hash!=hash?"bucket":"hash"));
		}
		b->hash = hash;
		b->source = source;
		b->flags = flags;
		b->counter = 1;
		b->rate = 0;
		b->stamp = now;
		return 1;
	}
	/* this is the same source */

	/* check if old, zero or smooth it */
	/* circular arith for time */
	if(now - b->stamp == 1) {
		/* very busy bucket and time just stepped one step:
		 * smooth the rate, r(t) = counter + r(t-1)/2 */
		int oldblock = used_to_block(b->rate, b->counter, lm);
		b->rate = b->rate/2 + b->counter;
		if(oldblock && b->rate < lm)
			rrl_msg(query, "unblock");
		b->counter = 1;
		b->stamp = now;
	} else if(now - b->stamp > 0) {
		/* older bucket: attenuate over the idle timesteps */
		int olderblock = used_to_block(b->rate, b->counter, lm);
		rrl_attenuate_bucket(b, now - b->stamp);
		if(olderblock && b->rate < lm)
			rrl_msg(query, "unblock");
		b->counter = 1;
		b->stamp = now;
	} else if(now != b->stamp) {
		/* robust, timestamp from the future: restart the bucket */
		if(used_to_block(b->rate, b->counter, lm))
			rrl_msg(query, "unblock");
		b->rate = 0;
		b->counter = 1;
		b->stamp = now;
	} else {
		/* bucket is from the current timestep, update counter */
		b->counter ++;

		/* log what is blocked for operational debugging; the ==
		 * comparison logs exactly once, when the limit is reached */
		if(b->counter + b->rate/2 == lm && b->rate < lm)
			rrl_msg(query, "block");
	}

	/* return max from current rate and projected next-value for rate */
	/* so that if the rate increases suddenly very high, it is
	 * stopped halfway into the time step */
	if(b->counter > b->rate/2)
		return b->counter + b->rate/2;
	return b->rate;
}
455
/** ratelimit the query; returns nonzero when it must be dropped/slipped */
int rrl_process_query(query_type* query)
{
	uint64_t source;
	uint32_t hash;
	/* we can use circular arithmetic here, so int32 works after 2038 */
	int32_t now = (int32_t)time(NULL);
	uint32_t limit = rrl_ratelimit;
	uint16_t flags;

	/* nothing to do when both limits are disabled */
	if(rrl_ratelimit == 0 && rrl_whitelist_ratelimit == 0)
		return 0;

	/* examine query; may lower limit to the whitelist ratelimit */
	examine_query(query, &hash, &source, &flags, &limit);
	if(limit == 0)
		return 0; /* no limit for this */

	/* update rate and compare against the applicable limit */
	return rrl_update(query, hash, source, flags, now, limit) >= limit
		? 1 : 0;
}
476
/** Decide the fate of a rate-limited query: roughly 1 in rrl_slip_ratio
 * packets (chosen randomly) is answered truncated (TC set, sections
 * emptied) so a genuine client can retry; the rest are dropped.
 * slip_ratio 0 drops everything, 1 truncates everything. */
query_state_type rrl_slip(query_type* query)
{
	/* discard number the packets, randomly */
	/* note: the if( spans the three preprocessor branches below, which
	 * differ only in the random number source available on this system */
#ifdef HAVE_ARC4RANDOM_UNIFORM
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random_uniform(rrl_slip_ratio)) == 0))) {
#elif HAVE_ARC4RANDOM
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((arc4random() % rrl_slip_ratio) == 0))) {
#else
	if((rrl_slip_ratio > 0) && ((rrl_slip_ratio == 1) || ((random() % rrl_slip_ratio) == 0))) {
#endif
		/* set TC on the rest */
		TC_SET(query->packet);
		/* strip all answer sections, leaving only the question */
		ANCOUNT_SET(query->packet, 0);
		NSCOUNT_SET(query->packet, 0);
		ARCOUNT_SET(query->packet, 0);
		if(query->qname)
			/* header, type, class, qname */
			buffer_set_position(query->packet,
				QHEADERSZ+4+query->qname->name_size);
		else buffer_set_position(query->packet, QHEADERSZ);
		return QUERY_PROCESSED;
	}
	return QUERY_DISCARDED;
}
501
502 #endif /* RATELIMIT */
503