1 /*  Copyright (C) 2021 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
2 
3     This program is free software: you can redistribute it and/or modify
4     it under the terms of the GNU General Public License as published by
5     the Free Software Foundation, either version 3 of the License, or
6     (at your option) any later version.
7 
8     This program is distributed in the hope that it will be useful,
9     but WITHOUT ANY WARRANTY; without even the implied warranty of
10     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
11     GNU General Public License for more details.
12 
13     You should have received a copy of the GNU General Public License
14     along with this program.  If not, see <https://www.gnu.org/licenses/>.
15  */
16 
#include <assert.h>
#include <stdint.h>
#include <time.h>

#include "knot/modules/rrl/functions.h"
#include "contrib/openbsd/strlcat.h"
#include "contrib/sockaddr.h"
#include "contrib/time.h"
#include "libdnssec/error.h"
#include "libdnssec/random.h"
26 
/* Hopscotch defines. */
#define HOP_LEN (sizeof(unsigned)*8) /* Neighborhood size = bit width of the hop bitmap. */
/* Limits (class, ipv6 remote, dname): 1 B class + 8 B netblock + max wire dname. */
#define RRL_CLSBLK_MAXLEN (1 + 8 + 255)
/* CIDR block prefix lengths for v4/v6 (bytes of the address copied into the key). */
#define RRL_V4_PREFIX_LEN 3 /* /24 */
#define RRL_V6_PREFIX_LEN 7 /* /56 */
/* Defaults */
#define RRL_SSTART 2 /* 1/Nth of the rate for slow start */
#define RRL_PSIZE_LARGE 1024 /* Responses >= this many bytes are classified CLS_LARGE. */
#define RRL_CAPACITY 4 /* Window size in seconds */
#define RRL_LOCK_GRANULARITY 32 /* Last digit granularity */
39 
/* Classification: response class, stored as the first byte of the bucket key. */
enum {
	CLS_NULL     = 0 << 0, /* Empty bucket. */
	CLS_NORMAL   = 1 << 0, /* Normal response. */
	CLS_ERROR    = 1 << 1, /* Error response. */
	CLS_NXDOMAIN = 1 << 2, /* NXDOMAIN (special case of error). */
	CLS_EMPTY    = 1 << 3, /* Empty response. */
	CLS_LARGE    = 1 << 4, /* Response size over threshold (RRL_PSIZE_LARGE bytes). */
	CLS_WILDCARD = 1 << 5, /* Wildcard query. */
	CLS_ANY      = 1 << 6, /* ANY query (spec. class). */
	CLS_DNSSEC   = 1 << 7  /* DNSSEC related RR query (spec. class) */
};
52 
/* Classification string: maps a CLS_* code to a log-friendly label. */
struct cls_name {
	int code;         /* CLS_* value. */
	const char *name; /* Human-readable label. */
};

/* Lookup table; terminated by an entry with a NULL name. */
static const struct cls_name rrl_cls_names[] = {
	{ CLS_NORMAL,   "POSITIVE" },
	{ CLS_ERROR,    "ERROR" },
	{ CLS_NXDOMAIN, "NXDOMAIN"},
	{ CLS_EMPTY,    "EMPTY"},
	{ CLS_LARGE,    "LARGE"},
	{ CLS_WILDCARD, "WILDCARD"},
	{ CLS_ANY,      "ANY"},
	{ CLS_DNSSEC,   "DNSSEC"},
	{ CLS_NULL,     "NULL"},
	{ CLS_NULL,     NULL}
};
71 
rrl_clsstr(int code)72 static inline const char *rrl_clsstr(int code)
73 {
74 	for (const struct cls_name *c = rrl_cls_names; c->name; c++) {
75 		if (c->code == code) {
76 			return c->name;
77 		}
78 	}
79 
80 	return "unknown class";
81 }
82 
/* Bucket flags (stored in rrl_item_t.flags). */
enum {
	RRL_BF_NULL   = 0 << 0, /* No flags. */
	RRL_BF_SSTART = 1 << 0, /* Bucket in slow-start after collision. */
	RRL_BF_ELIMIT = 1 << 1  /* Bucket is rate-limited. */
};
89 
/*!
 * \brief Classify a response into one of the CLS_* categories.
 *
 * Order of precedence: rcode, wildcard, special query types, large size,
 * empty answer section, and finally CLS_NORMAL.
 */
static uint8_t rrl_clsid(rrl_req_t *p)
{
	/* Non-NOERROR rcodes short-circuit to an error class. */
	switch (knot_wire_get_rcode(p->wire)) {
	case KNOT_RCODE_NOERROR:
		break;
	case KNOT_RCODE_NXDOMAIN:
		return CLS_NXDOMAIN;
	default:
		return CLS_ERROR;
	}

	/* Positive answer synthesized from a wildcard. */
	if (p->flags & RRL_REQ_WILDCARD) {
		return CLS_WILDCARD;
	}

	/* Special query types get their own class. */
	if (p->query != NULL) {
		const uint16_t qtype = knot_pkt_qtype(p->query);
		if (qtype == KNOT_RRTYPE_ANY) {
			return CLS_ANY;      /* ANY spec. class */
		}
		if (qtype == KNOT_RRTYPE_DNSKEY ||
		    qtype == KNOT_RRTYPE_RRSIG ||
		    qtype == KNOT_RRTYPE_DS) {
			return CLS_DNSSEC;   /* DNSSEC-related RR class. */
		}
	}

	/* Oversized response. */
	if (p->len >= RRL_PSIZE_LARGE) {
		return CLS_LARGE;
	}

	/* Positive but empty answer section. */
	if (knot_wire_get_ancount(p->wire) == 0) {
		return CLS_EMPTY;
	}

	return CLS_NORMAL;
}
133 
/*!
 * \brief Write the name component of the classification key into dst.
 *
 * Error, NXDOMAIN and wildcard classes key on the supplied (zone) name so a
 * whole zone shares buckets; other classes key on the query QNAME when a
 * query packet is available.
 *
 * \return Length of the written wire-format name, or negative error code.
 */
static int rrl_clsname(uint8_t *dst, size_t maxlen, uint8_t cls, rrl_req_t *req,
                       const knot_dname_t *name)
{
	/* Root-name fallback for errors etc. */
	if (name == NULL) {
		name = (const knot_dname_t *)"\x00";
	}

	const bool keep_zone_name = (cls == CLS_ERROR ||
	                             cls == CLS_NXDOMAIN ||
	                             cls == CLS_WILDCARD);
	if (!keep_zone_name && req->query != NULL) {
		/* Use QNAME */
		name = knot_pkt_qname(req->query);
	}

	/* Write to wire */
	return knot_dname_to_wire(dst, name, maxlen);
}
158 
/*!
 * \brief Build the classification block (bucket hash key) for a request.
 *
 * Key layout in dst: 1-byte class, 8-byte network block, then the
 * wire-format name chosen by rrl_clsname() (see RRL_CLSBLK_MAXLEN).
 *
 * \param dst     Output buffer (at least RRL_CLSBLK_MAXLEN bytes).
 * \param maxlen  Capacity of dst.
 * \param remote  Remote address (IPv4 or IPv6).
 * \param req     Request/response descriptor.
 * \param name    Zone name (may be NULL).
 *
 * \return Total key length, or a negative error code from rrl_clsname().
 */
static int rrl_classify(uint8_t *dst, size_t maxlen, const struct sockaddr_storage *remote,
                        rrl_req_t *req, const knot_dname_t *name)
{
	/* Class */
	uint8_t cls = rrl_clsid(req);
	*dst = cls;
	int blklen = sizeof(cls);

	/* Address (in network byteorder, adjust masks).
	 * Only the CIDR prefix is copied (3 bytes for v4 /24, 7 bytes for v6 /56),
	 * so all clients in one subnet share a bucket; the tail of netblk is 0. */
	uint64_t netblk = 0;
	if (remote->ss_family == AF_INET6) {
		struct sockaddr_in6 *ipv6 = (struct sockaddr_in6 *)remote;
		memcpy(&netblk, &ipv6->sin6_addr, RRL_V6_PREFIX_LEN);
	} else {
		struct sockaddr_in *ipv4 = (struct sockaddr_in *)remote;
		memcpy(&netblk, &ipv4->sin_addr, RRL_V4_PREFIX_LEN);
	}
	memcpy(dst + blklen, &netblk, sizeof(netblk));
	blklen += sizeof(netblk);

	/* Name */
	int ret = rrl_clsname(dst + blklen, maxlen - blklen, cls, req, name);
	if (ret < 0) {
		return ret;
	}
	uint8_t len = ret;
	blklen += len;

	return blklen;
}
189 
/*! \brief A bucket is reusable if unoccupied, or if its last visit is stale. */
static int bucket_free(rrl_item_t *bucket, uint32_t now)
{
	if (bucket->cls == CLS_NULL) {
		return 1;
	}
	return bucket->time + 1 < now;
}
194 
/*! \brief Compare two buckets by their identifying key (class, netblock, qname hash). */
static int bucket_match(rrl_item_t *bucket, rrl_item_t *match)
{
	if (bucket->cls != match->cls) {
		return 0;
	}
	if (bucket->netblk != match->netblk) {
		return 0;
	}
	return bucket->qname == match->qname;
}
201 
/*!
 * \brief Find the nearest reusable bucket starting at [id], wrapping around.
 *
 * \return Distance from [id] to the free bucket, or id itself when the table
 *         is completely full (forces vacating the home element).
 */
static int find_free(rrl_table_t *tbl, unsigned id, uint32_t now)
{
	/* Forward scan from the home position to the table end. */
	for (unsigned i = id; i < tbl->size; i++) {
		if (bucket_free(&tbl->arr[i], now)) {
			return i - id;
		}
	}
	/* Wrap around and scan the prefix before the home position. */
	for (unsigned i = 0; i < id; i++) {
		if (bucket_free(&tbl->arr[i], now)) {
			return i + (tbl->size - id);
		}
	}

	/* this happens if table is full... force vacate current elm */
	return id;
}
218 
/*!
 * \brief Search the hopscotch neighborhood of [id] for a bucket equal to *m.
 *
 * The home bucket's hop bitmap marks which offsets in <id, id + HOP_LEN)
 * currently hold items that hashed to [id]; only those offsets are probed.
 *
 * \return Offset of the matching bucket, or HOP_LEN + 1 when there is none.
 */
static inline unsigned find_match(rrl_table_t *tbl, uint32_t id, rrl_item_t *m)
{
	unsigned new_id = 0;
	unsigned hop = 0;
	unsigned match_bitmap = tbl->arr[id].hop;
	while (match_bitmap != 0) {
		hop = __builtin_ctz(match_bitmap); /* offset of next potential match */
		new_id = (id + hop) % tbl->size;
		if (bucket_match(&tbl->arr[new_id], m)) {
			return hop;
		} else {
			/* NOTE(review): if hop can reach 31, '1 << hop' shifts into the
			 * sign bit of int ('1u << hop' would be safe) - confirm HOP_LEN. */
			match_bitmap &= ~(1 << hop); /* clear potential match */
		}
	}

	return HOP_LEN + 1;
}
236 
/*!
 * \brief Hopscotch displacement: move the free slot closer to the home bucket.
 *
 * Scans candidate home buckets at offsets up to HOP_LEN - 1 before *free_id
 * and relocates the nearest item that may legally hop into the free slot,
 * thereby shrinking the distance between [id] and the free slot.
 *
 * \param[in,out] free_id  Current free slot; updated to the vacated slot.
 * \return New distance of the free slot from [id] (0 forces vacating [id]).
 */
static inline unsigned reduce_dist(rrl_table_t *tbl, unsigned id, unsigned dist, unsigned *free_id)
{
	unsigned rd = HOP_LEN - 1;
	while (rd > 0) {
		unsigned vacate_id = (tbl->size + *free_id - rd) % tbl->size; /* bucket to be vacated */
		if (tbl->arr[vacate_id].hop != 0) {
			unsigned hop = __builtin_ctz(tbl->arr[vacate_id].hop);  /* offset of first valid bucket */
			if (hop < rd) { /* only offsets in <vacate_id, free_id> are interesting */
				unsigned new_id = (vacate_id + hop) % tbl->size; /* this item will be displaced to [free_id] */
				unsigned keep_hop = tbl->arr[*free_id].hop; /* preserve target's own hop bitmap */
				memcpy(tbl->arr + *free_id, tbl->arr + new_id, sizeof(rrl_item_t));
				tbl->arr[*free_id].hop = keep_hop;
				tbl->arr[new_id].cls = CLS_NULL; /* mark the vacated slot empty */
				/* Update the home bucket's bitmap: item moved from offset hop to rd. */
				tbl->arr[vacate_id].hop &= ~(1 << hop);
				tbl->arr[vacate_id].hop |= 1 << rd;
				*free_id = new_id;
				return dist - (rd - hop);
			}
		}
		--rd;
	}

	assert(rd == 0); /* this happens with p=1/fact(HOP_LEN) */
	*free_id = id;
	dist = 0; /* force vacate initial element */
	return dist;
}
264 
/*!
 * \brief Format the remote address with its rate-limiting prefix appended.
 *
 * On conversion failure dst is set to the empty string.
 */
static void subnet_tostr(char *dst, size_t maxlen, const struct sockaddr_storage *ss)
{
	const void *addr = NULL;
	const char *suffix = NULL;

	switch (ss->ss_family) {
	case AF_INET6:
		addr = &((struct sockaddr_in6 *)ss)->sin6_addr;
		suffix = "/56";
		break;
	default:
		addr = &((struct sockaddr_in *)ss)->sin_addr;
		suffix = "/24";
		break;
	}

	if (inet_ntop(ss->ss_family, addr, dst, maxlen) == NULL) {
		dst[0] = '\0';
	} else {
		strlcat(dst, suffix, maxlen);
	}
}
284 
/*!
 * \brief Log a bucket entering or leaving the rate-limited state.
 *
 * No-op when mod or ss is NULL.
 */
static void rrl_log_state(knotd_mod_t *mod, const struct sockaddr_storage *ss,
                          uint16_t flags, uint8_t cls, const knot_dname_t *qname)
{
	if (mod == NULL || ss == NULL) {
		return;
	}

	char addr_str[SOCKADDR_STRLEN];
	subnet_tostr(addr_str, sizeof(addr_str), ss);

	const char *what = (flags & RRL_BF_ELIMIT) ? "enters" : "leaves";

	knot_dname_txt_storage_t buf;
	char *qname_str = knot_dname_to_str(buf, qname, sizeof(buf));
	if (qname_str == NULL) {
		qname_str = "?";
	}

	knotd_mod_log(mod, LOG_NOTICE, "address/subnet %s, class %s, qname %s, %s limiting",
	              addr_str, rrl_clsstr(cls), qname_str, what);
}
309 
/*! \brief Acquire the granular bucket lock with the given index. */
static void rrl_lock(rrl_table_t *tbl, int lk_id)
{
	assert(lk_id > -1);
	pthread_mutex_lock(&tbl->lk[lk_id]);
}
315 
/*! \brief Release the granular bucket lock with the given index. */
static void rrl_unlock(rrl_table_t *tbl, int lk_id)
{
	assert(lk_id > -1);
	pthread_mutex_unlock(&tbl->lk[lk_id]);
}
321 
/*!
 * \brief Initialize the global lookup lock and the granular bucket locks.
 *
 * \param tbl          Table with no locks set up yet.
 * \param granularity  Number of granular locks to create.
 *
 * \return KNOT_EOK, KNOT_ENOMEM, or KNOT_ERROR on partial mutex init.
 *
 * Fix: pthread_mutex_init() returns 0 on success or a positive error number,
 * never a negative value, so the former '< 0' checks could not detect failure.
 * Also: reset tbl->lk to NULL after freeing it (avoid a dangling pointer) and
 * destroy the lookup mutex on every error path (avoid a resource leak).
 */
static int rrl_setlocks(rrl_table_t *tbl, uint32_t granularity)
{
	assert(!tbl->lk); /* Cannot change while locks are used. */
	assert(granularity <= tbl->size / 10); /* Due to int. division err. */

	if (pthread_mutex_init(&tbl->ll, NULL) != 0) {
		return KNOT_ENOMEM;
	}

	/* Alloc new locks, zero-initialized. */
	tbl->lk = calloc(granularity, sizeof(pthread_mutex_t));
	if (!tbl->lk) {
		pthread_mutex_destroy(&tbl->ll);
		return KNOT_ENOMEM;
	}

	/* Initialize. */
	for (size_t i = 0; i < granularity; ++i) {
		if (pthread_mutex_init(tbl->lk + i, NULL) != 0) {
			break;
		}
		++tbl->lk_count;
	}

	/* Incomplete initialization: roll back everything acquired so far. */
	if (tbl->lk_count != granularity) {
		for (size_t i = 0; i < tbl->lk_count; ++i) {
			pthread_mutex_destroy(tbl->lk + i);
		}
		free(tbl->lk);
		tbl->lk = NULL;
		tbl->lk_count = 0;
		pthread_mutex_destroy(&tbl->ll);
		return KNOT_ERROR;
	}

	return KNOT_EOK;
}
358 
rrl_create(size_t size,uint32_t rate)359 rrl_table_t *rrl_create(size_t size, uint32_t rate)
360 {
361 	if (size == 0) {
362 		return NULL;
363 	}
364 
365 	const size_t tbl_len = sizeof(rrl_table_t) + size * sizeof(rrl_item_t);
366 	rrl_table_t *tbl = calloc(1, tbl_len);
367 	if (!tbl) {
368 		return NULL;
369 	}
370 	tbl->size = size;
371 	tbl->rate = rate;
372 
373 	if (dnssec_random_buffer((uint8_t *)&tbl->key, sizeof(tbl->key)) != DNSSEC_EOK) {
374 		free(tbl);
375 		return NULL;
376 	}
377 
378 	if (rrl_setlocks(tbl, RRL_LOCK_GRANULARITY) != KNOT_EOK) {
379 		free(tbl);
380 		return NULL;
381 	}
382 
383 	return tbl;
384 }
385 
buf_qname(uint8_t * buf)386 static knot_dname_t *buf_qname(uint8_t *buf)
387 {
388 	return buf + sizeof(uint8_t) + sizeof(uint64_t);
389 }
390 
/*!
 * \brief Get bucket for current combination of parameters.
 *
 * Builds the classification key, hashes it to a home position and performs a
 * hopscotch lookup/insert. On success the bucket's granular lock is held and
 * its index is stored in *lock; the caller must release it via rrl_unlock().
 *
 * \param stamp    Current time (seconds), stored in a freshly claimed bucket.
 * \param[out] lock  Index of the granular lock taken (valid on success).
 * \param buf      Scratch buffer for the key (>= RRL_CLSBLK_MAXLEN bytes).
 *
 * \return Bucket for this flow, or NULL when classification failed.
 */
static rrl_item_t *rrl_hash(rrl_table_t *tbl, const struct sockaddr_storage *remote,
                            rrl_req_t *req, const knot_dname_t *zone, uint32_t stamp,
                            int *lock, uint8_t *buf, size_t buf_len)
{
	int len = rrl_classify(buf, buf_len, remote, req, zone);
	if (len < 0) {
		return NULL;
	}

	/* Home position of the key. */
	uint32_t id = SipHash24(&tbl->key, buf, len) % tbl->size;

	/* Lock for lookup. */
	pthread_mutex_lock(&tbl->ll);

	/* Find an exact match in <id, id + HOP_LEN). */
	knot_dname_t *qname = buf_qname(buf);
	uint64_t netblk;
	memcpy(&netblk, buf + sizeof(uint8_t), sizeof(netblk));
	/* Template of the bucket being searched for / inserted. */
	rrl_item_t match = {
		.hop = 0,
		.netblk = netblk,
		.ntok = tbl->rate * RRL_CAPACITY,
		.cls = buf[0],
		.flags = RRL_BF_NULL,
		.qname = SipHash24(&tbl->key, qname, knot_dname_size(qname)),
		.time = stamp
	};

	unsigned dist = find_match(tbl, id, &match);
	if (dist > HOP_LEN) { /* not an exact match, find free element [f] */
		dist = find_free(tbl, id, stamp);
	}

	/* Reduce distance to fit <id, id + HOP_LEN) via hopscotch displacement. */
	unsigned free_id = (id + dist) % tbl->size;
	while (dist >= HOP_LEN) {
		dist = reduce_dist(tbl, id, dist, &free_id);
	}

	/* Assign granular lock and unlock lookup. */
	*lock = free_id % tbl->lk_count;
	rrl_lock(tbl, *lock);
	pthread_mutex_unlock(&tbl->ll);

	/* found free bucket which is in <id, id + HOP_LEN) */
	tbl->arr[id].hop |= (1 << dist);
	rrl_item_t *bucket = &tbl->arr[free_id];
	assert(free_id == (id + dist) % tbl->size);

	/* Inspect bucket state: claim it if empty, preserving the hop bitmap. */
	unsigned hop = bucket->hop;
	if (bucket->cls == CLS_NULL) {
		memcpy(bucket, &match, sizeof(rrl_item_t));
		bucket->hop = hop;
	}
	/* Check for collisions: a different flow landed here - take the bucket
	 * over and enter slow-start with a reduced initial token count. */
	if (!bucket_match(bucket, &match)) {
		if (!(bucket->flags & RRL_BF_SSTART)) {
			memcpy(bucket, &match, sizeof(rrl_item_t));
			bucket->hop = hop;
			bucket->ntok = tbl->rate + tbl->rate / RRL_SSTART;
			bucket->flags |= RRL_BF_SSTART;
		}
	}

	return bucket;
}
459 
/*!
 * \brief Account one query against its rate-limiting bucket (token bucket).
 *
 * \param rrl     RRL table.
 * \param remote  Remote address of the client.
 * \param req     Request/response descriptor.
 * \param zone    Zone name (may be NULL).
 * \param mod     Module context for logging (may be NULL to suppress logs).
 *
 * \return KNOT_EOK if allowed, KNOT_ELIMIT if rate-limited, KNOT_EINVAL or
 *         KNOT_ERROR on failure.
 */
int rrl_query(rrl_table_t *rrl, const struct sockaddr_storage *remote,
              rrl_req_t *req, const knot_dname_t *zone, knotd_mod_t *mod)
{
	if (!rrl || !req || !remote) {
		return KNOT_EINVAL;
	}

	uint8_t buf[RRL_CLSBLK_MAXLEN];

	/* Calculate hash and fetch; on success the bucket's granular lock is held. */
	int ret = KNOT_EOK;
	int lock = -1;
	uint32_t now = time_now().tv_sec;
	rrl_item_t *bucket = rrl_hash(rrl, remote, req, zone, now, &lock, buf, sizeof(buf));
	if (!bucket) {
		if (lock > -1) {
			rrl_unlock(rrl, lock);
		}
		return KNOT_ERROR;
	}

	/* Calculate rate for dT, capped at the window size. */
	uint32_t dt = now - bucket->time;
	if (dt > RRL_CAPACITY) {
		dt = RRL_CAPACITY;
	}
	/* Visit bucket. */
	bucket->time = now;
	if (dt > 0) { /* Window moved. */

		/* Check state change: stop limiting if tokens remain or >1s elapsed. */
		if ((bucket->ntok > 0 || dt > 1) && (bucket->flags & RRL_BF_ELIMIT)) {
			bucket->flags &= ~RRL_BF_ELIMIT;
			rrl_log_state(mod, remote, bucket->flags, bucket->cls,
			              knot_pkt_qname(req->query));
		}

		/* Add new tokens (rate per elapsed second), clamped to capacity. */
		uint32_t dn = rrl->rate * dt;
		if (bucket->flags & RRL_BF_SSTART) { /* Bucket in slow-start. */
			bucket->flags &= ~RRL_BF_SSTART;
		}
		bucket->ntok += dn;
		if (bucket->ntok > RRL_CAPACITY * rrl->rate) {
			bucket->ntok = RRL_CAPACITY * rrl->rate;
		}
	}

	/* Last item taken - bucket starts limiting. */
	if (bucket->ntok == 1 && !(bucket->flags & RRL_BF_ELIMIT)) {
		bucket->flags |= RRL_BF_ELIMIT;
		rrl_log_state(mod, remote, bucket->flags, bucket->cls,
		              knot_pkt_qname(req->query));
	}

	/* Decay current bucket. */
	if (bucket->ntok > 0) {
		--bucket->ntok;
	} else if (bucket->ntok == 0) {
		ret = KNOT_ELIMIT;
	}

	if (lock > -1) {
		rrl_unlock(rrl, lock);
	}
	return ret;
}
527 
/*!
 * \brief Decide whether to SLIP (answer truncated) this query.
 *
 * 0 never slips, 1 always slips, N > 1 slips with probability 1/N.
 */
bool rrl_slip_roll(int n_slip)
{
	if (n_slip == 0) {
		return false;
	}
	if (n_slip == 1) {
		return true;
	}
	return dnssec_random_uint16_t() % n_slip == 0;
}
539 
/*! \brief Destroy all locks and free the table. Safe to call with NULL. */
void rrl_destroy(rrl_table_t *rrl)
{
	if (rrl == NULL) {
		return;
	}

	/* The lookup lock was only initialized if granular locks exist. */
	if (rrl->lk_count > 0) {
		pthread_mutex_destroy(&rrl->ll);
	}
	for (size_t i = 0; i < rrl->lk_count; ++i) {
		pthread_mutex_destroy(&rrl->lk[i]);
	}
	free(rrl->lk);
	free(rrl);
}
554