/* Copyright (C) 2014-2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
 * SPDX-License-Identifier: GPL-3.0-or-later
 */

#include <errno.h>
#include <limits.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>

#include <libknot/descriptor.h>
#include <libknot/dname.h>
#include <libknot/errcode.h>
#include <libknot/rrtype/rrsig.h>

#include <uv.h>

#include "contrib/base32hex.h"
#include "contrib/cleanup.h"
#include "contrib/ucw/lib.h"
#include "lib/cache/api.h"
#include "lib/cache/cdb_lmdb.h"
#include "lib/defines.h"
#include "lib/dnssec/nsec3.h"
#include "lib/generic/trie.h"
#include "lib/resolve.h"
#include "lib/rplan.h"
#include "lib/utils.h"

#include "lib/cache/impl.h"

/* TODO:
 * - Reconsider when RRSIGs are put in and retrieved from the cache.
 *   Currently it's always done, which _might_ be spurious, depending
 *   on how kresd will use the returned result.
 *   There's also the "problem" that kresd ATM does _not_ ask upstream
 *   with DO bit in some cases.
 */


/** Cache version */
static const uint16_t CACHE_VERSION = 6;
/** Key size */
#define KEY_HSIZE (sizeof(uint8_t) + sizeof(uint16_t))
#define KEY_SIZE (KEY_HSIZE + KNOT_DNAME_MAXLEN)


/** @internal Forward declarations of the implementation details
 * \param[out] needs_pkt optionally set *needs_pkt = true;
 *	We do that when some RRset wasn't stashed to aggressive cache,
 *	even though it might have taken part in a successful DNSSEC proof:
 *	1. any opt-out NSEC3, as they typically aren't much use aggressively anyway
 *	2. some kinds of minimal NSEC* ranges, as they'd seem more trouble than they're worth:
 *	   - extremely short range of covered names limits the benefits severely
 *	   - the type-set is often a lie, either a working lie, e.g. CloudFlare's
 *	     black lies, or even a non-working lie, e.g. DVE-2018-0003
 *	3. some kinds of "weird" RRsets, to get at least some caching on them
 */
static ssize_t stash_rrset(struct kr_cache *cache, const struct kr_query *qry,
		const knot_rrset_t *rr, const knot_rrset_t *rr_sigs, uint32_t timestamp,
		uint8_t rank, trie_t *nsec_pmap, knot_mm_t *pool, bool *needs_pkt);
/** Preliminary checks before stash_rrset(). Don't call if returns <= 0. */
static int stash_rrset_precond(const knot_rrset_t *rr, const struct kr_query *qry/*logs*/);

/** @internal Ensure the cache version is right, possibly by clearing it. */
static int assert_right_version(struct kr_cache *cache)
{
	/* Check cache ABI version. */
	/* CACHE_KEY_DEF: to avoid collisions with kr_cache_match(). */
	uint8_t key_str[4] = "VERS";
	knot_db_val_t key = { .data = key_str, .len = sizeof(key_str) };
	knot_db_val_t val = { NULL, 0 };
	int ret = cache_op(cache, read, &key, &val, 1);
	if (ret == 0 && val.len == sizeof(CACHE_VERSION)
	    && memcmp(val.data, &CACHE_VERSION, sizeof(CACHE_VERSION)) == 0) {
		ret = kr_ok();
	} else {
		int oldret = ret;
		/* Version doesn't match or we were unable to read it, possibly because DB is empty.
		 * Recreate cache and write version key. */
		ret = cache_op(cache, count);
		if (ret != 0) { /* Log for non-empty cache to limit noise on fresh start. */
			kr_log_info(CACHE, "incompatible cache database detected, purging\n");
			if (oldret) {
				kr_log_debug(CACHE, "reading version returned: %d\n", oldret);
			} else if (val.len != sizeof(CACHE_VERSION)) {
				kr_log_debug(CACHE, "version has bad length: %d\n", (int)val.len);
			} else {
				uint16_t ver;
				memcpy(&ver, val.data, sizeof(ver));
				kr_log_debug(CACHE, "version has bad value: %d instead of %d\n",
					(int)ver, (int)CACHE_VERSION);
			}
		}
		ret = cache_op(cache, clear);
	}
	/* Rewrite the entry even if it isn't needed. Since the cache size may have
	 * changed, it's good to always perform some write when opening the cache. */
	if (ret == 0) {
		/* Key/Val is invalidated by cache purge, recreate it */
		val.data = /*const-cast*/(void *)&CACHE_VERSION;
		val.len = sizeof(CACHE_VERSION);
		ret = cache_op(cache, write, &key, &val, 1);
	}
	kr_cache_commit(cache);
	return ret;
}

int kr_cache_open(struct kr_cache *cache, const struct kr_cdb_api *api, struct kr_cdb_opts *opts, knot_mm_t *mm)
{
	if (kr_fails_assert(cache))
		return kr_error(EINVAL);
	memset(cache, 0, sizeof(*cache));
	/* Open cache */
	if (!api)
		api = kr_cdb_lmdb();
	cache->api = api;
	int ret = cache->api->open(&cache->db, &cache->stats, opts, mm);
	if (ret == 0) {
		ret = assert_right_version(cache);
		// The included write also committed maxsize increase to the file.
	}
	if (ret == 0 && opts->maxsize) {
		/* If some maxsize is requested and it's smaller than in-file maxsize,
		 * LMDB only restricts our env without changing the in-file maxsize.
		 * That is worked around by reopening (found no other reliable way). */
		cache->api->close(cache->db, &cache->stats);
		struct kr_cdb_opts opts2;
		memcpy(&opts2, opts, sizeof(opts2));
		opts2.maxsize = 0;
		ret = cache->api->open(&cache->db, &cache->stats, &opts2, mm);
	}

	char *fpath = kr_absolutize_path(opts->path, "data.mdb");
	if (kr_fails_assert(fpath)) {
		/* non-critical, but still */
		fpath = "<ENOMEM>";
	} else {
		kr_cache_emergency_file_to_remove = fpath;
	}

	if (ret == 0 && opts->maxsize) {
		size_t maxsize = cache->api->get_maxsize(cache->db);
		if (maxsize > opts->maxsize) kr_log_warning(CACHE,
			"Warning: real cache size is %zu instead of the requested %zu bytes."
			" To reduce the size you need to remove the file '%s' by hand.\n",
			maxsize, opts->maxsize, fpath);
	}
	if (ret != 0)
		return ret;
	cache->ttl_min = KR_CACHE_DEFAULT_TTL_MIN;
	cache->ttl_max = KR_CACHE_DEFAULT_TTL_MAX;
	kr_cache_make_checkpoint(cache);
	return 0;
}
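
#if 0	/* Illustrative sketch only (not compiled): a typical open/close sequence.
	 * It assumes the kr_cdb_opts layout with .path and .maxsize members; passing
	 * api == NULL selects the default LMDB backend (kr_cdb_lmdb()), as above. */
	struct kr_cache cache;
	struct kr_cdb_opts opts = {
		.path = "/var/cache/knot-resolver",
		.maxsize = 100 * 1024 * 1024, /* 100 MiB */
	};
	if (kr_cache_open(&cache, NULL, &opts, NULL) == 0) {
		/* ... use the cache ... */
		kr_cache_close(&cache);
	}
#endif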

const char *kr_cache_emergency_file_to_remove = NULL;


#define cache_isvalid(cache) ((cache) && (cache)->api && (cache)->db)

void kr_cache_close(struct kr_cache *cache)
{
	kr_cache_check_health(cache, -1);
	if (cache_isvalid(cache)) {
		cache_op(cache, close);
		cache->db = NULL;
	}
	free(/*const-cast*/(char*)kr_cache_emergency_file_to_remove);
	kr_cache_emergency_file_to_remove = NULL;
}

int kr_cache_commit(struct kr_cache *cache)
{
	if (!cache_isvalid(cache)) {
		return kr_error(EINVAL);
	}
	if (cache->api->commit) {
		return cache_op(cache, commit);
	}
	return kr_ok();
}

int kr_cache_clear(struct kr_cache *cache)
{
	if (!cache_isvalid(cache)) {
		return kr_error(EINVAL);
	}
	int ret = cache_op(cache, clear);
	if (ret == 0) {
		kr_cache_make_checkpoint(cache);
		ret = assert_right_version(cache);
	}
	return ret;
}

/* When going stricter, BEWARE of breaking entry_h_consistent_NSEC() */
struct entry_h * entry_h_consistent_E(knot_db_val_t data, uint16_t type)
{
	(void) type; /* unused, for now */
	if (!data.data) return NULL;
	/* Length checks. */
	if (data.len < offsetof(struct entry_h, data))
		return NULL;
	const struct entry_h *eh = data.data;
	if (eh->is_packet) {
		uint16_t pkt_len;
		if (data.len < offsetof(struct entry_h, data) + sizeof(pkt_len)) {
			return NULL;
		}
		memcpy(&pkt_len, eh->data, sizeof(pkt_len));
		if (data.len < offsetof(struct entry_h, data) + sizeof(pkt_len)
				+ pkt_len) {
			return NULL;
		}
	}

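	/* Flag sanity: bogus data and the opt-out flag are only ever stored
	 * as whole packets, which the checks below enforce. */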
	bool ok = true;
	ok = ok && kr_rank_check(eh->rank);
	ok = ok && (!kr_rank_test(eh->rank, KR_RANK_BOGUS)
		    || eh->is_packet);
	ok = ok && (eh->is_packet || !eh->has_optout);

	return ok ? /*const-cast*/(struct entry_h *)eh : NULL;
}

int32_t get_new_ttl(const struct entry_h *entry, const struct kr_query *qry,
		    const knot_dname_t *owner, uint16_t type, uint32_t now)
{
	int32_t diff = now - entry->time;
	if (diff < 0) {
		/* We may have obtained the record *after* the request started. */
		diff = 0;
	}
	int32_t res = entry->ttl - diff;
	if (res < 0 && owner && qry && qry->stale_cb) {
		/* Stale-serving decision, delegated to a callback. */
		int res_stale = qry->stale_cb(res, owner, type, qry);
		if (res_stale >= 0)
			return res_stale;
	}
	return res;
}
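
/* Worked example for get_new_ttl(): an entry stored at time == 1000 with ttl == 300
 * still has 100 s left at now == 1200; at now == 1400 the plain result would be -100,
 * and the optional stale_cb then decides whether the stale record may still be used. */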

int32_t kr_cache_ttl(const struct kr_cache_p *peek, const struct kr_query *qry,
		     const knot_dname_t *name, uint16_t type)
{
	const struct entry_h *eh = peek->raw_data;
	return get_new_ttl(eh, qry, name, type, qry->timestamp.tv_sec);
}

/** Check that no label contains a zero character, incl. a log trace.
 *
 * We refuse to work with those, as LF and our cache keys might become ambiguous.
 * Assuming uncompressed name, as usual.
 * CACHE_KEY_DEF
 */
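/* Note: the check works because a wire-format name is terminated by the root label
 * (a zero byte), so any extra zero byte inside a label makes strlen() come out
 * shorter than knot_dname_size() - 1. */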
static bool check_dname_for_lf(const knot_dname_t *n, const struct kr_query *qry/*logging*/)
{
	const bool ret = knot_dname_size(n) == strlen((const char *)n) + 1;
	if (!ret && kr_log_is_debug_qry(CACHE, qry)) {
		auto_free char *n_str = kr_dname_text(n);
		VERBOSE_MSG(qry, "=> skipping zero-containing name %s\n", n_str);
	}
	return ret;
}

/** Return false on types to be ignored. Meant both for sname and direct cache requests. */
static bool check_rrtype(uint16_t type, const struct kr_query *qry/*logging*/)
{
	const bool ret = !knot_rrtype_is_metatype(type)
			&& type != KNOT_RRTYPE_RRSIG;
	if (!ret && kr_log_is_debug_qry(CACHE, qry)) {
		auto_free char *type_str = kr_rrtype_text(type);
		VERBOSE_MSG(qry, "=> skipping RR type %s\n", type_str);
	}
	return ret;
}

/** Like key_exact_type() but omits a couple checks not holding for pkt cache. */
knot_db_val_t key_exact_type_maypkt(struct key *k, uint16_t type)
{
	if (kr_fails_assert(check_rrtype(type, NULL)))
		return (knot_db_val_t){ NULL, 0 };
	switch (type) {
	case KNOT_RRTYPE_RRSIG: /* no RRSIG query caching, at least for now */
		kr_assert(false);
		return (knot_db_val_t){ NULL, 0 };
	/* xNAME lumped into NS. */
	case KNOT_RRTYPE_CNAME:
	case KNOT_RRTYPE_DNAME:
		type = KNOT_RRTYPE_NS;
	default:
		break;
	}

	int name_len = k->buf[0];
	k->buf[name_len + 1] = 0; /* make sure different names can never match */
	k->buf[name_len + 2] = 'E'; /* tag for exact name+type matches */
	memcpy(k->buf + name_len + 3, &type, 2);
	k->type = type;
	/* CACHE_KEY_DEF: key == dname_lf + '\0' + 'E' + RRTYPE */
	return (knot_db_val_t){ k->buf + 1, name_len + 4 };
}
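
/* Illustrative note on the key layout above: the key starts with the name in its
 * lookup format (labels in reversed order, as prepared in k->buf by kr_dname_lf),
 * followed by a '\0' separator, the tag byte 'E', and the two bytes of the RRTYPE
 * copied in host byte order. */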


/** The inside for cache_peek(); implementation separated to ./peek.c */
int peek_nosync(kr_layer_t *ctx, knot_pkt_t *pkt);
/** function for .produce phase */
int cache_peek(kr_layer_t *ctx, knot_pkt_t *pkt)
{
	struct kr_request *req = ctx->req;
	struct kr_query *qry = req->current_query;
	/* We first check various exit-conditions and then call the _real function. */

	if (!kr_cache_is_open(&req->ctx->cache)
	    || ctx->state & (KR_STATE_FAIL|KR_STATE_DONE) || qry->flags.NO_CACHE
	    || (qry->flags.CACHE_TRIED && !qry->stale_cb)
	    || !check_rrtype(qry->stype, qry) /* LATER: some other behavior for some of these? */
	    || qry->sclass != KNOT_CLASS_IN) {
		return ctx->state; /* Already resolved/failed or already tried, etc. */
	}
	/* ATM cache only peeks for qry->sname and that would be useless
	 * to repeat on every iteration, so disable it from now on.
	 * LATER(optim.): assist with more precise QNAME minimization. */
	qry->flags.CACHE_TRIED = true;

	if (qry->stype == KNOT_RRTYPE_NSEC) {
		VERBOSE_MSG(qry, "=> skipping stype NSEC\n");
		return ctx->state;
	}
	if (!check_dname_for_lf(qry->sname, qry)) {
		return ctx->state;
	}

	int ret = peek_nosync(ctx, pkt);
	kr_cache_commit(&req->ctx->cache);
	return ret;
}



/** It's simply the loop body, taken out to decrease indentation. \return error code. */
static int stash_rrarray_entry(ranked_rr_array_t *arr, int arr_i,
		const struct kr_query *qry, struct kr_cache *cache,
		int *unauth_cnt, trie_t *nsec_pmap, bool *needs_pkt);
/** Stash a single nsec_p. \return 0 (errors are ignored). */
static int stash_nsec_p(const knot_dname_t *dname, const char *nsec_p_v,
		struct kr_cache *cache, uint32_t timestamp, knot_mm_t *pool,
		const struct kr_query *qry/*logging*/);

/** The whole .consume phase for the cache module. */
int cache_stash(kr_layer_t *ctx, knot_pkt_t *pkt)
{
	struct kr_request *req = ctx->req;
	struct kr_query *qry = req->current_query;
	struct kr_cache *cache = &req->ctx->cache;

	/* Note: we cache even in KR_STATE_FAIL. For example,
	 * BOGUS answer can go to +cd cache even without +cd request. */
	if (!kr_cache_is_open(cache) || !qry
	    || qry->flags.CACHED || !check_rrtype(knot_pkt_qtype(pkt), qry)
	    || qry->sclass != KNOT_CLASS_IN) {
		return ctx->state;
	}
	/* Do not cache truncated answers, at least for now. LATER */
	if (knot_wire_get_tc(pkt->wire)) {
		return ctx->state;
	}
	int unauth_cnt = 0;
	bool needs_pkt = false;
	if (qry->flags.STUB) {
		needs_pkt = true;
		goto stash_packet;
	}

	/* Stash individual records. */
	ranked_rr_array_t *selected[] = kr_request_selected(req);
	trie_t *nsec_pmap = trie_create(&req->pool);
	if (kr_fails_assert(nsec_pmap))
		goto finally;
	for (int psec = KNOT_ANSWER; psec <= KNOT_ADDITIONAL; ++psec) {
		ranked_rr_array_t *arr = selected[psec];
		/* uncached entries are located at the end */
		for (ssize_t i = arr->len - 1; i >= 0; --i) {
			ranked_rr_array_entry_t *entry = arr->at[i];
			if (entry->qry_uid != qry->uid || entry->dont_cache) {
				continue;
				/* TODO: probably safe to break on uid mismatch but maybe not worth it */
			}
			int ret = stash_rrarray_entry(
				arr, i, qry, cache, &unauth_cnt, nsec_pmap,
				/* ADDITIONAL RRs are considered non-essential
				 * in our (resolver) answers */
				(psec == KNOT_ADDITIONAL ? NULL : &needs_pkt));
			if (ret) {
				VERBOSE_MSG(qry, "=> stashing RRs errored out\n");
				goto finally;
			}
			/* LATER(optim.): maybe filter out some type-rank combinations
			 * that won't be useful as separate RRsets. */
		}
	}

	trie_it_t *it;
	for (it = trie_it_begin(nsec_pmap); !trie_it_finished(it); trie_it_next(it)) {
		stash_nsec_p((const knot_dname_t *)trie_it_key(it, NULL),
				(const char *)*trie_it_val(it),
				cache, qry->timestamp.tv_sec, &req->pool, req->current_query);
	}
	trie_it_free(it);
	/* LATER(optim.): typically we also have corresponding NS record in the list,
	 * so we might save a cache operation. */
stash_packet:
	if (qry->flags.PKT_IS_SANE && check_dname_for_lf(knot_pkt_qname(pkt), qry)) {
		stash_pkt(pkt, qry, req, needs_pkt);
	}

finally:
	if (unauth_cnt) {
		VERBOSE_MSG(qry, "=> stashed also %d nonauth RRsets\n", unauth_cnt);
	}
	kr_cache_commit(cache);
	return ctx->state; /* we ignore cache-stashing errors */
}

/** Preliminary checks before stash_rrset(). Don't call if returns <= 0. */
static int stash_rrset_precond(const knot_rrset_t *rr, const struct kr_query *qry/*logs*/)
{
	if (kr_fails_assert(rr && rr->rclass == KNOT_CLASS_IN))
		return kr_error(EINVAL);
	if (!check_rrtype(rr->type, qry))
		return kr_ok();
	if (!check_dname_for_lf(rr->owner, qry))
		return kr_ok();
	return 1/*proceed*/;
}

/** Return true on some cases of NSEC* RRsets covering minimal ranges.
 * Also include some abnormal RR cases; qry is just for logging. */
static bool rrset_has_min_range_or_weird(const knot_rrset_t *rr, const struct kr_query *qry)
{
	if (rr->rrs.count != 1) {
		kr_assert(rr->rrs.count > 0);
		if (rr->type == KNOT_RRTYPE_NSEC || rr->type == KNOT_RRTYPE_NSEC3
		    || rr->rrs.count == 0) {
			return true; /*< weird */
		}
	}
	bool ret; /**< NOT used for the weird cases */
	if (rr->type == KNOT_RRTYPE_NSEC) {
		if (!check_dname_for_lf(rr->owner, qry))
			return true; /*< weird, probably filtered even before this point */
		ret = !check_dname_for_lf(knot_nsec_next(rr->rrs.rdata), qry);
		/* ^^ Zero inside the next-name label means it's probably a minimal range,
		 * and anyway it's problematic for our aggressive cache (comparisons).
		 * Real-life examples covered:
		 * NSEC: name -> \000.name (e.g. typical foobar.CloudFlare.net)
		 * NSEC: name -> name\000 (CloudFlare on delegations)
		 */
	} else if (rr->type == KNOT_RRTYPE_NSEC3) {
		if (knot_nsec3_next_len(rr->rrs.rdata) != NSEC3_HASH_LEN
		    || *rr->owner != NSEC3_HASH_TXT_LEN) {
			return true; /*< weird */
		}
		/* Let's work on the binary hashes. Find if they "differ by one",
		 * by constructing the owner hash incremented by one and comparing. */
		uint8_t owner_hash[NSEC3_HASH_LEN];
		if (base32hex_decode(rr->owner + 1, NSEC3_HASH_TXT_LEN,
					owner_hash, NSEC3_HASH_LEN) != NSEC3_HASH_LEN) {
			return true; /*< weird */
		}
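		/* Big-endian increment: add one to the last byte and let the carry
		 * propagate towards the front while bytes wrap around to zero. */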
		for (int i = NSEC3_HASH_LEN - 1; i >= 0; --i) {
			if (++owner_hash[i] != 0) break;
		}
		const uint8_t *next_hash = knot_nsec3_next(rr->rrs.rdata);
		ret = memcmp(owner_hash, next_hash, NSEC3_HASH_LEN) == 0;
	} else {
		return false;
	}
	if (ret) VERBOSE_MSG(qry, "=> minimized NSEC* range detected\n");
	return ret;
}

static ssize_t stash_rrset(struct kr_cache *cache, const struct kr_query *qry,
		const knot_rrset_t *rr, const knot_rrset_t *rr_sigs, uint32_t timestamp,
		uint8_t rank, trie_t *nsec_pmap, knot_mm_t *pool, bool *needs_pkt)
{
	if (kr_rank_test(rank, KR_RANK_BOGUS)) {
		WITH_VERBOSE(qry) {
			auto_free char *type_str = kr_rrtype_text(rr->type);
			VERBOSE_MSG(qry, "=> skipping bogus RR set %s\n", type_str);
		}
		return kr_ok();
	}
	if (rr->type == KNOT_RRTYPE_NSEC3 && rr->rrs.count
	    && knot_nsec3_iters(rr->rrs.rdata) > KR_NSEC3_MAX_ITERATIONS) {
		/* This shouldn't happen often, thanks to downgrades during validation. */
		VERBOSE_MSG(qry, "=> skipping NSEC3 with too many iterations\n");
		return kr_ok();
	}
	if (kr_fails_assert(cache && stash_rrset_precond(rr, qry) > 0))
		return kr_error(EINVAL);

	int ret = kr_ok();
	if (rrset_has_min_range_or_weird(rr, qry))
		goto return_needs_pkt;
	const int wild_labels = rr_sigs == NULL ? 0 :
		knot_dname_labels(rr->owner, NULL) - knot_rrsig_labels(rr_sigs->rrs.rdata);
	if (wild_labels < 0)
		goto return_needs_pkt;
	const knot_dname_t *encloser = rr->owner; /**< the closest encloser name */
	for (int i = 0; i < wild_labels; ++i) {
		encloser = knot_wire_next_label(encloser, NULL);
	}

	/* Construct the key under which RRs will be stored,
	 * and add corresponding nsec_pmap item (if necessary). */
	struct key k_storage, *k = &k_storage;
	knot_db_val_t key;
	switch (rr->type) {
	case KNOT_RRTYPE_NSEC3:
		/* Skip opt-out NSEC3 sets. */
		if (KNOT_NSEC3_FLAG_OPT_OUT & knot_nsec3_flags(rr->rrs.rdata))
			goto return_needs_pkt;
		/* fall through */
	case KNOT_RRTYPE_NSEC:
		/* Skip any NSEC*s that aren't validated or are suspicious. */
		if (!kr_rank_test(rank, KR_RANK_SECURE) || rr->rrs.count != 1)
			goto return_needs_pkt;
		if (kr_fails_assert(rr_sigs && rr_sigs->rrs.count && rr_sigs->rrs.rdata)) {
			ret = kr_error(EINVAL);
			goto return_needs_pkt;
		}
		const knot_dname_t *signer = knot_rrsig_signer_name(rr_sigs->rrs.rdata);
		const int signer_size = knot_dname_size(signer);
		k->zlf_len = signer_size - 1;

		void **npp = NULL;
		if (nsec_pmap) {
			npp = trie_get_ins(nsec_pmap, (const char *)signer, signer_size);
			if (kr_fails_assert(npp))
				return kr_error(ENOMEM);
		}
		if (rr->type == KNOT_RRTYPE_NSEC) {
			key = key_NSEC1(k, encloser, wild_labels);
			break;
		}

		kr_require(rr->type == KNOT_RRTYPE_NSEC3);
		const knot_rdata_t * const rdata = rr->rrs.rdata;
		if (rdata->len <= 4) {
			ret = kr_error(EILSEQ); /*< data from outside; less trust */
			goto return_needs_pkt;
		}
		const int np_dlen = nsec_p_rdlen(rdata->data);
		if (np_dlen > rdata->len) {
			ret = kr_error(EILSEQ);
			goto return_needs_pkt;
		}
		key = key_NSEC3(k, encloser, nsec_p_mkHash(rdata->data));
		if (npp && !*npp) {
			*npp = mm_alloc(pool, np_dlen);
			if (kr_fails_assert(*npp))
				break;
			memcpy(*npp, rdata->data, np_dlen);
		}
		break;
	default:
		ret = kr_dname_lf(k->buf, encloser, wild_labels);
		if (kr_fails_assert(ret == 0))
			goto return_needs_pkt;
		key = key_exact_type(k, rr->type);
	}

	/* Compute in-cache size for the new data. */
	const knot_rdataset_t *rds_sigs = rr_sigs ? &rr_sigs->rrs : NULL;
	const int rr_ssize = rdataset_dematerialize_size(&rr->rrs);
	if (kr_fails_assert(rr_ssize == to_even(rr_ssize)))
		return kr_error(EINVAL);
	knot_db_val_t val_new_entry = {
		.data = NULL,
		.len = offsetof(struct entry_h, data) + rr_ssize
			+ rdataset_dematerialize_size(rds_sigs),
	};

	/* Prepare raw memory for the new entry. */
	ret = entry_h_splice(&val_new_entry, rank, key, k->type, rr->type,
				rr->owner, qry, cache, timestamp);
	if (ret) return kr_ok(); /* some aren't really errors */
	if (kr_fails_assert(val_new_entry.data))
		return kr_error(EFAULT);

	const uint32_t ttl = rr->ttl;
	/* FIXME: consider TTLs and expirations of RRSIGs as well, just in case. */

	/* Write the entry itself. */
	struct entry_h *eh = val_new_entry.data;
	memset(eh, 0, offsetof(struct entry_h, data));
	eh->time = timestamp;
	eh->ttl = MAX(MIN(ttl, cache->ttl_max), cache->ttl_min);
	eh->rank = rank;
	rdataset_dematerialize(&rr->rrs, eh->data);
	rdataset_dematerialize(rds_sigs, eh->data + rr_ssize);
	if (kr_fails_assert(entry_h_consistent_E(val_new_entry, rr->type)))
		return kr_error(EINVAL);

#if 0	/* Occasionally useful when debugging some kinds of changes. */
	{
		kr_cache_commit(cache);
		knot_db_val_t val = { NULL, 0 };
		ret = cache_op(cache, read, &key, &val, 1);
		if (ret != kr_error(ENOENT)) { // ENOENT might happen in some edge case, I guess
			kr_assert(!ret);
			entry_list_t el;
			entry_list_parse(val, el);
		}
	}
#endif

	/* Verbose-log some not-too-common cases. */
	WITH_VERBOSE(qry) { if (kr_rank_test(rank, KR_RANK_AUTH)
	    || rr->type == KNOT_RRTYPE_NS) {
		auto_free char *type_str = kr_rrtype_text(rr->type),
			*encl_str = kr_dname_text(encloser);
		VERBOSE_MSG(qry, "=> stashed %s%s %s, rank 0%.2o, "
			"%d B total, incl. %d RRSIGs\n",
			(wild_labels ? "*." : ""), encl_str, type_str, rank,
			(int)val_new_entry.len, (rr_sigs ? rr_sigs->rrs.count : 0)
			);
	} }

	return (ssize_t) val_new_entry.len;
return_needs_pkt:
	if (needs_pkt) *needs_pkt = true;
	return ret;
}

static int stash_rrarray_entry(ranked_rr_array_t *arr, int arr_i,
		const struct kr_query *qry, struct kr_cache *cache,
		int *unauth_cnt, trie_t *nsec_pmap, bool *needs_pkt)
{
	ranked_rr_array_entry_t *entry = arr->at[arr_i];
	if (entry->cached) {
		return kr_ok();
	}
	const knot_rrset_t *rr = entry->rr;
	if (rr->type == KNOT_RRTYPE_RRSIG) {
		return kr_ok(); /* reduce verbose logging from the following call */
	}
	int ret = stash_rrset_precond(rr, qry);
	if (ret <= 0) {
		return ret;
	}

	/* Try to find corresponding signatures, always. LATER(optim.): speed. */
	ranked_rr_array_entry_t *entry_rrsigs = NULL;
	const knot_rrset_t *rr_sigs = NULL;
	for (ssize_t j = arr->len - 1; j >= 0; --j) {
		/* TODO: ATM we assume that some properties are the same
		 * for all RRSIGs in the set (esp. label count). */
		ranked_rr_array_entry_t *e = arr->at[j];
		if (kr_fails_assert(!e->in_progress))
			return kr_error(EINVAL);
		bool ok = e->qry_uid == qry->uid && !e->cached
			&& e->rr->type == KNOT_RRTYPE_RRSIG
			&& knot_rrsig_type_covered(e->rr->rrs.rdata) == rr->type
			&& knot_dname_is_equal(rr->owner, e->rr->owner);
		if (!ok) continue;
		entry_rrsigs = e;
		rr_sigs = e->rr;
		break;
	}

	ssize_t written = stash_rrset(cache, qry, rr, rr_sigs, qry->timestamp.tv_sec,
			entry->rank, nsec_pmap, &qry->request->pool, needs_pkt);
	if (written < 0) {
		kr_log_error(CACHE, "[%05u.%02u] stash failed, ret = %d\n", qry->request->uid,
				qry->uid, (int)written);
		return (int) written;
	}

	if (written > 0) {
		/* Mark entry as cached for the rest of the query processing */
		entry->cached = true;
		if (entry_rrsigs) {
			entry_rrsigs->cached = true;
		}
		if (!kr_rank_test(entry->rank, KR_RANK_AUTH) && rr->type != KNOT_RRTYPE_NS) {
			*unauth_cnt += 1;
		}
	}

	return kr_ok();
}

static int stash_nsec_p(const knot_dname_t *dname, const char *nsec_p_v,
		struct kr_cache *cache, uint32_t timestamp, knot_mm_t *pool,
		const struct kr_query *qry/*logging*/)
{
	uint32_t valid_until = timestamp + cache->ttl_max;
	/* LATER(optim.): be more precise here ^^ and reduce calls. */
	static const int32_t ttl_margin = 3600;
	const uint8_t *nsec_p = (const uint8_t *)nsec_p_v;
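	/* Each nsec_p slot in the entry list is laid out as a 4-byte valid_until
	 * timestamp followed by the NSEC3 parameters, if any (plain NSEC stores
	 * just the timestamp); data_stride below is the total of the two. */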
	int data_stride = sizeof(valid_until) + nsec_p_rdlen(nsec_p);

	unsigned int log_hash = 0xFeeeFeee; /* this type is simpler for printf args */
	auto_free char *log_dname = NULL;
	WITH_VERBOSE(qry) {
		log_hash = nsec_p_v ? nsec_p_mkHash((const uint8_t *)nsec_p_v) : 0;
		log_dname = kr_dname_text(dname);
	}
	/* Find what's in the cache. */
	struct key k_storage, *k = &k_storage;
	int ret = kr_dname_lf(k->buf, dname, false);
	if (ret) return kr_error(ret);
	knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_NS);
	knot_db_val_t val_orig = { NULL, 0 };
	ret = cache_op(cache, read, &key, &val_orig, 1);
	if (ret && ret != -ABS(ENOENT)) {
		VERBOSE_MSG(qry, "=> EL read failed (ret: %d)\n", ret);
		return kr_ok();
	}
	/* Prepare new entry_list_t so we can just write at el[0]. */
	entry_list_t el;
	int log_refresh_by = 0;
	if (ret == -ABS(ENOENT)) {
		memset(el, 0, sizeof(el));
	} else {
		ret = entry_list_parse(val_orig, el);
		if (ret) {
			VERBOSE_MSG(qry, "=> EL parse failed (ret: %d)\n", ret);
			return kr_error(0);
		}
		/* Find the index to replace. */
		int i_replace = ENTRY_APEX_NSECS_CNT - 1;
		for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
			if (el[i].len != data_stride) continue;
			if (nsec_p && memcmp(nsec_p, (uint8_t *)el[i].data + sizeof(uint32_t),
						data_stride - sizeof(uint32_t)) != 0) {
				continue;
			}
			/* Save a cache operation if TTL extended only a little. */
			uint32_t valid_orig;
			memcpy(&valid_orig, el[i].data, sizeof(valid_orig));
			const int32_t ttl_extended_by = valid_until - valid_orig;
			if (ttl_extended_by < ttl_margin) {
				VERBOSE_MSG(qry,
					"=> nsec_p stash for %s skipped (extra TTL: %d, hash: %x)\n",
					log_dname, ttl_extended_by, log_hash);
				return kr_ok();
			}
			i_replace = i;
			log_refresh_by = ttl_extended_by;
			break;
		}
		/* Shift the other indices: move the first `i_replace` blocks
		 * by one position. */
		if (i_replace) {
			memmove(&el[1], &el[0], sizeof(el[0]) * i_replace);
		}
	}
	/* Prepare old data into a buffer. See entry_h_splice() for why. LATER(optim.) */
	el[0].len = data_stride;
	el[0].data = NULL;
	knot_db_val_t val;
	val.len = entry_list_serial_size(el);
	val.data = mm_alloc(pool, val.len);
	entry_list_memcpy(val.data, el);
	/* Prepare the new data chunk */
	memcpy(el[0].data, &valid_until, sizeof(valid_until));
	if (nsec_p) {
		memcpy((uint8_t *)el[0].data + sizeof(valid_until), nsec_p,
			data_stride - sizeof(valid_until));
	}
	/* Write it all to the cache */
	ret = cache_op(cache, write, &key, &val, 1);
	mm_free(pool, val.data);
	if (ret || !val.data) {
		VERBOSE_MSG(qry, "=> EL write failed (ret: %d)\n", ret);
		return kr_ok();
	}
	if (log_refresh_by) {
		VERBOSE_MSG(qry, "=> nsec_p stashed for %s (refresh by %d, hash: %x)\n",
			log_dname, log_refresh_by, log_hash);
	} else {
		VERBOSE_MSG(qry, "=> nsec_p stashed for %s (new, hash: %x)\n",
			log_dname, log_hash);
	}
	return kr_ok();
}

int kr_cache_insert_rr(struct kr_cache *cache,
		const knot_rrset_t *rr, const knot_rrset_t *rrsig,
		uint8_t rank, uint32_t timestamp, bool ins_nsec_p)
{
	int err = stash_rrset_precond(rr, NULL);
	if (err <= 0) {
		return kr_ok();
	}

	trie_t *nsec_pmap = NULL;
	knot_mm_t *pool = NULL;
	if (ins_nsec_p && (rr->type == KNOT_RRTYPE_NSEC || rr->type == KNOT_RRTYPE_NSEC3)) {
		pool = mm_ctx_mempool2(4096);
		nsec_pmap = trie_create(pool);
		kr_assert(pool && nsec_pmap);
	}

	ssize_t written = stash_rrset(cache, NULL, rr, rrsig, timestamp, rank,
			nsec_pmap, pool, NULL);

	if (nsec_pmap) {
		trie_it_t *it;
		for (it = trie_it_begin(nsec_pmap); !trie_it_finished(it); trie_it_next(it)) {
			stash_nsec_p((const knot_dname_t *)trie_it_key(it, NULL),
					(const char *)*trie_it_val(it),
					cache, timestamp, pool, NULL);
		}
		trie_it_free(it);
		mm_ctx_delete(pool);
	}

	if (written >= 0) {
		return kr_ok();
	}

	return (int) written;
}
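
#if 0	/* Illustrative sketch only (not compiled): feeding one A record into the cache
	 * via kr_cache_insert_rr(). The libknot helpers and the KR_RANK_INSECURE rank
	 * are believed correct, but verify the exact signatures before reusing this. */
	knot_dname_t *owner = knot_dname_from_str_alloc("example.net.");
	knot_rrset_t rr;
	knot_rrset_init(&rr, owner, KNOT_RRTYPE_A, KNOT_CLASS_IN, 300 /* TTL */);
	static const uint8_t addr[4] = { 192, 0, 2, 1 };
	knot_rrset_add_rdata(&rr, addr, sizeof(addr), NULL);
	kr_cache_insert_rr(cache, &rr, NULL /* no RRSIG */, KR_RANK_INSECURE,
			time(NULL), false);
	knot_rrset_clear(&rr, NULL); /* also frees the owner */
#endif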

static int peek_exact_real(struct kr_cache *cache, const knot_dname_t *name, uint16_t type,
		struct kr_cache_p *peek)
{
	if (!check_rrtype(type, NULL) || !check_dname_for_lf(name, NULL)) {
		return kr_error(ENOTSUP);
	}
	struct key k_storage, *k = &k_storage;

	int ret = kr_dname_lf(k->buf, name, false);
	if (ret) return kr_error(ret);

	knot_db_val_t key = key_exact_type(k, type);
	knot_db_val_t val = { NULL, 0 };
	ret = cache_op(cache, read, &key, &val, 1);
	if (!ret) ret = entry_h_seek(&val, type);
	if (ret) return kr_error(ret);

	const struct entry_h *eh = entry_h_consistent_E(val, type);
	if (!eh || eh->is_packet) {
		// TODO: no packets, but better get rid of whole kr_cache_peek_exact().
		return kr_error(ENOENT);
	}
	*peek = (struct kr_cache_p){
		.time = eh->time,
		.ttl = eh->ttl,
		.rank = eh->rank,
		.raw_data = val.data,
		.raw_bound = knot_db_val_bound(val),
	};
	return kr_ok();
}
int kr_cache_peek_exact(struct kr_cache *cache, const knot_dname_t *name, uint16_t type,
		struct kr_cache_p *peek)
{	/* Just wrap with extra verbose logging. */
	const int ret = peek_exact_real(cache, name, type, peek);
	if (false && kr_log_is_debug(CACHE, NULL)) { /* too noisy for usual --verbose */
		auto_free char *type_str = kr_rrtype_text(type),
			*name_str = kr_dname_text(name);
		const char *result_str = (ret == kr_ok() ? "hit" :
			(ret == kr_error(ENOENT) ? "miss" : "error"));
		VERBOSE_MSG(NULL, "_peek_exact: %s %s %s (ret: %d)",
			type_str, name_str, result_str, ret);
	}
	return ret;
}

int kr_cache_remove(struct kr_cache *cache, const knot_dname_t *name, uint16_t type)
{
	if (!cache_isvalid(cache)) {
		return kr_error(EINVAL);
	}
	if (!cache->api->remove) {
		return kr_error(ENOSYS);
	}
	struct key k_storage, *k = &k_storage;
	int ret = kr_dname_lf(k->buf, name, false);
	if (ret) return kr_error(ret);

	knot_db_val_t key = key_exact_type(k, type);
	return cache_op(cache, remove, &key, 1);
}

int kr_cache_match(struct kr_cache *cache, const knot_dname_t *name,
		bool exact_name, knot_db_val_t keyval[][2], int maxcount)
{
	if (!cache_isvalid(cache)) {
		return kr_error(EINVAL);
	}
	if (!cache->api->match) {
		return kr_error(ENOSYS);
	}

	struct key k_storage, *k = &k_storage;

	int ret = kr_dname_lf(k->buf, name, false);
	if (ret) return kr_error(ret);

	// use a mock type
	knot_db_val_t key = key_exact_type(k, KNOT_RRTYPE_A);
	/* CACHE_KEY_DEF */
	key.len -= sizeof(uint16_t); /* the type */
	if (!exact_name) {
		key.len -= 2; /* '\0' 'E' */
		if (name[0] == '\0') ++key.len; /* the root name is special ATM */
	}
	return cache_op(cache, match, &key, keyval, maxcount);
}

int kr_unpack_cache_key(knot_db_val_t key, knot_dname_t *buf, uint16_t *type)
{
	if (key.data == NULL || buf == NULL || type == NULL) {
		return kr_error(EINVAL);
	}

	int len = -1;
	const char *tag, *key_data = key.data;
	for (tag = key_data + 1; tag < key_data + key.len; ++tag) {
		/* CACHE_KEY_DEF */
		if (tag[-1] == '\0' && (tag == key_data + 1 || tag[-2] == '\0')) {
			if (tag[0] != 'E') return kr_error(EINVAL);
			len = tag - 1 - key_data;
			break;
		}
	}

	if (len == -1 || len > KNOT_DNAME_MAXLEN) {
		return kr_error(EINVAL);
	}

	int ret = knot_dname_lf2wire(buf, len, key.data);
	if (ret < 0) {
		return kr_error(ret);
	}

	/* CACHE_KEY_DEF: jump over "\0 E/1" */
	memcpy(type, tag + 1, sizeof(uint16_t));

	return kr_ok();
}


int kr_cache_remove_subtree(struct kr_cache *cache, const knot_dname_t *name,
		bool exact_name, int maxcount)
{
	if (!cache_isvalid(cache)) {
		return kr_error(EINVAL);
	}

	knot_db_val_t keyval[maxcount][2], keys[maxcount];
	int ret = kr_cache_match(cache, name, exact_name, keyval, maxcount);
	if (ret <= 0) { /* ENOENT -> nothing to remove */
		return (ret == KNOT_ENOENT) ? 0 : ret;
	}
	const int count = ret;
	/* Duplicate the key strings, as deletion may invalidate the pointers. */
	int i;
	for (i = 0; i < count; ++i) {
		keys[i].len = keyval[i][0].len;
		keys[i].data = malloc(keys[i].len);
		if (!keys[i].data) {
			ret = kr_error(ENOMEM);
			goto cleanup;
		}
		memcpy(keys[i].data, keyval[i][0].data, keys[i].len);
	}
	ret = cache_op(cache, remove, keys, count);
cleanup:
	kr_cache_commit(cache); /* Sync even after just kr_cache_match(). */
	/* Free keys */
	while (--i >= 0) {
		free(keys[i].data);
	}
	return ret;
}

static void health_timer_cb(uv_timer_t *health_timer)
{
	struct kr_cache *cache = health_timer->data;
	if (cache)
		cache_op(cache, check_health);
	/* We don't do anything with the return code. For example, in some situations
	 * the file may not exist (temporarily), and we just expect to be more lucky
	 * when the timer fires again. */
}

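/* Summary of the interval semantics implemented below: interval == 0 performs a
 * single immediate check, interval < 0 stops a previously started timer (if any),
 * and interval > 0 (re)starts periodic checks every `interval` milliseconds on the
 * default libuv loop. */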
int kr_cache_check_health(struct kr_cache *cache, int interval)
{
	if (interval == 0)
		return cache_op(cache, check_health);
	if (interval < 0) {
		if (!cache->health_timer)
			return kr_ok(); // tolerate stopping a "stopped" timer
		uv_close((uv_handle_t *)cache->health_timer, (uv_close_cb)free);
		cache->health_timer->data = NULL;
		cache->health_timer = NULL;
		return kr_ok();
	}

	if (!cache->health_timer) {
		/* We avoid depending on daemon's symbols by using uv_default_loop. */
		cache->health_timer = malloc(sizeof(*cache->health_timer));
		if (!cache->health_timer) return kr_error(ENOMEM);
		uv_loop_t *loop = uv_default_loop();
		kr_require(loop);
		int ret = uv_timer_init(loop, cache->health_timer);
		if (ret) {
			free(cache->health_timer);
			cache->health_timer = NULL;
			return kr_error(ret);
		}
		cache->health_timer->data = cache;
	}
	kr_assert(cache->health_timer->data);
	return kr_error(uv_timer_start(cache->health_timer, health_timer_cb, interval, interval));
}
