1 /* Copyright (C) 2017 CZ.NIC, z.s.p.o. <knot-dns@labs.nic.cz>
2 * SPDX-License-Identifier: GPL-3.0-or-later
3 */
4
5 /** @file
6 * Implementation of chaining in struct entry_h. Prototypes in ./impl.h
7 */
8
9 #include "lib/cache/impl.h"
10 #include "lib/utils.h"
11
12
13 static int entry_h_len(knot_db_val_t val);
14
15
/** Serialize an entry list into a pre-allocated apex structure.
 *
 * Copies every element of \a list into \a ea->data back to back
 * (with even-length padding between entries) and fills the apex
 * header flags.  For elements with NULL .data, no copy happens;
 * instead list[i].data is set to the destination position, so the
 * caller can write that part afterwards.
 */
void entry_list_memcpy(struct entry_apex *ea, entry_list_t list)
{
	if (kr_fails_assert(ea))
		return;
	/* Zero the fixed-size header portion of the apex. */
	memset(ea, 0, offsetof(struct entry_apex, data));
	ea->has_ns    = list[EL_NS   ].len;
	ea->has_cname = list[EL_CNAME].len;
	ea->has_dname = list[EL_DNAME].len;
	for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
		/* Encode the nsec parameter kind: 0 = absent,
		 * 1 = timestamp only (4 bytes), 3 = timestamp + NSEC3PARAM. */
		if (list[i].len == 0) {
			ea->nsecs[i] = 0;
		} else if (list[i].len == 4) {
			ea->nsecs[i] = 1;
		} else {
			ea->nsecs[i] = 3;
		}
	}
	uint8_t *dst = ea->data;
	for (int i = 0; i < EL_LENGTH; ++i) {
		if (list[i].data) {
			memcpy(dst, list[i].data, list[i].len);
			/* LATER(optim.): coalesce consecutive writes? */
		} else {
			/* Report the destination back to the caller. */
			list[i].data = dst;
		}
		dst += to_even(list[i].len);
	}
}
39
/** Parse a serialized entry list out of a raw cache value.
 *
 * Fills \a list with pointers *into* \a val.data (no copying);
 * the entries remain valid only as long as the backing value does.
 *
 * \param val  The raw value read from the cache backend (read only).
 * \param list Output: one (data,len) slice per EL_* slot.
 * \return kr_ok(), kr_error(EINVAL) on bad arguments,
 *         or kr_error(EILSEQ) if the serialization is malformed.
 */
int entry_list_parse(const knot_db_val_t val, entry_list_t list)
{
	if (kr_fails_assert(val.data && val.len && list))
		return kr_error(EINVAL);
	/* Parse the apex itself (nsec parameters). */
	const struct entry_apex *ea = entry_apex_consistent(val);
	if (!ea) {
		return kr_error(EILSEQ);
	}
	const uint8_t *it = ea->data,
		*it_bound = knot_db_val_bound(val);
	/* First the fixed slots for nsec parameters; their encoding
	 * (0/1/3) was chosen by entry_list_memcpy(). */
	for (int i = 0; i < ENTRY_APEX_NSECS_CNT; ++i) {
		if (it > it_bound) {
			return kr_error(EILSEQ);
		}
		list[i].data = (void *)it;
		switch (ea->nsecs[i]) {
		case 0:
			list[i].len = 0;
			break;
		case 1:
			list[i].len = sizeof(uint32_t); /* just timestamp */
			break;
		case 3: { /* timestamp + NSEC3PARAM wire */
			/* 4 is the minimum length of an NSEC3PARAM RDATA wire
			 * (fixed header before the variable salt). */
			if (it + sizeof(uint32_t) + 4 > it_bound) {
				return kr_error(EILSEQ);
			}
			list[i].len = sizeof(uint32_t)
				+ nsec_p_rdlen(it + sizeof(uint32_t));
			break;
		}
		default:
			return kr_error(EILSEQ);
		};
		/* Entries are stored with even-length padding. */
		it += to_even(list[i].len);
	}
	/* Parse every entry_h (NS / CNAME / DNAME slots). */
	for (int i = ENTRY_APEX_NSECS_CNT; i < EL_LENGTH; ++i) {
		list[i].data = (void *)it;
		bool has_type;
		switch (i) {
		case EL_NS:	has_type = ea->has_ns;		break;
		case EL_CNAME:	has_type = ea->has_cname;	break;
		case EL_DNAME:	has_type = ea->has_dname;	break;
		default:
			kr_assert(!EINVAL);
			return kr_error(EINVAL); /* something very bad */
		}
		if (!has_type) {
			list[i].len = 0;
			continue;
		}
		if (kr_fails_assert(it < it_bound))
			return kr_error(EILSEQ);
		const int len = entry_h_len(
			(knot_db_val_t){ .data = (void *)it, .len = it_bound - it });
		if (kr_fails_assert(len >= 0))
			return kr_error(len);
		list[i].len = len;
		it += to_even(len);
	}
	/* The whole value must be consumed exactly; anything else
	 * indicates corruption, so refuse to use it. */
	if (kr_fails_assert(it == it_bound)) /* better not use it; might be "damaged" */
		return kr_error(EILSEQ);
	return kr_ok();
}
105
/** Given a valid entry header, find its length (i.e. offset of the next entry).
 * \param val The beginning of the data and the bound (read only).
 * \return the length in bytes, or kr_error(EINVAL)/kr_error(EILSEQ).
 */
static int entry_h_len(const knot_db_val_t val)
{
	/* Cast via ssize_t guards against a huge/garbage .len
	 * being treated as a valid positive size. */
	const bool ok = val.data && ((ssize_t)val.len) > 0;
	if (!ok) return kr_error(EINVAL);
	const struct entry_h *eh = val.data;
	const uint8_t *d = eh->data; /* iterates over the data in entry */
	const uint8_t *data_bound = knot_db_val_bound(val);
	if (d >= data_bound) return kr_error(EILSEQ);
	if (!eh->is_packet) { /* Positive RRset + its RRsig set (may be empty). */
		int sets = 2;
		while (sets-- > 0) {
			/* Each set: RR count prefix + dematerialized rdataset. */
			d += KR_CACHE_RR_COUNT_SIZE + rdataset_dematerialized_size(d, NULL);
			if (kr_fails_assert(d <= data_bound))
				return kr_error(EILSEQ);
		}
	} else { /* A "packet" (opaque ATM). */
		/* Layout: uint16_t length prefix, then `len` opaque bytes,
		 * padded to even length. */
		uint16_t len;
		if (d + sizeof(len) > data_bound) return kr_error(EILSEQ);
		memcpy(&len, d, sizeof(len));
		d += 2 + to_even(len);
	}
	if (kr_fails_assert(d <= data_bound))
		return kr_error(EILSEQ);
	return d - (uint8_t *)val.data;
}
134
/** Interpret a raw cache value as an apex entry.
 * NOTE: no consistency checking is actually performed yet (see XXX);
 * callers currently rely on entry_list_parse() to reject bad data,
 * and a NULL return is only possible when val.data itself is NULL.
 */
struct entry_apex * entry_apex_consistent(knot_db_val_t val)
{
	//XXX: check lengths, etc.
	return val.data;
}
140
141 /* See the header file. */
/* See the header file. */
int entry_h_seek(knot_db_val_t *val, uint16_t type)
{
	int idx;
	switch (type) {
	case KNOT_RRTYPE_NS:	idx = EL_NS;	break;
	case KNOT_RRTYPE_CNAME:	idx = EL_CNAME;	break;
	case KNOT_RRTYPE_DNAME:	idx = EL_DNAME;	break;
	default:
		/* Other types aren't stored in multi-entry lists;
		 * *val already points at the single entry. */
		return kr_ok();
	}

	entry_list_t list;
	const int err = entry_list_parse(*val, list);
	if (err)
		return err;
	*val = list[idx];
	if (!val->len)
		return kr_error(ENOENT);
	return kr_ok();
}
158
/** Write a value into the cache backend; on persistent failure,
 * recover by clearing the whole cache (kresd cache may be lossy).
 *
 * Transient LMDB write errors are tolerated for a randomized 5-9 s
 * window (tracked via a static timestamp shared by all calls);
 * only after errors persist past that window do we clear.
 *
 * \return kr_ok() on success, kr_error(ENOSPC) after a cache clear,
 *         or the backend's error code while still being patient.
 *         Aborts the process if clearing the cache itself fails.
 */
static int cache_write_or_clear(struct kr_cache *cache, const knot_db_val_t *key,
				knot_db_val_t *val, const struct kr_query *qry)
{
	static uint64_t ignoring_errors_until = 0; /// zero or a timestamp
	int ret = cache_op(cache, write, key, val, 1);
	if (!ret) {
		ignoring_errors_until = 0;
		return kr_ok();
	}
	VERBOSE_MSG(qry, "=> failed backend write, ret = %d\n", ret);

	if (ret == kr_error(ENOSPC) && cache->api->usage_percent(cache->db) > 90) {
		// Cache seems overfull.  Maybe kres-cache-gc service doesn't work.
		goto recovery;
	}

	/* If we get ENOSPC with usage < 90% (especially just above 80% when GC fires),
	 * it most likely isn't real overfull state but some LMDB bug related
	 * to transactions.  Upstream seems unlikely to address it:
	   https://lists.openldap.org/hyperkitty/list/openldap-technical@openldap.org/thread/QHOTE2Y3WZ6E7J27OOKI44P344ETUOSF/
	 *
	 * In real life we see all processes getting a LMDB failure
	 * but it should recover after the transactions get reopened.
	 *
	 * Fortunately the kresd cache can afford to be slightly lossy,
	 * so we ignore this and other errors for a short while.
	 */
	const uint64_t now = kr_now();
	if (!ignoring_errors_until) { // First error after a success.
		kr_log_info(CACHE, "LMDB refusing writes (ignored for 5-9s): %s\n",
				kr_strerror(ret));
		/* kr_rand_bytes(2)/16 spreads the deadline over ~0-4 s
		 * so multiple processes don't all clear at once. */
		ignoring_errors_until = now + 5000 + kr_rand_bytes(2)/16;
		return kr_error(ret);
	}
	if (now < ignoring_errors_until)
		return kr_error(ret);
	// We've lost patience with cache writes not working continuously.

recovery: // Try to recover by clearing cache.
	ret = kr_cache_clear(cache);
	switch (ret) {
	default:
		kr_log_crit(CACHE, "CRITICAL: clearing cache failed: %s; fatal error, aborting\n",
				kr_strerror(ret));
		abort();
	case 0:
		kr_log_info(CACHE, "stuck cache cleared\n");
		ignoring_errors_until = 0;
		/* Deliberate fall-through: even a successful clear reports
		 * ENOSPC so the caller knows this write was dropped. */
	case -EAGAIN: // fall-through; krcachelock race -> retry later
		return kr_error(ENOSPC);
	}
}
211
212
213 /* See the header file. */
/* See the header file. */
int entry_h_splice(
	knot_db_val_t *val_new_entry, uint8_t rank,
	const knot_db_val_t key, const uint16_t ktype, const uint16_t type,
	const knot_dname_t *owner/*log only*/,
	const struct kr_query *qry, struct kr_cache *cache, uint32_t timestamp)
{
	//TODO: another review, perhaps including the API
	if (kr_fails_assert(val_new_entry && val_new_entry->len > 0))
		return kr_error(EINVAL);

	/* Types stored in a multi-entry list get an EL_* index;
	 * i_type == 0 means a plain single-entry value. */
	int i_type;
	switch (type) {
	case KNOT_RRTYPE_NS:	i_type = EL_NS;		break;
	case KNOT_RRTYPE_CNAME:	i_type = EL_CNAME;	break;
	case KNOT_RRTYPE_DNAME:	i_type = EL_DNAME;	break;
	default:		i_type = 0;
	}

	/* Get eh_orig (original entry), and also el list if multi-entry case. */
	const struct entry_h *eh_orig = NULL;
	entry_list_t el;
	int ret = -1;
	if (!kr_rank_test(rank, KR_RANK_SECURE) || ktype == KNOT_RRTYPE_NS) {
		/* We may need the old entry: either to compare ranks
		 * (non-secure data must not displace better data), or to
		 * preserve the other list members (NS-keyed values). */
		knot_db_val_t val;
		ret = cache_op(cache, read, &key, &val, 1);
		if (i_type) {
			if (!ret) ret = entry_list_parse(val, el);
			if (ret) memset(el, 0, sizeof(el));
			val = el[i_type];
		}
		/* val is on the entry, in either case (or error) */
		if (!ret) {
			eh_orig = entry_h_consistent_E(val, type);
		}
	} else {
		/* We want to fully overwrite the entry, so don't even read it. */
		memset(el, 0, sizeof(el));
	}

	if (!kr_rank_test(rank, KR_RANK_SECURE) && eh_orig) {
		/* If equal rank was accepted, spoofing a *single* answer would be
		 * enough to e.g. override NS record in AUTHORITY section.
		 * This way they would have to hit the first answer
		 * (whenever TTL nears expiration).
		 * Stale-serving is NOT considered, but TTL 1 would be considered
		 * as expiring anyway, ... */
		int32_t old_ttl = get_new_ttl(eh_orig, qry, NULL, 0, timestamp);
		if (old_ttl > 0 && !is_expiring(eh_orig->ttl, old_ttl)
		    && rank <= eh_orig->rank) {
			WITH_VERBOSE(qry) {
				auto_free char *type_str = kr_rrtype_text(type),
					*owner_str = kr_dname_text(owner);
				VERBOSE_MSG(qry, "=> not overwriting %s %s\n",
						type_str, owner_str);
			}
			return kr_error(EEXIST);
		}
	}

	if (!i_type) {
		/* The non-list types are trivial now. */
		return cache_write_or_clear(cache, &key, val_new_entry, qry);
	}
	/* Now we're in trouble.  In some cases, parts of data to be written
	 * is an lmdb entry that may be invalidated by our write request.
	 * (lmdb does even in-place updates!) Therefore we copy all into a buffer.
	 * LATER(optim.): do this only when necessary, or perhaps another approach.
	 * This is also complicated by the fact that the val_new_entry part
	 * is to be written *afterwards* by the caller.
	 */
	el[i_type] = (knot_db_val_t){
		.len = val_new_entry->len,
		.data = NULL, /* perhaps unclear in the entry_h_splice() API */
	};
	knot_db_val_t val = {
		.len = entry_list_serial_size(el),
		.data = NULL,
	};
	/* NOTE: VLA sized by the serialized list; entry sizes are bounded
	 * by what fits into a cache value, but large values grow the stack. */
	uint8_t buf[val.len];
	/* Serialize into buf; el[i_type].data is set to the spot reserved
	 * for the new entry (relative to buf). */
	entry_list_memcpy((struct entry_apex *)buf, el);
	/* Reserve space in the DB first (val.data gets filled in),
	 * then copy our buffered serialization into it. */
	ret = cache_write_or_clear(cache, &key, &val, qry);
	if (ret) return kr_error(ret);
	memcpy(val.data, buf, val.len); /* we also copy the "empty" space, but well... */
	/* Translate the reserved spot from buf-relative to DB-relative,
	 * so the caller can write the new entry in place. */
	val_new_entry->data = (uint8_t *)val.data
		+ ((uint8_t *)el[i_type].data - buf);
	return kr_ok();
}
301
302