xref: /freebsd/usr.sbin/makefs/zfs/zap.c (revision 4e8d558c)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022 The FreeBSD Foundation
5  *
6  * This software was developed by Mark Johnston under sponsorship from
7  * the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are
11  * met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/endian.h>
33 
34 #include <assert.h>
35 #include <stddef.h>
36 #include <stdlib.h>
37 #include <string.h>
38 
39 #include <util.h>
40 
41 #include "makefs.h"
42 #include "zfs.h"
43 
/*
 * One key-value pair queued for insertion into a ZAP object.  The value is an
 * array of "intcnt" integers, each "intsz" bytes wide, reachable through the
 * anonymous pointer union below.
 */
struct zfs_zap_entry {
	char		*name;		/* key string; owned by the entry */
	uint64_t	hash;		/* hash of the key */
	union {
		uint8_t	 *valp;
		uint16_t *val16p;
		uint32_t *val32p;
		uint64_t *val64p;
	};				/* value array, typed by intsz */
	uint64_t	val64;		/* inline storage for a lone uint64 */
	size_t		intsz;		/* bytes per integer: 1, 2, 4 or 8 */
	size_t		intcnt;		/* number of integers in the value */
	STAILQ_ENTRY(zfs_zap_entry) next;	/* linkage on the ZAP's list */
};
typedef struct zfs_zap_entry zfs_zap_entry_t;
58 
59 struct zfs_zap {
60 	STAILQ_HEAD(, zfs_zap_entry) kvps;
61 	uint64_t	hashsalt;	/* key hash input */
62 	unsigned long	kvpcnt;		/* number of key-value pairs */
63 	unsigned long	chunks;		/* count of chunks needed for fat ZAP */
64 	bool		micro;		/* can this be a micro ZAP? */
65 
66 	dnode_phys_t	*dnode;		/* backpointer */
67 	zfs_objset_t	*os;		/* backpointer */
68 };
69 
70 static uint16_t
71 zap_entry_chunks(zfs_zap_entry_t *ent)
72 {
73 	return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) +
74 	    howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES));
75 }
76 
77 static uint64_t
78 zap_hash(uint64_t salt, const char *name)
79 {
80 	static uint64_t crc64_table[256];
81 	const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
82 	const uint8_t *cp;
83 	uint64_t crc;
84 	uint8_t c;
85 
86 	assert(salt != 0);
87 	if (crc64_table[128] == 0) {
88 		for (int i = 0; i < 256; i++) {
89 			uint64_t *t;
90 
91 			t = crc64_table + i;
92 			*t = i;
93 			for (int j = 8; j > 0; j--)
94 				*t = (*t >> 1) ^ (-(*t & 1) & crc64_poly);
95 		}
96 	}
97 	assert(crc64_table[128] == crc64_poly);
98 
99 	for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++)
100 		crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF];
101 
102 	/*
103 	 * Only use 28 bits, since we need 4 bits in the cookie for the
104 	 * collision differentiator.  We MUST use the high bits, since
105 	 * those are the ones that we first pay attention to when
106 	 * choosing the bucket.
107 	 */
108 	crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
109 
110 	return (crc);
111 }
112 
113 zfs_zap_t *
114 zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode)
115 {
116 	zfs_zap_t *zap;
117 
118 	zap = ecalloc(1, sizeof(*zap));
119 	STAILQ_INIT(&zap->kvps);
120 	zap->hashsalt = ((uint64_t)random() << 32) | random();
121 	zap->micro = true;
122 	zap->kvpcnt = 0;
123 	zap->chunks = 0;
124 	zap->dnode = dnode;
125 	zap->os = os;
126 	return (zap);
127 }
128 
129 void
130 zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
131     const uint8_t *val)
132 {
133 	zfs_zap_entry_t *ent;
134 
135 	assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
136 	assert(strlen(name) + 1 <= ZAP_MAXNAMELEN);
137 	assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN);
138 
139 	ent = ecalloc(1, sizeof(*ent));
140 	ent->name = estrdup(name);
141 	ent->hash = zap_hash(zap->hashsalt, ent->name);
142 	ent->intsz = intsz;
143 	ent->intcnt = intcnt;
144 	if (intsz == sizeof(uint64_t) && intcnt == 1) {
145 		/*
146 		 * Micro-optimization to elide a memory allocation in that most
147 		 * common case where this is a directory entry.
148 		 */
149 		ent->val64p = &ent->val64;
150 	} else {
151 		ent->valp = ecalloc(intcnt, intsz);
152 	}
153 	memcpy(ent->valp, val, intcnt * intsz);
154 	zap->kvpcnt++;
155 	zap->chunks += zap_entry_chunks(ent);
156 	STAILQ_INSERT_TAIL(&zap->kvps, ent, next);
157 
158 	if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) ||
159 	    strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX))
160 		zap->micro = false;
161 }
162 
163 void
164 zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
165 {
166 	zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val);
167 }
168 
169 void
170 zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
171 {
172 	zap_add(zap, name, 1, strlen(val) + 1, val);
173 }
174 
175 bool
176 zap_entry_exists(zfs_zap_t *zap, const char *name)
177 {
178 	zfs_zap_entry_t *ent;
179 
180 	STAILQ_FOREACH(ent, &zap->kvps, next) {
181 		if (strcmp(ent->name, name) == 0)
182 			return (true);
183 	}
184 	return (false);
185 }
186 
187 static void
188 zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
189 {
190 	dnode_phys_t *dnode;
191 	zfs_zap_entry_t *ent;
192 	mzap_phys_t *mzap;
193 	mzap_ent_phys_t *ment;
194 	off_t bytes, loc;
195 
196 	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
197 	mzap = (mzap_phys_t *)&zfs->filebuf[0];
198 	mzap->mz_block_type = ZBT_MICRO;
199 	mzap->mz_salt = zap->hashsalt;
200 	mzap->mz_normflags = 0;
201 
202 	bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment);
203 	assert(bytes <= (off_t)MZAP_MAX_BLKSZ);
204 
205 	ment = &mzap->mz_chunk[0];
206 	STAILQ_FOREACH(ent, &zap->kvps, next) {
207 		memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt);
208 		ment->mze_cd = 0; /* XXX-MJ */
209 		strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name));
210 		ment++;
211 	}
212 
213 	loc = objset_space_alloc(zfs, zap->os, &bytes);
214 
215 	dnode = zap->dnode;
216 	dnode->dn_maxblkid = 0;
217 	dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;
218 
219 	vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
220 }
221 
222 /*
223  * Write some data to the fat ZAP leaf chunk starting at index "li".
224  *
225  * Note that individual integers in the value may be split among consecutive
226  * leaves.
227  */
228 static void
229 zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
230     const uint8_t *val)
231 {
232 	struct zap_leaf_array *la;
233 
234 	assert(sz <= ZAP_MAXVALUELEN);
235 
236 	for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
237 		n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);
238 
239 		la = &ZAP_LEAF_CHUNK(l, li).l_array;
240 		assert(la->la_type == ZAP_CHUNK_FREE);
241 		la->la_type = ZAP_CHUNK_ARRAY;
242 		memcpy(la->la_array, val, n);
243 		la->la_next = li + 1;
244 	}
245 	la->la_next = 0xffff;
246 }
247 
248 /*
249  * Find the shortest hash prefix length which lets us distribute keys without
250  * overflowing a leaf block.  This is not (space) optimal, but is simple, and
251  * directories large enough to overflow a single 128KB leaf block are uncommon.
252  */
253 static unsigned int
254 zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
255 {
256 	zfs_zap_entry_t *ent;
257 	unsigned int prefixlen;
258 
259 	if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) {
260 		/*
261 		 * All chunks will fit in a single leaf block.
262 		 */
263 		return (0);
264 	}
265 
266 	for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) {
267 		uint32_t *leafchunks;
268 
269 		leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks));
270 		STAILQ_FOREACH(ent, &zap->kvps, next) {
271 			uint64_t li;
272 			uint16_t chunks;
273 
274 			li = ZAP_HASH_IDX(ent->hash, prefixlen);
275 
276 			chunks = zap_entry_chunks(ent);
277 			if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) {
278 				/*
279 				 * Not enough space, grow the prefix and retry.
280 				 */
281 				break;
282 			}
283 			leafchunks[li] += chunks;
284 		}
285 		free(leafchunks);
286 
287 		if (ent == NULL) {
288 			/*
289 			 * Everything fits, we're done.
290 			 */
291 			break;
292 		}
293 	}
294 
295 	/*
296 	 * If this fails, then we need to expand the pointer table.  For now
297 	 * this situation is unhandled since it is hard to trigger.
298 	 */
299 	assert(prefixlen < (unsigned int)l->l_bs);
300 
301 	return (prefixlen);
302 }
303 
304 /*
305  * Initialize a fat ZAP leaf block.
306  */
307 static void
308 zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
309 {
310 	zap_leaf_phys_t *leaf;
311 
312 	leaf = l->l_phys;
313 
314 	leaf->l_hdr.lh_block_type = ZBT_LEAF;
315 	leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
316 	leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
317 	leaf->l_hdr.lh_prefix = prefix;
318 	leaf->l_hdr.lh_prefix_len = prefixlen;
319 
320 	/* Initialize the leaf hash table. */
321 	assert(leaf->l_hdr.lh_nfree < 0xffff);
322 	memset(leaf->l_hash, 0xff,
323 	    ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash));
324 
325 	/* Initialize the leaf chunks. */
326 	for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
327 		struct zap_leaf_free *lf;
328 
329 		lf = &ZAP_LEAF_CHUNK(l, i).l_free;
330 		lf->lf_type = ZAP_CHUNK_FREE;
331 		if (i + 1 == ZAP_LEAF_NUMCHUNKS(l))
332 			lf->lf_next = 0xffff;
333 		else
334 			lf->lf_next = i + 1;
335 	}
336 }
337 
338 static void
339 zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
340 {
341 	struct dnode_cursor *c;
342 	zap_leaf_t l;
343 	zap_phys_t *zaphdr;
344 	struct zap_table_phys *zt;
345 	zfs_zap_entry_t *ent;
346 	dnode_phys_t *dnode;
347 	uint8_t *leafblks;
348 	uint64_t lblkcnt, *ptrhasht;
349 	off_t loc, blksz;
350 	size_t blkshift;
351 	unsigned int prefixlen;
352 	int ptrcnt;
353 
354 	/*
355 	 * For simplicity, always use the largest block size.  This should be ok
356 	 * since most directories will be micro ZAPs, but it's space inefficient
357 	 * for small ZAPs and might need to be revisited.
358 	 */
359 	blkshift = MAXBLOCKSHIFT;
360 	blksz = (off_t)1 << blkshift;
361 
362 	/*
363 	 * Embedded pointer tables give up to 8192 entries.  This ought to be
364 	 * enough for anything except massive directories.
365 	 */
366 	ptrcnt = (blksz / 2) / sizeof(uint64_t);
367 
368 	memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
369 	zaphdr = (zap_phys_t *)&zfs->filebuf[0];
370 	zaphdr->zap_block_type = ZBT_HEADER;
371 	zaphdr->zap_magic = ZAP_MAGIC;
372 	zaphdr->zap_num_entries = zap->kvpcnt;
373 	zaphdr->zap_salt = zap->hashsalt;
374 
375 	l.l_bs = blkshift;
376 	l.l_phys = NULL;
377 
378 	zt = &zaphdr->zap_ptrtbl;
379 	zt->zt_blk = 0;
380 	zt->zt_numblks = 0;
381 	zt->zt_shift = flsll(ptrcnt) - 1;
382 	zt->zt_nextblk = 0;
383 	zt->zt_blks_copied = 0;
384 
385 	/*
386 	 * How many leaf blocks do we need?  Initialize them and update the
387 	 * header.
388 	 */
389 	prefixlen = zap_fat_write_prefixlen(zap, &l);
390 	lblkcnt = (uint64_t)1 << prefixlen;
391 	leafblks = ecalloc(lblkcnt, blksz);
392 	for (unsigned int li = 0; li < lblkcnt; li++) {
393 		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
394 		zap_fat_write_leaf_init(&l, li, prefixlen);
395 	}
396 	zaphdr->zap_num_leafs = lblkcnt;
397 	zaphdr->zap_freeblk = lblkcnt + 1;
398 
399 	/*
400 	 * For each entry, figure out which leaf block it belongs to based on
401 	 * the upper bits of its hash, allocate chunks from that leaf, and fill
402 	 * them out.
403 	 */
404 	ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
405 	STAILQ_FOREACH(ent, &zap->kvps, next) {
406 		struct zap_leaf_entry *le;
407 		uint16_t *lptr;
408 		uint64_t hi, li;
409 		uint16_t namelen, nchunks, nnamechunks, nvalchunks;
410 
411 		hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
412 		li = ZAP_HASH_IDX(ent->hash, prefixlen);
413 		assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
414 		ptrhasht[hi] = li + 1;
415 		l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
416 
417 		namelen = strlen(ent->name) + 1;
418 
419 		/*
420 		 * How many leaf chunks do we need for this entry?
421 		 */
422 		nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
423 		nvalchunks = howmany(ent->intcnt,
424 		    ZAP_LEAF_ARRAY_BYTES / ent->intsz);
425 		nchunks = 1 + nnamechunks + nvalchunks;
426 
427 		/*
428 		 * Allocate a run of free leaf chunks for this entry,
429 		 * potentially extending a hash chain.
430 		 */
431 		assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
432 		l.l_phys->l_hdr.lh_nfree -= nchunks;
433 		l.l_phys->l_hdr.lh_nentries++;
434 		lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
435 		while (*lptr != 0xffff) {
436 			assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
437 			le = ZAP_LEAF_ENTRY(&l, *lptr);
438 			assert(le->le_type == ZAP_CHUNK_ENTRY);
439 			le->le_cd++;
440 			lptr = &le->le_next;
441 		}
442 		*lptr = l.l_phys->l_hdr.lh_freelist;
443 		l.l_phys->l_hdr.lh_freelist += nchunks;
444 		assert(l.l_phys->l_hdr.lh_freelist <=
445 		    ZAP_LEAF_NUMCHUNKS(&l));
446 		if (l.l_phys->l_hdr.lh_freelist ==
447 		    ZAP_LEAF_NUMCHUNKS(&l))
448 			l.l_phys->l_hdr.lh_freelist = 0xffff;
449 
450 		/*
451 		 * Integer values must be stored in big-endian format.
452 		 */
453 		switch (ent->intsz) {
454 		case 1:
455 			break;
456 		case 2:
457 			for (uint16_t *v = ent->val16p;
458 			    v - ent->val16p < (ptrdiff_t)ent->intcnt;
459 			    v++)
460 				*v = htobe16(*v);
461 			break;
462 		case 4:
463 			for (uint32_t *v = ent->val32p;
464 			    v - ent->val32p < (ptrdiff_t)ent->intcnt;
465 			    v++)
466 				*v = htobe32(*v);
467 			break;
468 		case 8:
469 			for (uint64_t *v = ent->val64p;
470 			    v - ent->val64p < (ptrdiff_t)ent->intcnt;
471 			    v++)
472 				*v = htobe64(*v);
473 			break;
474 		default:
475 			assert(0);
476 		}
477 
478 		/*
479 		 * Finally, write out the leaf chunks for this entry.
480 		 */
481 		le = ZAP_LEAF_ENTRY(&l, *lptr);
482 		assert(le->le_type == ZAP_CHUNK_FREE);
483 		le->le_type = ZAP_CHUNK_ENTRY;
484 		le->le_next = 0xffff;
485 		le->le_name_chunk = *lptr + 1;
486 		le->le_name_numints = namelen;
487 		le->le_value_chunk = *lptr + 1 + nnamechunks;
488 		le->le_value_intlen = ent->intsz;
489 		le->le_value_numints = ent->intcnt;
490 		le->le_hash = ent->hash;
491 		zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name);
492 		zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
493 		    ent->intcnt * ent->intsz, ent->valp);
494 	}
495 
496 	/*
497 	 * Initialize unused slots of the pointer table.
498 	 */
499 	for (int i = 0; i < ptrcnt; i++)
500 		if (ptrhasht[i] == 0)
501 			ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;
502 
503 	/*
504 	 * Write the whole thing to disk.
505 	 */
506 	dnode = zap->dnode;
507 	dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
508 	dnode->dn_maxblkid = lblkcnt + 1;
509 
510 	c = dnode_cursor_init(zfs, zap->os, zap->dnode,
511 	    (lblkcnt + 1) * blksz, blksz);
512 
513 	loc = objset_space_alloc(zfs, zap->os, &blksz);
514 	vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
515 	    dnode_cursor_next(zfs, c, 0));
516 
517 	for (uint64_t i = 0; i < lblkcnt; i++) {
518 		loc = objset_space_alloc(zfs, zap->os, &blksz);
519 		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
520 		    blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
521 	}
522 
523 	dnode_cursor_finish(zfs, c);
524 
525 	free(leafblks);
526 }
527 
528 void
529 zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
530 {
531 	zfs_zap_entry_t *ent;
532 
533 	if (zap->micro) {
534 		zap_micro_write(zfs, zap);
535 	} else {
536 		assert(!STAILQ_EMPTY(&zap->kvps));
537 		assert(zap->kvpcnt > 0);
538 		zap_fat_write(zfs, zap);
539 	}
540 
541 	while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) {
542 		STAILQ_REMOVE_HEAD(&zap->kvps, next);
543 		if (ent->val64p != &ent->val64)
544 			free(ent->valp);
545 		free(ent->name);
546 		free(ent);
547 	}
548 	free(zap);
549 }
550