1 /*
2  * CDDL HEADER START
3  *
4  * This file and its contents are supplied under the terms of the
5  * Common Development and Distribution License ("CDDL"), version 1.0.
6  * You may only use this file in accordance with the terms of version
7  * 1.0 of the CDDL.
8  *
9  * A full copy of the text of the CDDL should have accompanied this
10  * source.  A copy of the CDDL is also available via the Internet at
11  * http://www.illumos.org/license/CDDL.
12  *
13  * CDDL HEADER END
14  */
15 
16 /*
17  * Copyright (c) 2020 by Delphix. All rights reserved.
18  */
19 
20 #include <assert.h>
21 #include <cityhash.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <libzfs.h>
26 #include <libzutil.h>
27 #include <stddef.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <umem.h>
32 #include <unistd.h>
33 #include <sys/debug.h>
34 #include <sys/stat.h>
35 #include <sys/zfs_ioctl.h>
36 #include <sys/zio_checksum.h>
37 #include "zfs_fletcher.h"
38 #include "zstream.h"
39 
40 
/*
 * Percentage of physical memory used by zfs_redup_stream() (on
 * non-_ILP32 builds) as the memory figure from which the number of
 * redup hash buckets is derived.
 */
#define	MAX_RDT_PHYSMEM_PERCENT		20
/*
 * Lower bound, in megabytes, for that memory figure.  On _ILP32 builds
 * this value is used directly instead of a physical-memory percentage.
 */
#define	SMALLEST_POSSIBLE_MAX_RDT_MB		128
43 
/*
 * One entry in the redup hash table.  An entry is inserted for every
 * DRR_WRITE record seen in the input stream and is keyed by
 * (guid, object, offset).
 */
typedef struct redup_entry {
	/* next entry in the same hash bucket (singly-linked chain) */
	struct redup_entry	*rde_next;
	/* key: drr_toguid of the WRITE record */
	uint64_t rde_guid;
	/* key: drr_object of the WRITE record */
	uint64_t rde_object;
	/* key: drr_offset of the WRITE record */
	uint64_t rde_offset;
	/* offset in the input stream at which the WRITE record starts */
	uint64_t rde_stream_offset;
} redup_entry_t;
51 
/*
 * Chained hash table mapping (guid, object, offset) to the stream
 * offset of the corresponding WRITE record.
 */
typedef struct redup_table {
	/* array of bucket heads, indexed by the low numhashbits hash bits */
	redup_entry_t	**redup_hash_array;
	/* umem cache from which redup_entry_t's are allocated */
	umem_cache_t	*ddecache;
	/* number of entries inserted so far */
	uint64_t	ddt_count;
	/* number of low-order hash bits used to select a bucket */
	int		numhashbits;
} redup_table_t;
58 
/*
 * Return the 1-based position of the most significant set bit of i,
 * or 0 when no bit is set (so highbit64(1) == 1 and
 * highbit64(UINT64_MAX) == 64).
 */
int
highbit64(uint64_t i)
{
	const int nbits = NBBY * sizeof (uint64_t);

	/* __builtin_clzll() is undefined for 0, so handle it separately. */
	return (i != 0 ? nbits - __builtin_clzll(i) : 0);
}
67 
/*
 * Allocate n bytes of zero-filled memory.  Never returns NULL: if the
 * allocation fails, an error is printed to stderr and the process
 * exits with status 1.
 */
void *
safe_calloc(size_t n)
{
	void *rv = calloc(1, n);
	if (rv == NULL) {
		/* %zu matches size_t; the size is reported untruncated. */
		(void) fprintf(stderr,
		    "Error: could not allocate %zu bytes of memory\n", n);
		exit(1);
	}
	return (rv);
}
80 
/*
 * fread() wrapper that reads a single item of `size' bytes from fp
 * into buf.  Returns the number of items read (1, or 0 when the item
 * could not be read in full).  If nothing was read and the stream's
 * error indicator is set, prints a message and exits with status 1;
 * end-of-file is left for the caller to detect.
 */
int
sfread(void *buf, size_t size, FILE *fp)
{
	int nitems = fread(buf, size, 1, fp);

	/* Success, or a plain EOF/short item: hand the count back. */
	if (nitems != 0 || !ferror(fp))
		return (nitems);

	(void) fprintf(stderr, "Error while reading file: %s\n",
	    strerror(errno));
	exit(1);
}
95 
/*
 * Safe version of pread(): reads exactly `count' bytes from fd at
 * `offset' into buf, or exits with status 1.
 *
 * pread() is allowed to transfer fewer bytes than requested and to
 * fail with EINTR, so the read is repeated until the whole range has
 * been transferred.  Reaching end-of-file before `count' bytes have
 * been read is reported as a short read.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	char *dst = buf;
	size_t done = 0;

	while (done < count) {
		ssize_t n = pread(fd, dst + done, count - done,
		    offset + (off_t)done);
		if (n == -1) {
			/* Interrupted before any data moved: just retry. */
			if (errno == EINTR)
				continue;
			(void) fprintf(stderr,
			    "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}
		if (n == 0) {
			/* EOF inside the requested range. */
			(void) fprintf(stderr,
			    "Error while reading file: short read\n");
			exit(1);
		}
		done += (size_t)n;
	}
}
114 
115 static int
116 dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
117     zio_cksum_t *zc, int outfd)
118 {
119 	assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
120 	    == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
121 	fletcher_4_incremental_native(drr,
122 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
123 	if (drr->drr_type != DRR_BEGIN) {
124 		assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
125 		    drr_checksum.drr_checksum));
126 		drr->drr_u.drr_checksum.drr_checksum = *zc;
127 	}
128 	fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
129 	    sizeof (zio_cksum_t), zc);
130 	if (write(outfd, drr, sizeof (*drr)) == -1)
131 		return (errno);
132 	if (payload_len != 0) {
133 		fletcher_4_incremental_native(payload, payload_len, zc);
134 		if (write(outfd, payload, payload_len) == -1)
135 			return (errno);
136 	}
137 	return (0);
138 }
139 
140 static void
141 rdt_insert(redup_table_t *rdt,
142     uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
143 {
144 	uint64_t ch = cityhash4(guid, object, offset, 0);
145 	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
146 	redup_entry_t **rdepp;
147 
148 	rdepp = &(rdt->redup_hash_array[hashcode]);
149 	redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
150 	rde->rde_next = *rdepp;
151 	rde->rde_guid = guid;
152 	rde->rde_object = object;
153 	rde->rde_offset = offset;
154 	rde->rde_stream_offset = stream_offset;
155 	*rdepp = rde;
156 	rdt->ddt_count++;
157 }
158 
159 static void
160 rdt_lookup(redup_table_t *rdt,
161     uint64_t guid, uint64_t object, uint64_t offset,
162     uint64_t *stream_offsetp)
163 {
164 	uint64_t ch = cityhash4(guid, object, offset, 0);
165 	uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
166 
167 	for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
168 	    rde != NULL; rde = rde->rde_next) {
169 		if (rde->rde_guid == guid &&
170 		    rde->rde_object == object &&
171 		    rde->rde_offset == offset) {
172 			*stream_offsetp = rde->rde_stream_offset;
173 			return;
174 		}
175 	}
176 	assert(!"could not find expected redup table entry");
177 }
178 
179 /*
180  * Convert a dedup stream (generated by "zfs send -D") to a
181  * non-deduplicated stream.  The entire infd will be converted, including
182  * any substreams in a stream package (generated by "zfs send -RD"). The
183  * infd must be seekable.
184  */
185 static void
186 zfs_redup_stream(int infd, int outfd, boolean_t verbose)
187 {
188 	int bufsz = SPA_MAXBLOCKSIZE;
189 	dmu_replay_record_t thedrr = { 0 };
190 	dmu_replay_record_t *drr = &thedrr;
191 	redup_table_t rdt;
192 	zio_cksum_t stream_cksum;
193 	uint64_t numbuckets;
194 	uint64_t num_records = 0;
195 	uint64_t num_write_byref_records = 0;
196 
197 #ifdef _ILP32
198 	uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
199 #else
200 	uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
201 	uint64_t max_rde_size =
202 	    MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
203 	    SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
204 #endif
205 
206 	numbuckets = max_rde_size / (sizeof (redup_entry_t));
207 
208 	/*
209 	 * numbuckets must be a power of 2.  Increase number to
210 	 * a power of 2 if necessary.
211 	 */
212 	if (!ISP2(numbuckets))
213 		numbuckets = 1ULL << highbit64(numbuckets);
214 
215 	rdt.redup_hash_array =
216 	    safe_calloc(numbuckets * sizeof (redup_entry_t *));
217 	rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
218 	    NULL, NULL, NULL, NULL, NULL, 0);
219 	rdt.numhashbits = highbit64(numbuckets) - 1;
220 	rdt.ddt_count = 0;
221 
222 	char *buf = safe_calloc(bufsz);
223 	FILE *ofp = fdopen(infd, "r");
224 	long offset = ftell(ofp);
225 	int begin = 0;
226 	boolean_t seen = B_FALSE;
227 	while (sfread(drr, sizeof (*drr), ofp) != 0) {
228 		num_records++;
229 
230 		/*
231 		 * We need to regenerate the checksum.
232 		 */
233 		if (drr->drr_type != DRR_BEGIN) {
234 			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
235 			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
236 		}
237 
238 		uint64_t payload_size = 0;
239 		switch (drr->drr_type) {
240 		case DRR_BEGIN:
241 		{
242 			struct drr_begin *drrb = &drr->drr_u.drr_begin;
243 			int fflags;
244 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
245 			VERIFY0(begin++);
246 			seen = B_TRUE;
247 
248 			assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
249 
250 			/* clear the DEDUP feature flag for this stream */
251 			fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
252 			fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
253 			    DMU_BACKUP_FEATURE_DEDUPPROPS);
254 			/* cppcheck-suppress syntaxError */
255 			DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
256 
257 			uint32_t sz = drr->drr_payloadlen;
258 
259 			VERIFY3U(sz, <=, 1U << 28);
260 
261 			if (sz != 0) {
262 				if (sz > bufsz) {
263 					free(buf);
264 					buf = safe_calloc(sz);
265 					bufsz = sz;
266 				}
267 				(void) sfread(buf, sz, ofp);
268 			}
269 			payload_size = sz;
270 			break;
271 		}
272 
273 		case DRR_END:
274 		{
275 			struct drr_end *drre = &drr->drr_u.drr_end;
276 			/*
277 			 * We would prefer to just check --begin == 0, but
278 			 * replication streams have an end of stream END
279 			 * record, so we must avoid tripping it.
280 			 */
281 			VERIFY3B(seen, ==, B_TRUE);
282 			begin--;
283 			/*
284 			 * Use the recalculated checksum, unless this is
285 			 * the END record of a stream package, which has
286 			 * no checksum.
287 			 */
288 			if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
289 				drre->drr_checksum = stream_cksum;
290 			break;
291 		}
292 
293 		case DRR_OBJECT:
294 		{
295 			struct drr_object *drro = &drr->drr_u.drr_object;
296 			VERIFY3S(begin, ==, 1);
297 
298 			if (drro->drr_bonuslen > 0) {
299 				payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
300 				(void) sfread(buf, payload_size, ofp);
301 			}
302 			break;
303 		}
304 
305 		case DRR_SPILL:
306 		{
307 			struct drr_spill *drrs = &drr->drr_u.drr_spill;
308 			VERIFY3S(begin, ==, 1);
309 			payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
310 			(void) sfread(buf, payload_size, ofp);
311 			break;
312 		}
313 
314 		case DRR_WRITE_BYREF:
315 		{
316 			struct drr_write_byref drrwb =
317 			    drr->drr_u.drr_write_byref;
318 			VERIFY3S(begin, ==, 1);
319 
320 			num_write_byref_records++;
321 
322 			/*
323 			 * Look up in hash table by drrwb->drr_refguid,
324 			 * drr_refobject, drr_refoffset.  Replace this
325 			 * record with the found WRITE record, but with
326 			 * drr_object,drr_offset,drr_toguid replaced with ours.
327 			 */
328 			uint64_t stream_offset = 0;
329 			rdt_lookup(&rdt, drrwb.drr_refguid,
330 			    drrwb.drr_refobject, drrwb.drr_refoffset,
331 			    &stream_offset);
332 
333 			spread(infd, drr, sizeof (*drr), stream_offset);
334 
335 			assert(drr->drr_type == DRR_WRITE);
336 			struct drr_write *drrw = &drr->drr_u.drr_write;
337 			assert(drrw->drr_toguid == drrwb.drr_refguid);
338 			assert(drrw->drr_object == drrwb.drr_refobject);
339 			assert(drrw->drr_offset == drrwb.drr_refoffset);
340 
341 			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
342 			spread(infd, buf, payload_size,
343 			    stream_offset + sizeof (*drr));
344 
345 			drrw->drr_toguid = drrwb.drr_toguid;
346 			drrw->drr_object = drrwb.drr_object;
347 			drrw->drr_offset = drrwb.drr_offset;
348 			break;
349 		}
350 
351 		case DRR_WRITE:
352 		{
353 			struct drr_write *drrw = &drr->drr_u.drr_write;
354 			VERIFY3S(begin, ==, 1);
355 			payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
356 			(void) sfread(buf, payload_size, ofp);
357 
358 			rdt_insert(&rdt, drrw->drr_toguid,
359 			    drrw->drr_object, drrw->drr_offset, offset);
360 			break;
361 		}
362 
363 		case DRR_WRITE_EMBEDDED:
364 		{
365 			struct drr_write_embedded *drrwe =
366 			    &drr->drr_u.drr_write_embedded;
367 			VERIFY3S(begin, ==, 1);
368 			payload_size =
369 			    P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
370 			(void) sfread(buf, payload_size, ofp);
371 			break;
372 		}
373 
374 		case DRR_FREEOBJECTS:
375 		case DRR_FREE:
376 		case DRR_OBJECT_RANGE:
377 			VERIFY3S(begin, ==, 1);
378 			break;
379 
380 		default:
381 			(void) fprintf(stderr, "INVALID record type 0x%x\n",
382 			    drr->drr_type);
383 			/* should never happen, so assert */
384 			assert(B_FALSE);
385 		}
386 
387 		if (feof(ofp)) {
388 			fprintf(stderr, "Error: unexpected end-of-file\n");
389 			exit(1);
390 		}
391 		if (ferror(ofp)) {
392 			fprintf(stderr, "Error while reading file: %s\n",
393 			    strerror(errno));
394 			exit(1);
395 		}
396 
397 		/*
398 		 * We need to recalculate the checksum, and it needs to be
399 		 * initially zero to do that.  BEGIN records don't have
400 		 * a checksum.
401 		 */
402 		if (drr->drr_type != DRR_BEGIN) {
403 			memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
404 			    sizeof (drr->drr_u.drr_checksum.drr_checksum));
405 		}
406 		if (dump_record(drr, buf, payload_size,
407 		    &stream_cksum, outfd) != 0)
408 			break;
409 		if (drr->drr_type == DRR_END) {
410 			/*
411 			 * Typically the END record is either the last
412 			 * thing in the stream, or it is followed
413 			 * by a BEGIN record (which also zeros the checksum).
414 			 * However, a stream package ends with two END
415 			 * records.  The last END record's checksum starts
416 			 * from zero.
417 			 */
418 			ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
419 		}
420 		offset = ftell(ofp);
421 	}
422 
423 	if (verbose) {
424 		char mem_str[16];
425 		zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
426 		    mem_str, sizeof (mem_str));
427 		fprintf(stderr, "converted stream with %llu total records, "
428 		    "including %llu dedup records, using %sB memory.\n",
429 		    (long long)num_records,
430 		    (long long)num_write_byref_records,
431 		    mem_str);
432 	}
433 
434 	umem_cache_destroy(rdt.ddecache);
435 	free(rdt.redup_hash_array);
436 	free(buf);
437 	(void) fclose(ofp);
438 }
439 
440 int
441 zstream_do_redup(int argc, char *argv[])
442 {
443 	boolean_t verbose = B_FALSE;
444 	int c;
445 
446 	while ((c = getopt(argc, argv, "v")) != -1) {
447 		switch (c) {
448 		case 'v':
449 			verbose = B_TRUE;
450 			break;
451 		case '?':
452 			(void) fprintf(stderr, "invalid option '%c'\n",
453 			    optopt);
454 			zstream_usage();
455 			break;
456 		}
457 	}
458 
459 	argc -= optind;
460 	argv += optind;
461 
462 	if (argc != 1)
463 		zstream_usage();
464 
465 	const char *filename = argv[0];
466 
467 	if (isatty(STDOUT_FILENO)) {
468 		(void) fprintf(stderr,
469 		    "Error: Stream can not be written to a terminal.\n"
470 		    "You must redirect standard output.\n");
471 		return (1);
472 	}
473 
474 	int fd = open(filename, O_RDONLY);
475 	if (fd == -1) {
476 		(void) fprintf(stderr,
477 		    "Error while opening file '%s': %s\n",
478 		    filename, strerror(errno));
479 		exit(1);
480 	}
481 
482 	fletcher_4_init();
483 	zfs_redup_stream(fd, STDOUT_FILENO, verbose);
484 	fletcher_4_fini();
485 
486 	close(fd);
487 
488 	return (0);
489 }
490