1 /*
2 * CDDL HEADER START
3 *
4 * This file and its contents are supplied under the terms of the
5 * Common Development and Distribution License ("CDDL"), version 1.0.
6 * You may only use this file in accordance with the terms of version
7 * 1.0 of the CDDL.
8 *
9 * A full copy of the text of the CDDL should have accompanied this
10 * source. A copy of the CDDL is also available via the Internet at
11 * http://www.illumos.org/license/CDDL.
12 *
13 * CDDL HEADER END
14 */
15
16 /*
17 * Copyright (c) 2020 by Delphix. All rights reserved.
18 */
19
20 #include <assert.h>
21 #include <cityhash.h>
22 #include <ctype.h>
23 #include <errno.h>
24 #include <fcntl.h>
25 #include <libzfs.h>
26 #include <libzutil.h>
27 #include <stddef.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <string.h>
31 #include <umem.h>
32 #include <unistd.h>
33 #include <sys/debug.h>
34 #include <sys/stat.h>
35 #include <sys/zfs_ioctl.h>
36 #include <sys/zio_checksum.h>
37 #include "zfs_fletcher.h"
38 #include "zstream.h"
39
40
/*
 * Percentage of physical memory used as the memory budget from which
 * zfs_redup_stream() derives the number of hash buckets (not used on
 * _ILP32 builds).
 */
#define MAX_RDT_PHYSMEM_PERCENT 20
/*
 * Lower bound, in megabytes, on that memory budget.  On _ILP32 builds this
 * value is the budget.
 */
#define SMALLEST_POSSIBLE_MAX_RDT_MB 128
43
/*
 * One entry of the redup hash table.  An entry records where in the input
 * stream the WRITE record for a given (guid, object, offset) key begins.
 */
typedef struct redup_entry redup_entry_t;

struct redup_entry {
	/* next entry in the same hash bucket, or NULL at the end of the chain */
	redup_entry_t *rde_next;
	/* lookup key: guid, object number and offset of the WRITE record */
	uint64_t rde_guid;
	uint64_t rde_object;
	uint64_t rde_offset;
	/* offset of the WRITE record header within the input stream */
	uint64_t rde_stream_offset;
};
51
52 typedef struct redup_table {
53 redup_entry_t **redup_hash_array;
54 umem_cache_t *ddecache;
55 uint64_t ddt_count;
56 int numhashbits;
57 } redup_table_t;
58
/*
 * Return the 1-based index of the most significant set bit in i, or 0 if
 * no bit is set.  For example highbit64(1) is 1 and highbit64(1ULL << 63)
 * is 64.
 */
int
highbit64(uint64_t i)
{
	/* __builtin_clzll() is undefined for 0, so that case is handled first */
	return (i != 0 ? NBBY * sizeof (uint64_t) - __builtin_clzll(i) : 0);
}
67
/*
 * Allocate n bytes of zero-filled memory.
 *
 * Never returns NULL: if the allocation fails, an error message is printed
 * to stderr and the process exits with status 1.  The caller releases the
 * memory with free().
 */
void *
safe_calloc(size_t n)
{
	void *rv = calloc(1, n);
	if (rv == NULL) {
		/* %zu matches size_t; the request size is reported in full */
		(void) fprintf(stderr,
		    "Error: could not allocate %zu bytes of memory\n", n);
		exit(1);
	}
	return (rv);
}
80
/*
 * Safe version of fread(): read one item of `size` bytes from fp into buf.
 *
 * Returns the fread() item count (1 when the whole item was read, 0
 * otherwise, e.g. at end-of-file).  If nothing was read and the stream's
 * error indicator is set, prints a message to stderr and exits with
 * status 1.
 */
int
sfread(void *buf, size_t size, FILE *fp)
{
	int nitems = fread(buf, size, 1, fp);

	/* Success, or a plain end-of-file: let the caller decide. */
	if (nitems != 0 || !ferror(fp))
		return (nitems);

	(void) fprintf(stderr, "Error while reading file: %s\n",
	    strerror(errno));
	exit(1);
}
95
/*
 * Safe version of pread(): read exactly `count` bytes from fd, starting at
 * file offset `offset`, into buf.
 *
 * pread() may legitimately return fewer bytes than requested or be
 * interrupted by a signal, so the read is repeated until the whole range
 * has been transferred.  On a read error, or if end-of-file is reached
 * before `count` bytes were read, prints a message to stderr and exits
 * with status 1.
 */
static void
spread(int fd, void *buf, size_t count, off_t offset)
{
	char *dst = buf;
	size_t done = 0;

	while (done < count) {
		ssize_t nread = pread(fd, dst + done, count - done,
		    offset + (off_t)done);

		if (nread == -1) {
			/* An interrupted call transferred nothing; retry. */
			if (errno == EINTR)
				continue;
			(void) fprintf(stderr,
			    "Error while reading file: %s\n",
			    strerror(errno));
			exit(1);
		}
		if (nread == 0) {
			/* End-of-file before the requested range was read. */
			(void) fprintf(stderr,
			    "Error while reading file: short read\n");
			exit(1);
		}
		done += (size_t)nread;
	}
}
114
115 static int
dump_record(dmu_replay_record_t * drr,void * payload,int payload_len,zio_cksum_t * zc,int outfd)116 dump_record(dmu_replay_record_t *drr, void *payload, int payload_len,
117 zio_cksum_t *zc, int outfd)
118 {
119 assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum)
120 == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
121 fletcher_4_incremental_native(drr,
122 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc);
123 if (drr->drr_type != DRR_BEGIN) {
124 assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.
125 drr_checksum.drr_checksum));
126 drr->drr_u.drr_checksum.drr_checksum = *zc;
127 }
128 fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum,
129 sizeof (zio_cksum_t), zc);
130 if (write(outfd, drr, sizeof (*drr)) == -1)
131 return (errno);
132 if (payload_len != 0) {
133 fletcher_4_incremental_native(payload, payload_len, zc);
134 if (write(outfd, payload, payload_len) == -1)
135 return (errno);
136 }
137 return (0);
138 }
139
140 static void
rdt_insert(redup_table_t * rdt,uint64_t guid,uint64_t object,uint64_t offset,uint64_t stream_offset)141 rdt_insert(redup_table_t *rdt,
142 uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset)
143 {
144 uint64_t ch = cityhash4(guid, object, offset, 0);
145 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
146 redup_entry_t **rdepp;
147
148 rdepp = &(rdt->redup_hash_array[hashcode]);
149 redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL);
150 rde->rde_next = *rdepp;
151 rde->rde_guid = guid;
152 rde->rde_object = object;
153 rde->rde_offset = offset;
154 rde->rde_stream_offset = stream_offset;
155 *rdepp = rde;
156 rdt->ddt_count++;
157 }
158
159 static void
rdt_lookup(redup_table_t * rdt,uint64_t guid,uint64_t object,uint64_t offset,uint64_t * stream_offsetp)160 rdt_lookup(redup_table_t *rdt,
161 uint64_t guid, uint64_t object, uint64_t offset,
162 uint64_t *stream_offsetp)
163 {
164 uint64_t ch = cityhash4(guid, object, offset, 0);
165 uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits);
166
167 for (redup_entry_t *rde = rdt->redup_hash_array[hashcode];
168 rde != NULL; rde = rde->rde_next) {
169 if (rde->rde_guid == guid &&
170 rde->rde_object == object &&
171 rde->rde_offset == offset) {
172 *stream_offsetp = rde->rde_stream_offset;
173 return;
174 }
175 }
176 assert(!"could not find expected redup table entry");
177 }
178
179 /*
180 * Convert a dedup stream (generated by "zfs send -D") to a
181 * non-deduplicated stream. The entire infd will be converted, including
182 * any substreams in a stream package (generated by "zfs send -RD"). The
183 * infd must be seekable.
184 */
185 static void
zfs_redup_stream(int infd,int outfd,boolean_t verbose)186 zfs_redup_stream(int infd, int outfd, boolean_t verbose)
187 {
188 int bufsz = SPA_MAXBLOCKSIZE;
189 dmu_replay_record_t thedrr;
190 dmu_replay_record_t *drr = &thedrr;
191 redup_table_t rdt;
192 zio_cksum_t stream_cksum;
193 uint64_t numbuckets;
194 uint64_t num_records = 0;
195 uint64_t num_write_byref_records = 0;
196
197 memset(&thedrr, 0, sizeof (dmu_replay_record_t));
198
199 #ifdef _ILP32
200 uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20;
201 #else
202 uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE);
203 uint64_t max_rde_size =
204 MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100,
205 SMALLEST_POSSIBLE_MAX_RDT_MB << 20);
206 #endif
207
208 numbuckets = max_rde_size / (sizeof (redup_entry_t));
209
210 /*
211 * numbuckets must be a power of 2. Increase number to
212 * a power of 2 if necessary.
213 */
214 if (!ISP2(numbuckets))
215 numbuckets = 1ULL << highbit64(numbuckets);
216
217 rdt.redup_hash_array =
218 safe_calloc(numbuckets * sizeof (redup_entry_t *));
219 rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0,
220 NULL, NULL, NULL, NULL, NULL, 0);
221 rdt.numhashbits = highbit64(numbuckets) - 1;
222 rdt.ddt_count = 0;
223
224 char *buf = safe_calloc(bufsz);
225 FILE *ofp = fdopen(infd, "r");
226 long offset = ftell(ofp);
227 int begin = 0;
228 boolean_t seen = B_FALSE;
229 while (sfread(drr, sizeof (*drr), ofp) != 0) {
230 num_records++;
231
232 /*
233 * We need to regenerate the checksum.
234 */
235 if (drr->drr_type != DRR_BEGIN) {
236 memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
237 sizeof (drr->drr_u.drr_checksum.drr_checksum));
238 }
239
240 uint64_t payload_size = 0;
241 switch (drr->drr_type) {
242 case DRR_BEGIN:
243 {
244 struct drr_begin *drrb = &drr->drr_u.drr_begin;
245 int fflags;
246 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
247 VERIFY0(begin++);
248 seen = B_TRUE;
249
250 assert(drrb->drr_magic == DMU_BACKUP_MAGIC);
251
252 /* clear the DEDUP feature flag for this stream */
253 fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
254 fflags &= ~(DMU_BACKUP_FEATURE_DEDUP |
255 DMU_BACKUP_FEATURE_DEDUPPROPS);
256 /* cppcheck-suppress syntaxError */
257 DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
258
259 uint32_t sz = drr->drr_payloadlen;
260
261 VERIFY3U(sz, <=, 1U << 28);
262
263 if (sz != 0) {
264 if (sz > bufsz) {
265 free(buf);
266 buf = safe_calloc(sz);
267 bufsz = sz;
268 }
269 (void) sfread(buf, sz, ofp);
270 }
271 payload_size = sz;
272 break;
273 }
274
275 case DRR_END:
276 {
277 struct drr_end *drre = &drr->drr_u.drr_end;
278 /*
279 * We would prefer to just check --begin == 0, but
280 * replication streams have an end of stream END
281 * record, so we must avoid tripping it.
282 */
283 VERIFY3B(seen, ==, B_TRUE);
284 begin--;
285 /*
286 * Use the recalculated checksum, unless this is
287 * the END record of a stream package, which has
288 * no checksum.
289 */
290 if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum))
291 drre->drr_checksum = stream_cksum;
292 break;
293 }
294
295 case DRR_OBJECT:
296 {
297 struct drr_object *drro = &drr->drr_u.drr_object;
298 VERIFY3S(begin, ==, 1);
299
300 if (drro->drr_bonuslen > 0) {
301 payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro);
302 (void) sfread(buf, payload_size, ofp);
303 }
304 break;
305 }
306
307 case DRR_SPILL:
308 {
309 struct drr_spill *drrs = &drr->drr_u.drr_spill;
310 VERIFY3S(begin, ==, 1);
311 payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs);
312 (void) sfread(buf, payload_size, ofp);
313 break;
314 }
315
316 case DRR_WRITE_BYREF:
317 {
318 struct drr_write_byref drrwb =
319 drr->drr_u.drr_write_byref;
320 VERIFY3S(begin, ==, 1);
321
322 num_write_byref_records++;
323
324 /*
325 * Look up in hash table by drrwb->drr_refguid,
326 * drr_refobject, drr_refoffset. Replace this
327 * record with the found WRITE record, but with
328 * drr_object,drr_offset,drr_toguid replaced with ours.
329 */
330 uint64_t stream_offset = 0;
331 rdt_lookup(&rdt, drrwb.drr_refguid,
332 drrwb.drr_refobject, drrwb.drr_refoffset,
333 &stream_offset);
334
335 spread(infd, drr, sizeof (*drr), stream_offset);
336
337 assert(drr->drr_type == DRR_WRITE);
338 struct drr_write *drrw = &drr->drr_u.drr_write;
339 assert(drrw->drr_toguid == drrwb.drr_refguid);
340 assert(drrw->drr_object == drrwb.drr_refobject);
341 assert(drrw->drr_offset == drrwb.drr_refoffset);
342
343 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
344 spread(infd, buf, payload_size,
345 stream_offset + sizeof (*drr));
346
347 drrw->drr_toguid = drrwb.drr_toguid;
348 drrw->drr_object = drrwb.drr_object;
349 drrw->drr_offset = drrwb.drr_offset;
350 break;
351 }
352
353 case DRR_WRITE:
354 {
355 struct drr_write *drrw = &drr->drr_u.drr_write;
356 VERIFY3S(begin, ==, 1);
357 payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw);
358 (void) sfread(buf, payload_size, ofp);
359
360 rdt_insert(&rdt, drrw->drr_toguid,
361 drrw->drr_object, drrw->drr_offset, offset);
362 break;
363 }
364
365 case DRR_WRITE_EMBEDDED:
366 {
367 struct drr_write_embedded *drrwe =
368 &drr->drr_u.drr_write_embedded;
369 VERIFY3S(begin, ==, 1);
370 payload_size =
371 P2ROUNDUP((uint64_t)drrwe->drr_psize, 8);
372 (void) sfread(buf, payload_size, ofp);
373 break;
374 }
375
376 case DRR_FREEOBJECTS:
377 case DRR_FREE:
378 case DRR_OBJECT_RANGE:
379 VERIFY3S(begin, ==, 1);
380 break;
381
382 default:
383 (void) fprintf(stderr, "INVALID record type 0x%x\n",
384 drr->drr_type);
385 /* should never happen, so assert */
386 assert(B_FALSE);
387 }
388
389 if (feof(ofp)) {
390 fprintf(stderr, "Error: unexpected end-of-file\n");
391 exit(1);
392 }
393 if (ferror(ofp)) {
394 fprintf(stderr, "Error while reading file: %s\n",
395 strerror(errno));
396 exit(1);
397 }
398
399 /*
400 * We need to recalculate the checksum, and it needs to be
401 * initially zero to do that. BEGIN records don't have
402 * a checksum.
403 */
404 if (drr->drr_type != DRR_BEGIN) {
405 memset(&drr->drr_u.drr_checksum.drr_checksum, 0,
406 sizeof (drr->drr_u.drr_checksum.drr_checksum));
407 }
408 if (dump_record(drr, buf, payload_size,
409 &stream_cksum, outfd) != 0)
410 break;
411 if (drr->drr_type == DRR_END) {
412 /*
413 * Typically the END record is either the last
414 * thing in the stream, or it is followed
415 * by a BEGIN record (which also zeros the checksum).
416 * However, a stream package ends with two END
417 * records. The last END record's checksum starts
418 * from zero.
419 */
420 ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0);
421 }
422 offset = ftell(ofp);
423 }
424
425 if (verbose) {
426 char mem_str[16];
427 zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t),
428 mem_str, sizeof (mem_str));
429 fprintf(stderr, "converted stream with %llu total records, "
430 "including %llu dedup records, using %sB memory.\n",
431 (long long)num_records,
432 (long long)num_write_byref_records,
433 mem_str);
434 }
435
436 umem_cache_destroy(rdt.ddecache);
437 free(rdt.redup_hash_array);
438 free(buf);
439 (void) fclose(ofp);
440 }
441
442 int
zstream_do_redup(int argc,char * argv[])443 zstream_do_redup(int argc, char *argv[])
444 {
445 boolean_t verbose = B_FALSE;
446 int c;
447
448 while ((c = getopt(argc, argv, "v")) != -1) {
449 switch (c) {
450 case 'v':
451 verbose = B_TRUE;
452 break;
453 case '?':
454 (void) fprintf(stderr, "invalid option '%c'\n",
455 optopt);
456 zstream_usage();
457 break;
458 }
459 }
460
461 argc -= optind;
462 argv += optind;
463
464 if (argc != 1)
465 zstream_usage();
466
467 const char *filename = argv[0];
468
469 if (isatty(STDOUT_FILENO)) {
470 (void) fprintf(stderr,
471 "Error: Stream can not be written to a terminal.\n"
472 "You must redirect standard output.\n");
473 return (1);
474 }
475
476 int fd = open(filename, O_RDONLY);
477 if (fd == -1) {
478 (void) fprintf(stderr,
479 "Error while opening file '%s': %s\n",
480 filename, strerror(errno));
481 exit(1);
482 }
483
484 fletcher_4_init();
485 zfs_redup_stream(fd, STDOUT_FILENO, verbose);
486 fletcher_4_fini();
487
488 close(fd);
489
490 return (0);
491 }
492