1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 24 * Copyright (c) 2013, Joyent, Inc. All rights reserved. 25 * Copyright 2017 RackTop Systems. 26 */ 27 28 #include <assert.h> 29 #include <fcntl.h> 30 #include <poll.h> 31 #include <stdio.h> 32 #include <stdlib.h> 33 #include <string.h> 34 #include <zlib.h> 35 #include <libgen.h> 36 #include <sys/spa.h> 37 #include <sys/stat.h> 38 #include <sys/processor.h> 39 #include <sys/zfs_context.h> 40 #include <sys/rrwlock.h> 41 #include <sys/zmod.h> 42 #include <sys/utsname.h> 43 #include <sys/systeminfo.h> 44 #include <libzfs.h> 45 46 extern void system_taskq_init(void); 47 extern void system_taskq_fini(void); 48 49 /* 50 * Emulation of kernel services in userland. 51 */ 52 53 pgcnt_t physmem; 54 vnode_t *rootdir = (vnode_t *)0xabcd1234; 55 char hw_serial[HW_HOSTID_LEN]; 56 kmutex_t cpu_lock; 57 vmem_t *zio_arena = NULL; 58 59 /* If set, all blocks read will be copied to the specified directory. */ 60 char *vn_dumpdir = NULL; 61 62 struct utsname utsname = { 63 "userland", "libzpool", "1", "1", "na" 64 }; 65 66 /* 67 * ========================================================================= 68 * vnode operations 69 * ========================================================================= 70 */ 71 /* 72 * Note: for the xxxat() versions of these functions, we assume that the 73 * starting vp is always rootdir (which is true for spa_directory.c, the only 74 * ZFS consumer of these interfaces). We assert this is true, and then emulate 75 * them by adding '/' in front of the path. 76 */ 77 78 /*ARGSUSED*/ 79 int 80 vn_open(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, int x3) 81 { 82 int fd; 83 int dump_fd; 84 vnode_t *vp; 85 int old_umask; 86 char realpath[MAXPATHLEN]; 87 struct stat64 st; 88 89 /* 90 * If we're accessing a real disk from userland, we need to use 91 * the character interface to avoid caching. This is particularly 92 * important if we're trying to look at a real in-kernel storage 93 * pool from userland, e.g. via zdb, because otherwise we won't 94 * see the changes occurring under the segmap cache. 95 * On the other hand, the stupid character device returns zero 96 * for its size. So -- gag -- we open the block device to get 97 * its size, and remember it for subsequent VOP_GETATTR(). 98 */ 99 if (strncmp(path, "/dev/", 5) == 0) { 100 char *dsk; 101 fd = open64(path, O_RDONLY); 102 if (fd == -1) 103 return (errno); 104 if (fstat64(fd, &st) == -1) { 105 close(fd); 106 return (errno); 107 } 108 close(fd); 109 (void) sprintf(realpath, "%s", path); 110 dsk = strstr(path, "/dsk/"); 111 if (dsk != NULL) 112 (void) sprintf(realpath + (dsk - path) + 1, "r%s", 113 dsk + 1); 114 } else { 115 (void) sprintf(realpath, "%s", path); 116 if (!(flags & FCREAT) && stat64(realpath, &st) == -1) 117 return (errno); 118 } 119 120 if (flags & FCREAT) 121 old_umask = umask(0); 122 123 /* 124 * The construct 'flags - FREAD' conveniently maps combinations of 125 * FREAD and FWRITE to the corresponding O_RDONLY, O_WRONLY, and O_RDWR. 126 */ 127 fd = open64(realpath, flags - FREAD, mode); 128 129 if (flags & FCREAT) 130 (void) umask(old_umask); 131 132 if (vn_dumpdir != NULL) { 133 char dumppath[MAXPATHLEN]; 134 (void) snprintf(dumppath, sizeof (dumppath), 135 "%s/%s", vn_dumpdir, basename(realpath)); 136 dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); 137 if (dump_fd == -1) 138 return (errno); 139 } else { 140 dump_fd = -1; 141 } 142 143 if (fd == -1) 144 return (errno); 145 146 if (fstat64(fd, &st) == -1) { 147 close(fd); 148 return (errno); 149 } 150 151 (void) fcntl(fd, F_SETFD, FD_CLOEXEC); 152 153 *vpp = vp = umem_zalloc(sizeof (vnode_t), UMEM_NOFAIL); 154 155 vp->v_fd = fd; 156 vp->v_size = st.st_size; 157 vp->v_path = spa_strdup(path); 158 vp->v_dump_fd = dump_fd; 159 160 return (0); 161 } 162 163 /*ARGSUSED*/ 164 int 165 vn_openat(char *path, int x1, int flags, int mode, vnode_t **vpp, int x2, 166 int x3, vnode_t *startvp, int fd) 167 { 168 char *realpath = umem_alloc(strlen(path) + 2, UMEM_NOFAIL); 169 int ret; 170 171 ASSERT(startvp == rootdir); 172 (void) sprintf(realpath, "/%s", path); 173 174 /* fd ignored for now, need if want to simulate nbmand support */ 175 ret = vn_open(realpath, x1, flags, mode, vpp, x2, x3); 176 177 umem_free(realpath, strlen(path) + 2); 178 179 return (ret); 180 } 181 182 /*ARGSUSED*/ 183 int 184 vn_rdwr(int uio, vnode_t *vp, void *addr, ssize_t len, offset_t offset, 185 int x1, int x2, rlim64_t x3, void *x4, ssize_t *residp) 186 { 187 ssize_t iolen, split; 188 189 if (uio == UIO_READ) { 190 iolen = pread64(vp->v_fd, addr, len, offset); 191 if (vp->v_dump_fd != -1) { 192 int status = 193 pwrite64(vp->v_dump_fd, addr, iolen, offset); 194 ASSERT(status != -1); 195 } 196 } else { 197 /* 198 * To simulate partial disk writes, we split writes into two 199 * system calls so that the process can be killed in between. 200 */ 201 int sectors = len >> SPA_MINBLOCKSHIFT; 202 split = (sectors > 0 ? rand() % sectors : 0) << 203 SPA_MINBLOCKSHIFT; 204 iolen = pwrite64(vp->v_fd, addr, split, offset); 205 iolen += pwrite64(vp->v_fd, (char *)addr + split, 206 len - split, offset + split); 207 } 208 209 if (iolen == -1) 210 return (errno); 211 if (residp) 212 *residp = len - iolen; 213 else if (iolen != len) 214 return (EIO); 215 return (0); 216 } 217 218 void 219 vn_close(vnode_t *vp) 220 { 221 close(vp->v_fd); 222 if (vp->v_dump_fd != -1) 223 close(vp->v_dump_fd); 224 spa_strfree(vp->v_path); 225 umem_free(vp, sizeof (vnode_t)); 226 } 227 228 /* 229 * At a minimum we need to update the size since vdev_reopen() 230 * will no longer call vn_openat(). 231 */ 232 int 233 fop_getattr(vnode_t *vp, vattr_t *vap) 234 { 235 struct stat64 st; 236 237 if (fstat64(vp->v_fd, &st) == -1) { 238 close(vp->v_fd); 239 return (errno); 240 } 241 242 vap->va_size = st.st_size; 243 return (0); 244 } 245 246 #ifdef ZFS_DEBUG 247 248 /* 249 * ========================================================================= 250 * Figure out which debugging statements to print 251 * ========================================================================= 252 */ 253 254 static char *dprintf_string; 255 static int dprintf_print_all; 256 257 int 258 dprintf_find_string(const char *string) 259 { 260 char *tmp_str = dprintf_string; 261 int len = strlen(string); 262 263 /* 264 * Find out if this is a string we want to print. 265 * String format: file1.c,function_name1,file2.c,file3.c 266 */ 267 268 while (tmp_str != NULL) { 269 if (strncmp(tmp_str, string, len) == 0 && 270 (tmp_str[len] == ',' || tmp_str[len] == '\0')) 271 return (1); 272 tmp_str = strchr(tmp_str, ','); 273 if (tmp_str != NULL) 274 tmp_str++; /* Get rid of , */ 275 } 276 return (0); 277 } 278 279 void 280 dprintf_setup(int *argc, char **argv) 281 { 282 int i, j; 283 284 /* 285 * Debugging can be specified two ways: by setting the 286 * environment variable ZFS_DEBUG, or by including a 287 * "debug=..." argument on the command line. The command 288 * line setting overrides the environment variable. 289 */ 290 291 for (i = 1; i < *argc; i++) { 292 int len = strlen("debug="); 293 /* First look for a command line argument */ 294 if (strncmp("debug=", argv[i], len) == 0) { 295 dprintf_string = argv[i] + len; 296 /* Remove from args */ 297 for (j = i; j < *argc; j++) 298 argv[j] = argv[j+1]; 299 argv[j] = NULL; 300 (*argc)--; 301 } 302 } 303 304 if (dprintf_string == NULL) { 305 /* Look for ZFS_DEBUG environment variable */ 306 dprintf_string = getenv("ZFS_DEBUG"); 307 } 308 309 /* 310 * Are we just turning on all debugging? 311 */ 312 if (dprintf_find_string("on")) 313 dprintf_print_all = 1; 314 315 if (dprintf_string != NULL) 316 zfs_flags |= ZFS_DEBUG_DPRINTF; 317 } 318 319 /* 320 * ========================================================================= 321 * debug printfs 322 * ========================================================================= 323 */ 324 void 325 __dprintf(const char *file, const char *func, int line, const char *fmt, ...) 326 { 327 const char *newfile; 328 va_list adx; 329 330 /* 331 * Get rid of annoying "../common/" prefix to filename. 332 */ 333 newfile = strrchr(file, '/'); 334 if (newfile != NULL) { 335 newfile = newfile + 1; /* Get rid of leading / */ 336 } else { 337 newfile = file; 338 } 339 340 if (dprintf_print_all || 341 dprintf_find_string(newfile) || 342 dprintf_find_string(func)) { 343 /* Print out just the function name if requested */ 344 flockfile(stdout); 345 if (dprintf_find_string("pid")) 346 (void) printf("%d ", getpid()); 347 if (dprintf_find_string("tid")) 348 (void) printf("%u ", thr_self()); 349 if (dprintf_find_string("cpu")) 350 (void) printf("%u ", getcpuid()); 351 if (dprintf_find_string("time")) 352 (void) printf("%llu ", gethrtime()); 353 if (dprintf_find_string("long")) 354 (void) printf("%s, line %d: ", newfile, line); 355 (void) printf("%s: ", func); 356 va_start(adx, fmt); 357 (void) vprintf(fmt, adx); 358 va_end(adx); 359 funlockfile(stdout); 360 } 361 } 362 363 #endif /* ZFS_DEBUG */ 364 365 /* 366 * ========================================================================= 367 * kobj interfaces 368 * ========================================================================= 369 */ 370 struct _buf * 371 kobj_open_file(char *name) 372 { 373 struct _buf *file; 374 vnode_t *vp; 375 376 /* set vp as the _fd field of the file */ 377 if (vn_openat(name, UIO_SYSSPACE, FREAD, 0, &vp, 0, 0, rootdir, 378 -1) != 0) 379 return ((void *)-1UL); 380 381 file = umem_zalloc(sizeof (struct _buf), UMEM_NOFAIL); 382 file->_fd = (intptr_t)vp; 383 return (file); 384 } 385 386 int 387 kobj_read_file(struct _buf *file, char *buf, unsigned size, unsigned off) 388 { 389 ssize_t resid; 390 391 vn_rdwr(UIO_READ, (vnode_t *)file->_fd, buf, size, (offset_t)off, 392 UIO_SYSSPACE, 0, 0, 0, &resid); 393 394 return (size - resid); 395 } 396 397 void 398 kobj_close_file(struct _buf *file) 399 { 400 vn_close((vnode_t *)file->_fd); 401 umem_free(file, sizeof (struct _buf)); 402 } 403 404 int 405 kobj_get_filesize(struct _buf *file, uint64_t *size) 406 { 407 struct stat64 st; 408 vnode_t *vp = (vnode_t *)file->_fd; 409 410 if (fstat64(vp->v_fd, &st) == -1) { 411 vn_close(vp); 412 return (errno); 413 } 414 *size = st.st_size; 415 return (0); 416 } 417 418 /* 419 * ========================================================================= 420 * kernel emulation setup & teardown 421 * ========================================================================= 422 */ 423 static int 424 umem_out_of_memory(void) 425 { 426 char errmsg[] = "out of memory -- generating core dump\n"; 427 428 write(fileno(stderr), errmsg, sizeof (errmsg)); 429 abort(); 430 return (0); 431 } 432 433 void 434 kernel_init(int mode) 435 { 436 extern uint_t rrw_tsd_key; 437 438 umem_nofail_callback(umem_out_of_memory); 439 440 physmem = sysconf(_SC_PHYS_PAGES); 441 442 dprintf("physmem = %llu pages (%.2f GB)\n", physmem, 443 (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); 444 445 (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", 446 (mode & FWRITE) ? get_system_hostid() : 0); 447 448 system_taskq_init(); 449 450 mutex_init(&cpu_lock, NULL, MUTEX_DEFAULT, NULL); 451 452 spa_init(mode); 453 454 tsd_create(&rrw_tsd_key, rrw_tsd_destroy); 455 } 456 457 void 458 kernel_fini(void) 459 { 460 spa_fini(); 461 462 system_taskq_fini(); 463 } 464 465 /* ARGSUSED */ 466 uint32_t 467 zone_get_hostid(void *zonep) 468 { 469 /* 470 * We're emulating the system's hostid in userland. 471 */ 472 return (strtoul(hw_serial, NULL, 10)); 473 } 474 475 int 476 z_uncompress(void *dst, size_t *dstlen, const void *src, size_t srclen) 477 { 478 int ret; 479 uLongf len = *dstlen; 480 481 if ((ret = uncompress(dst, &len, src, srclen)) == Z_OK) 482 *dstlen = (size_t)len; 483 484 return (ret); 485 } 486 487 int 488 z_compress_level(void *dst, size_t *dstlen, const void *src, size_t srclen, 489 int level) 490 { 491 int ret; 492 uLongf len = *dstlen; 493 494 if ((ret = compress2(dst, &len, src, srclen, level)) == Z_OK) 495 *dstlen = (size_t)len; 496 497 return (ret); 498 } 499 500 int 501 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) 502 { 503 return (0); 504 } 505 506 int 507 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) 508 { 509 return (0); 510 } 511 512 int 513 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) 514 { 515 return (0); 516 } 517 518 /* ARGSUSED */ 519 int 520 zfs_onexit_fd_hold(int fd, minor_t *minorp) 521 { 522 *minorp = 0; 523 return (0); 524 } 525 526 /* ARGSUSED */ 527 void 528 zfs_onexit_fd_rele(int fd) 529 { 530 } 531 532 /* ARGSUSED */ 533 int 534 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, 535 uint64_t *action_handle) 536 { 537 return (0); 538 } 539 540 /* ARGSUSED */ 541 int 542 zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire) 543 { 544 return (0); 545 } 546 547 /* ARGSUSED */ 548 int 549 zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data) 550 { 551 return (0); 552 } 553 554 void 555 bioinit(buf_t *bp) 556 { 557 bzero(bp, sizeof (buf_t)); 558 } 559 560 void 561 biodone(buf_t *bp) 562 { 563 if (bp->b_iodone != NULL) { 564 (*(bp->b_iodone))(bp); 565 return; 566 } 567 ASSERT((bp->b_flags & B_DONE) == 0); 568 bp->b_flags |= B_DONE; 569 } 570 571 void 572 bioerror(buf_t *bp, int error) 573 { 574 ASSERT(bp != NULL); 575 ASSERT(error >= 0); 576 577 if (error != 0) { 578 bp->b_flags |= B_ERROR; 579 } else { 580 bp->b_flags &= ~B_ERROR; 581 } 582 bp->b_error = error; 583 } 584 585 586 int 587 geterror(struct buf *bp) 588 { 589 int error = 0; 590 591 if (bp->b_flags & B_ERROR) { 592 error = bp->b_error; 593 if (!error) 594 error = EIO; 595 } 596 return (error); 597 } 598