1 /* $NetBSD: functions.c,v 1.1.1.1 2009/12/02 00:27:10 haad Exp $ */ 2 3 /* 4 * Copyright (C) 2004-2009 Red Hat, Inc. All rights reserved. 5 * 6 * This copyrighted material is made available to anyone wishing to use, 7 * modify, copy, or redistribute it subject to the terms and conditions 8 * of the GNU Lesser General Public License v.2.1. 9 * 10 * You should have received a copy of the GNU Lesser General Public License 11 * along with this program; if not, write to the Free Software Foundation, 12 * Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 13 */ 14 #define _GNU_SOURCE 15 #define _FILE_OFFSET_BITS 64 16 17 #include <stdint.h> 18 #include <errno.h> 19 #include <string.h> 20 #include <sys/types.h> 21 #include <sys/stat.h> 22 #include <dirent.h> 23 #include <unistd.h> 24 #include <signal.h> 25 #include <linux/kdev_t.h> 26 //#define __USE_GNU /* for O_DIRECT */ 27 #include <fcntl.h> 28 #include <time.h> 29 #include "libdevmapper.h" 30 #include "dm-log-userspace.h" 31 #include "functions.h" 32 #include "common.h" 33 #include "cluster.h" 34 #include "logging.h" 35 36 #define BYTE_SHIFT 3 37 38 /* 39 * Magic for persistent mirrors: "MiRr" 40 * Following on-disk header information is stolen from 41 * drivers/md/dm-log.c 42 */ 43 #define MIRROR_MAGIC 0x4D695272 44 #define MIRROR_DISK_VERSION 2 45 #define LOG_OFFSET 2 46 47 #define RESYNC_HISTORY 50 48 //static char resync_history[RESYNC_HISTORY][128]; 49 //static int idx = 0; 50 #define LOG_SPRINT(_lc, f, arg...) 
do { \ 51 lc->idx++; \ 52 lc->idx = lc->idx % RESYNC_HISTORY; \ 53 sprintf(lc->resync_history[lc->idx], f, ## arg); \ 54 } while (0) 55 56 struct log_header { 57 uint32_t magic; 58 uint32_t version; 59 uint64_t nr_regions; 60 }; 61 62 struct log_c { 63 struct dm_list list; 64 65 char uuid[DM_UUID_LEN]; 66 uint64_t luid; 67 68 time_t delay; /* limits how fast a resume can happen after suspend */ 69 int touched; 70 uint32_t region_size; 71 uint32_t region_count; 72 uint64_t sync_count; 73 74 dm_bitset_t clean_bits; 75 dm_bitset_t sync_bits; 76 uint32_t recoverer; 77 uint64_t recovering_region; /* -1 means not recovering */ 78 uint64_t skip_bit_warning; /* used to warn if region skipped */ 79 int sync_search; 80 81 int resume_override; 82 83 uint32_t block_on_error; 84 enum sync { 85 DEFAULTSYNC, /* Synchronize if necessary */ 86 NOSYNC, /* Devices known to be already in sync */ 87 FORCESYNC, /* Force a sync to happen */ 88 } sync; 89 90 uint32_t state; /* current operational state of the log */ 91 92 struct dm_list mark_list; 93 94 uint32_t recovery_halted; 95 struct recovery_request *recovery_request_list; 96 97 int disk_fd; /* -1 means no disk log */ 98 int log_dev_failed; 99 uint64_t disk_nr_regions; 100 size_t disk_size; /* size of disk_buffer in bytes */ 101 void *disk_buffer; /* aligned memory for O_DIRECT */ 102 int idx; 103 char resync_history[RESYNC_HISTORY][128]; 104 }; 105 106 struct mark_entry { 107 struct dm_list list; 108 uint32_t nodeid; 109 uint64_t region; 110 }; 111 112 struct recovery_request { 113 uint64_t region; 114 struct recovery_request *next; 115 }; 116 117 static DM_LIST_INIT(log_list); 118 static DM_LIST_INIT(log_pending_list); 119 120 static int log_test_bit(dm_bitset_t bs, int bit) 121 { 122 return dm_bit(bs, bit); 123 } 124 125 static void log_set_bit(struct log_c *lc, dm_bitset_t bs, int bit) 126 { 127 dm_bit_set(bs, bit); 128 lc->touched = 1; 129 } 130 131 static void log_clear_bit(struct log_c *lc, dm_bitset_t bs, int bit) 132 { 133 
dm_bit_clear(bs, bit); 134 lc->touched = 1; 135 } 136 137 static int find_next_zero_bit(dm_bitset_t bs, int start) 138 { 139 while (dm_bit(bs, start++)) 140 if (start >= (int)bs[0]) 141 return -1; 142 143 return start - 1; 144 } 145 146 static uint64_t count_bits32(dm_bitset_t bs) 147 { 148 int i, size = ((int)bs[0]/DM_BITS_PER_INT + 1); 149 unsigned count = 0; 150 151 for (i = 1; i <= size; i++) 152 count += hweight32(bs[i]); 153 154 return (uint64_t)count; 155 } 156 157 /* 158 * get_log 159 * 160 * Returns: log if found, NULL otherwise 161 */ 162 static struct log_c *get_log(const char *uuid, uint64_t luid) 163 { 164 struct log_c *lc; 165 166 dm_list_iterate_items(lc, &log_list) 167 if (!strcmp(lc->uuid, uuid) && 168 (!luid || (luid == lc->luid))) 169 return lc; 170 171 return NULL; 172 } 173 174 /* 175 * get_pending_log 176 * 177 * Pending logs are logs that have been 'clog_ctr'ed, but 178 * have not joined the CPG (via clog_resume). 179 * 180 * Returns: log if found, NULL otherwise 181 */ 182 static struct log_c *get_pending_log(const char *uuid, uint64_t luid) 183 { 184 struct log_c *lc; 185 186 dm_list_iterate_items(lc, &log_pending_list) 187 if (!strcmp(lc->uuid, uuid) && 188 (!luid || (luid == lc->luid))) 189 return lc; 190 191 return NULL; 192 } 193 194 static void header_to_disk(struct log_header *mem, struct log_header *disk) 195 { 196 memcpy(disk, mem, sizeof(struct log_header)); 197 } 198 199 static void header_from_disk(struct log_header *mem, struct log_header *disk) 200 { 201 memcpy(mem, disk, sizeof(struct log_header)); 202 } 203 204 static int rw_log(struct log_c *lc, int do_write) 205 { 206 int r; 207 208 r = lseek(lc->disk_fd, 0, SEEK_SET); 209 if (r < 0) { 210 LOG_ERROR("[%s] rw_log: lseek failure: %s", 211 SHORT_UUID(lc->uuid), strerror(errno)); 212 return -errno; 213 } 214 215 if (do_write) { 216 r = write(lc->disk_fd, lc->disk_buffer, lc->disk_size); 217 if (r < 0) { 218 LOG_ERROR("[%s] rw_log: write failure: %s", 219 SHORT_UUID(lc->uuid), 
strerror(errno)); 220 return -EIO; /* Failed disk write */ 221 } 222 return 0; 223 } 224 225 /* Read */ 226 r = read(lc->disk_fd, lc->disk_buffer, lc->disk_size); 227 if (r < 0) 228 LOG_ERROR("[%s] rw_log: read failure: %s", 229 SHORT_UUID(lc->uuid), strerror(errno)); 230 if (r != lc->disk_size) 231 return -EIO; /* Failed disk read */ 232 return 0; 233 } 234 235 /* 236 * read_log 237 * @lc 238 * 239 * Valid return codes: 240 * -EINVAL: Invalid header, bits not copied 241 * -EIO: Unable to read disk log 242 * 0: Valid header, disk bit -> lc->clean_bits 243 * 244 * Returns: 0 on success, -EXXX on failure 245 */ 246 static int read_log(struct log_c *lc) 247 { 248 struct log_header lh; 249 size_t bitset_size; 250 251 memset(&lh, 0, sizeof(struct log_header)); 252 253 if (rw_log(lc, 0)) 254 return -EIO; /* Failed disk read */ 255 256 header_from_disk(&lh, lc->disk_buffer); 257 if (lh.magic != MIRROR_MAGIC) 258 return -EINVAL; 259 260 lc->disk_nr_regions = lh.nr_regions; 261 262 /* Read disk bits into sync_bits */ 263 bitset_size = lc->region_count / 8; 264 bitset_size += (lc->region_count % 8) ? 1 : 0; 265 memcpy(lc->clean_bits, lc->disk_buffer + 1024, bitset_size); 266 267 return 0; 268 } 269 270 /* 271 * write_log 272 * @lc 273 * 274 * Returns: 0 on success, -EIO on failure 275 */ 276 static int write_log(struct log_c *lc) 277 { 278 struct log_header lh; 279 size_t bitset_size; 280 281 lh.magic = MIRROR_MAGIC; 282 lh.version = MIRROR_DISK_VERSION; 283 lh.nr_regions = lc->region_count; 284 285 header_to_disk(&lh, lc->disk_buffer); 286 287 /* Write disk bits from clean_bits */ 288 bitset_size = lc->region_count / 8; 289 bitset_size += (lc->region_count % 8) ? 
1 : 0; 290 memcpy(lc->disk_buffer + 1024, lc->clean_bits, bitset_size); 291 292 if (rw_log(lc, 1)) { 293 lc->log_dev_failed = 1; 294 return -EIO; /* Failed disk write */ 295 } 296 return 0; 297 } 298 299 static int find_disk_path(char *major_minor_str, char *path_rtn, int *unlink_path) 300 { 301 int r; 302 DIR *dp; 303 struct dirent *dep; 304 struct stat statbuf; 305 int major, minor; 306 307 if (!strstr(major_minor_str, ":")) { 308 r = stat(major_minor_str, &statbuf); 309 if (r) 310 return -errno; 311 if (!S_ISBLK(statbuf.st_mode)) 312 return -EINVAL; 313 sprintf(path_rtn, "%s", major_minor_str); 314 return 0; 315 } 316 317 r = sscanf(major_minor_str, "%d:%d", &major, &minor); 318 if (r != 2) 319 return -EINVAL; 320 321 LOG_DBG("Checking /dev/mapper for device %d:%d", major, minor); 322 /* Check /dev/mapper dir */ 323 dp = opendir("/dev/mapper"); 324 if (!dp) 325 return -ENOENT; 326 327 while ((dep = readdir(dp)) != NULL) { 328 /* 329 * FIXME: This is racy. By the time the path is used, 330 * it may point to something else. 'fstat' will be 331 * required upon opening to ensure we got what we 332 * wanted. 333 */ 334 335 sprintf(path_rtn, "/dev/mapper/%s", dep->d_name); 336 stat(path_rtn, &statbuf); 337 if (S_ISBLK(statbuf.st_mode) && 338 (major(statbuf.st_rdev) == major) && 339 (minor(statbuf.st_rdev) == minor)) { 340 LOG_DBG(" %s: YES", dep->d_name); 341 closedir(dp); 342 return 0; 343 } else { 344 LOG_DBG(" %s: NO", dep->d_name); 345 } 346 } 347 348 closedir(dp); 349 350 LOG_DBG("Path not found for %d/%d", major, minor); 351 LOG_DBG("Creating /dev/mapper/%d-%d", major, minor); 352 sprintf(path_rtn, "/dev/mapper/%d-%d", major, minor); 353 r = mknod(path_rtn, S_IFBLK | S_IRUSR | S_IWUSR, MKDEV(major, minor)); 354 355 /* 356 * If we have to make the path, we unlink it after we open it 357 */ 358 *unlink_path = 1; 359 360 return r ? 
-errno : 0; 361 } 362 363 static int _clog_ctr(char *uuid, uint64_t luid, 364 int argc, char **argv, uint64_t device_size) 365 { 366 int i; 367 int r = 0; 368 char *p; 369 uint64_t region_size; 370 uint64_t region_count; 371 struct log_c *lc = NULL; 372 struct log_c *duplicate; 373 enum sync sync = DEFAULTSYNC; 374 uint32_t block_on_error = 0; 375 376 int disk_log = 0; 377 char disk_path[128]; 378 int unlink_path = 0; 379 size_t page_size; 380 int pages; 381 382 /* If core log request, then argv[0] will be region_size */ 383 if (!strtoll(argv[0], &p, 0) || *p) { 384 disk_log = 1; 385 386 if ((argc < 2) || (argc > 4)) { 387 LOG_ERROR("Too %s arguments to clustered_disk log type", 388 (argc < 3) ? "few" : "many"); 389 r = -EINVAL; 390 goto fail; 391 } 392 393 r = find_disk_path(argv[0], disk_path, &unlink_path); 394 if (r) { 395 LOG_ERROR("Unable to find path to device %s", argv[0]); 396 goto fail; 397 } 398 LOG_DBG("Clustered log disk is %s", disk_path); 399 } else { 400 disk_log = 0; 401 402 if ((argc < 1) || (argc > 3)) { 403 LOG_ERROR("Too %s arguments to clustered_core log type", 404 (argc < 2) ? "few" : "many"); 405 r = -EINVAL; 406 goto fail; 407 } 408 } 409 410 if (!(region_size = strtoll(argv[disk_log], &p, 0)) || *p) { 411 LOG_ERROR("Invalid region_size argument to clustered_%s log type", 412 (disk_log) ? "disk" : "core"); 413 r = -EINVAL; 414 goto fail; 415 } 416 417 region_count = device_size / region_size; 418 if (device_size % region_size) { 419 /* 420 * I can't remember if device_size must be a multiple 421 * of region_size, so check it anyway. 
422 */ 423 region_count++; 424 } 425 426 for (i = 0; i < argc; i++) { 427 if (!strcmp(argv[i], "sync")) 428 sync = FORCESYNC; 429 else if (!strcmp(argv[i], "nosync")) 430 sync = NOSYNC; 431 else if (!strcmp(argv[i], "block_on_error")) 432 block_on_error = 1; 433 } 434 435 lc = malloc(sizeof(*lc)); 436 if (!lc) { 437 LOG_ERROR("Unable to allocate cluster log context"); 438 r = -ENOMEM; 439 goto fail; 440 } 441 memset(lc, 0, sizeof(*lc)); 442 443 lc->region_size = region_size; 444 lc->region_count = region_count; 445 lc->sync = sync; 446 lc->block_on_error = block_on_error; 447 lc->sync_search = 0; 448 lc->recovering_region = (uint64_t)-1; 449 lc->skip_bit_warning = region_count; 450 lc->disk_fd = -1; 451 lc->log_dev_failed = 0; 452 strncpy(lc->uuid, uuid, DM_UUID_LEN); 453 lc->luid = luid; 454 455 if ((duplicate = get_log(lc->uuid, lc->luid)) || 456 (duplicate = get_pending_log(lc->uuid, lc->luid))) { 457 LOG_ERROR("[%s/%llu] Log already exists, unable to create.", 458 SHORT_UUID(lc->uuid), lc->luid); 459 free(lc); 460 return -EINVAL; 461 } 462 463 dm_list_init(&lc->mark_list); 464 465 lc->clean_bits = dm_bitset_create(NULL, region_count); 466 if (!lc->clean_bits) { 467 LOG_ERROR("Unable to allocate clean bitset"); 468 r = -ENOMEM; 469 goto fail; 470 } 471 472 lc->sync_bits = dm_bitset_create(NULL, region_count); 473 if (!lc->sync_bits) { 474 LOG_ERROR("Unable to allocate sync bitset"); 475 r = -ENOMEM; 476 goto fail; 477 } 478 if (sync == NOSYNC) 479 dm_bit_set_all(lc->sync_bits); 480 481 lc->sync_count = (sync == NOSYNC) ? region_count : 0; 482 if (disk_log) { 483 page_size = sysconf(_SC_PAGESIZE); 484 pages = ((int)lc->clean_bits[0])/page_size; 485 pages += ((int)lc->clean_bits[0])%page_size ? 
1 : 0; 486 pages += 1; /* for header */ 487 488 r = open(disk_path, O_RDWR | O_DIRECT); 489 if (r < 0) { 490 LOG_ERROR("Unable to open log device, %s: %s", 491 disk_path, strerror(errno)); 492 r = errno; 493 goto fail; 494 } 495 if (unlink_path) 496 unlink(disk_path); 497 498 lc->disk_fd = r; 499 lc->disk_size = pages * page_size; 500 501 r = posix_memalign(&(lc->disk_buffer), page_size, 502 lc->disk_size); 503 if (r) { 504 LOG_ERROR("Unable to allocate memory for disk_buffer"); 505 goto fail; 506 } 507 memset(lc->disk_buffer, 0, lc->disk_size); 508 LOG_DBG("Disk log ready"); 509 } 510 511 dm_list_add(&log_pending_list, &lc->list); 512 513 return 0; 514 fail: 515 if (lc) { 516 if (lc->clean_bits) 517 free(lc->clean_bits); 518 if (lc->sync_bits) 519 free(lc->sync_bits); 520 if (lc->disk_buffer) 521 free(lc->disk_buffer); 522 if (lc->disk_fd >= 0) 523 close(lc->disk_fd); 524 free(lc); 525 } 526 return r; 527 } 528 529 /* 530 * clog_ctr 531 * @rq 532 * 533 * rq->data should contain constructor string as follows: 534 * <log_type> [disk] <region_size> [[no]sync] <device_len> 535 * The kernel is responsible for adding the <dev_len> argument 536 * to the end; otherwise, we cannot compute the region_count. 
537 * 538 * FIXME: Currently relies on caller to fill in rq->error 539 */ 540 static int clog_dtr(struct dm_ulog_request *rq); 541 static int clog_ctr(struct dm_ulog_request *rq) 542 { 543 int argc, i, r = 0; 544 char *p, **argv = NULL; 545 char *dev_size_str; 546 uint64_t device_size; 547 548 /* Sanity checks */ 549 if (!rq->data_size) { 550 LOG_ERROR("Received constructor request with no data"); 551 return -EINVAL; 552 } 553 554 if (strlen(rq->data) > rq->data_size) { 555 LOG_ERROR("Received constructor request with bad data"); 556 LOG_ERROR("strlen(rq->data)[%d] != rq->data_size[%llu]", 557 (int)strlen(rq->data), 558 (unsigned long long)rq->data_size); 559 LOG_ERROR("rq->data = '%s' [%d]", 560 rq->data, (int)strlen(rq->data)); 561 return -EINVAL; 562 } 563 564 /* Split up args */ 565 for (argc = 0, p = rq->data; (p = strstr(p, " ")); p++, argc++) 566 *p = '\0'; 567 568 argv = malloc(argc * sizeof(char *)); 569 if (!argv) 570 return -ENOMEM; 571 572 p = dev_size_str = rq->data; 573 p += strlen(p) + 1; 574 for (i = 0; i < argc; i++, p = p + strlen(p) + 1) 575 argv[i] = p; 576 577 if (strcmp(argv[0], "clustered_disk") && 578 strcmp(argv[0], "clustered_core")) { 579 LOG_ERROR("Unsupported userspace log type, \"%s\"", argv[0]); 580 free(argv); 581 return -EINVAL; 582 } 583 584 if (!(device_size = strtoll(dev_size_str, &p, 0)) || *p) { 585 LOG_ERROR("Invalid device size argument: %s", dev_size_str); 586 free(argv); 587 return -EINVAL; 588 } 589 590 r = _clog_ctr(rq->uuid, rq->luid, argc - 1, argv + 1, device_size); 591 592 /* We join the CPG when we resume */ 593 594 /* No returning data */ 595 rq->data_size = 0; 596 597 if (r) { 598 LOG_ERROR("Failed to create cluster log (%s)", rq->uuid); 599 for (i = 0; i < argc; i++) 600 LOG_ERROR("argv[%d] = %s", i, argv[i]); 601 } 602 else 603 LOG_DBG("[%s] Cluster log created", 604 SHORT_UUID(rq->uuid)); 605 606 free(argv); 607 return r; 608 } 609 610 /* 611 * clog_dtr 612 * @rq 613 * 614 */ 615 static int clog_dtr(struct 
dm_ulog_request *rq) 616 { 617 struct log_c *lc = get_log(rq->uuid, rq->luid); 618 619 if (lc) { 620 /* 621 * The log should not be on the official list. There 622 * should have been a suspend first. 623 */ 624 LOG_ERROR("[%s] DTR before SUS: leaving CPG", 625 SHORT_UUID(rq->uuid)); 626 destroy_cluster_cpg(rq->uuid); 627 } else if (!(lc = get_pending_log(rq->uuid, rq->luid))) { 628 LOG_ERROR("clog_dtr called on log that is not official or pending"); 629 return -EINVAL; 630 } 631 632 LOG_DBG("[%s] Cluster log removed", SHORT_UUID(lc->uuid)); 633 634 dm_list_del(&lc->list); 635 if (lc->disk_fd != -1) 636 close(lc->disk_fd); 637 if (lc->disk_buffer) 638 free(lc->disk_buffer); 639 free(lc->clean_bits); 640 free(lc->sync_bits); 641 free(lc); 642 643 return 0; 644 } 645 646 /* 647 * clog_presuspend 648 * @rq 649 * 650 */ 651 static int clog_presuspend(struct dm_ulog_request *rq) 652 { 653 struct log_c *lc = get_log(rq->uuid, rq->luid); 654 655 if (!lc) 656 return -EINVAL; 657 658 if (lc->touched) 659 LOG_DBG("WARNING: log still marked as 'touched' during suspend"); 660 661 lc->recovery_halted = 1; 662 663 return 0; 664 } 665 666 /* 667 * clog_postsuspend 668 * @rq 669 * 670 */ 671 static int clog_postsuspend(struct dm_ulog_request *rq) 672 { 673 struct log_c *lc = get_log(rq->uuid, rq->luid); 674 675 if (!lc) 676 return -EINVAL; 677 678 LOG_DBG("[%s] clog_postsuspend: leaving CPG", SHORT_UUID(lc->uuid)); 679 destroy_cluster_cpg(rq->uuid); 680 681 lc->state = LOG_SUSPENDED; 682 lc->recovering_region = (uint64_t)-1; 683 lc->recoverer = (uint32_t)-1; 684 lc->delay = time(NULL); 685 686 return 0; 687 } 688 689 /* 690 * cluster_postsuspend 691 * @rq 692 * 693 */ 694 int cluster_postsuspend(char *uuid, uint64_t luid) 695 { 696 struct log_c *lc = get_log(uuid, luid); 697 698 if (!lc) 699 return -EINVAL; 700 701 LOG_DBG("[%s] clog_postsuspend: finalizing", SHORT_UUID(lc->uuid)); 702 lc->resume_override = 0; 703 704 /* move log to pending list */ 705 dm_list_del(&lc->list); 706 
dm_list_add(&log_pending_list, &lc->list); 707 708 return 0; 709 } 710 711 /* 712 * clog_resume 713 * @rq 714 * 715 * Does the main work of resuming. 716 */ 717 static int clog_resume(struct dm_ulog_request *rq) 718 { 719 uint32_t i; 720 int commit_log = 0; 721 struct log_c *lc = get_log(rq->uuid, rq->luid); 722 723 if (!lc) 724 return -EINVAL; 725 726 switch (lc->resume_override) { 727 case 1000: 728 LOG_ERROR("[%s] Additional resume issued before suspend", 729 SHORT_UUID(rq->uuid)); 730 #ifdef DEBUG 731 kill(getpid(), SIGUSR1); 732 #endif 733 return 0; 734 case 0: 735 lc->resume_override = 1000; 736 if (lc->disk_fd == -1) { 737 LOG_DBG("[%s] Master resume.", 738 SHORT_UUID(lc->uuid)); 739 goto no_disk; 740 } 741 742 LOG_DBG("[%s] Master resume: reading disk log", 743 SHORT_UUID(lc->uuid)); 744 commit_log = 1; 745 break; 746 case 1: 747 LOG_ERROR("Error:: partial bit loading (just sync_bits)"); 748 return -EINVAL; 749 case 2: 750 LOG_ERROR("Error:: partial bit loading (just clean_bits)"); 751 return -EINVAL; 752 case 3: 753 LOG_DBG("[%s] Non-master resume: bits pre-loaded", 754 SHORT_UUID(lc->uuid)); 755 lc->resume_override = 1000; 756 goto out; 757 default: 758 LOG_ERROR("Error:: multiple loading of bits (%d)", 759 lc->resume_override); 760 return -EINVAL; 761 } 762 763 if (lc->log_dev_failed) { 764 LOG_ERROR("Log device has failed, unable to read bits"); 765 rq->error = 0; /* We can handle this so far */ 766 lc->disk_nr_regions = 0; 767 } else 768 rq->error = read_log(lc); 769 770 switch (rq->error) { 771 case 0: 772 if (lc->disk_nr_regions < lc->region_count) 773 LOG_DBG("[%s] Mirror has grown, updating log bits", 774 SHORT_UUID(lc->uuid)); 775 else if (lc->disk_nr_regions > lc->region_count) 776 LOG_DBG("[%s] Mirror has shrunk, updating log bits", 777 SHORT_UUID(lc->uuid)); 778 break; 779 case -EINVAL: 780 LOG_DBG("[%s] (Re)initializing mirror log - resync issued.", 781 SHORT_UUID(lc->uuid)); 782 lc->disk_nr_regions = 0; 783 break; 784 default: 785 
LOG_ERROR("Failed to read disk log"); 786 lc->disk_nr_regions = 0; 787 break; 788 } 789 790 no_disk: 791 /* If mirror has grown, set bits appropriately */ 792 if (lc->sync == NOSYNC) 793 for (i = lc->disk_nr_regions; i < lc->region_count; i++) 794 log_set_bit(lc, lc->clean_bits, i); 795 else 796 for (i = lc->disk_nr_regions; i < lc->region_count; i++) 797 log_clear_bit(lc, lc->clean_bits, i); 798 799 /* Clear any old bits if device has shrunk */ 800 for (i = lc->region_count; i % 32; i++) 801 log_clear_bit(lc, lc->clean_bits, i); 802 803 /* copy clean across to sync */ 804 dm_bit_copy(lc->sync_bits, lc->clean_bits); 805 806 if (commit_log && (lc->disk_fd >= 0)) { 807 rq->error = write_log(lc); 808 if (rq->error) 809 LOG_ERROR("Failed initial disk log write"); 810 else 811 LOG_DBG("Disk log initialized"); 812 lc->touched = 0; 813 } 814 out: 815 /* 816 * Clear any old bits if device has shrunk - necessary 817 * for non-master resume 818 */ 819 for (i = lc->region_count; i % 32; i++) { 820 log_clear_bit(lc, lc->clean_bits, i); 821 log_clear_bit(lc, lc->sync_bits, i); 822 } 823 824 lc->sync_count = count_bits32(lc->sync_bits); 825 826 LOG_SPRINT(lc, "[%s] Initial sync_count = %llu", 827 SHORT_UUID(lc->uuid), (unsigned long long)lc->sync_count); 828 lc->sync_search = 0; 829 lc->state = LOG_RESUMED; 830 lc->recovery_halted = 0; 831 832 return rq->error; 833 } 834 835 /* 836 * local_resume 837 * @rq 838 * 839 * If the log is pending, we must first join the cpg and 840 * put the log in the official list. 841 * 842 */ 843 int local_resume(struct dm_ulog_request *rq) 844 { 845 int r; 846 time_t t; 847 struct log_c *lc = get_log(rq->uuid, rq->luid); 848 849 if (!lc) { 850 /* Is the log in the pending list? */ 851 lc = get_pending_log(rq->uuid, rq->luid); 852 if (!lc) { 853 LOG_ERROR("clog_resume called on log that is not official or pending"); 854 return -EINVAL; 855 } 856 857 t = time(NULL); 858 t -= lc->delay; 859 /* 860 * This should be considered a temporary fix. 
It addresses 861 * a problem that exists when nodes suspend/resume in rapid 862 * succession. While the problem is very rare, it has been 863 * seen to happen in real-world-like testing. 864 * 865 * The problem: 866 * - Node A joins cluster 867 * - Node B joins cluster 868 * - Node A prepares checkpoint 869 * - Node A gets ready to write checkpoint 870 * - Node B leaves 871 * - Node B joins 872 * - Node A finishes write of checkpoint 873 * - Node B receives checkpoint meant for previous session 874 * -- Node B can now be non-coherent 875 * 876 * This timer will solve the problem for now, but could be 877 * replaced by a generation number sent with the resume 878 * command from the kernel. The generation number would 879 * be included in the name of the checkpoint to prevent 880 * reading stale data. 881 */ 882 if ((t < 3) && (t >= 0)) 883 sleep(3 - t); 884 885 /* Join the CPG */ 886 r = create_cluster_cpg(rq->uuid, rq->luid); 887 if (r) { 888 LOG_ERROR("clog_resume: Failed to create cluster CPG"); 889 return r; 890 } 891 892 /* move log to official list */ 893 dm_list_del(&lc->list); 894 dm_list_add(&log_list, &lc->list); 895 } 896 897 return 0; 898 } 899 900 /* 901 * clog_get_region_size 902 * @rq 903 * 904 * Since this value doesn't change, the kernel 905 * should not need to talk to server to get this 906 * The function is here for completness 907 * 908 * Returns: 0 on success, -EXXX on failure 909 */ 910 static int clog_get_region_size(struct dm_ulog_request *rq) 911 { 912 uint64_t *rtn = (uint64_t *)rq->data; 913 struct log_c *lc = get_log(rq->uuid, rq->luid); 914 915 if (!lc && !(lc = get_pending_log(rq->uuid, rq->luid))) 916 return -EINVAL; 917 918 *rtn = lc->region_size; 919 rq->data_size = sizeof(*rtn); 920 921 return 0; 922 } 923 924 /* 925 * clog_is_clean 926 * @rq 927 * 928 * Returns: 1 if clean, 0 otherwise 929 */ 930 static int clog_is_clean(struct dm_ulog_request *rq) 931 { 932 int64_t *rtn = (int64_t *)rq->data; 933 uint64_t region = *((uint64_t 
*)(rq->data)); 934 struct log_c *lc = get_log(rq->uuid, rq->luid); 935 936 if (!lc) 937 return -EINVAL; 938 939 *rtn = log_test_bit(lc->clean_bits, region); 940 rq->data_size = sizeof(*rtn); 941 942 return 0; 943 } 944 945 /* 946 * clog_in_sync 947 * @rq 948 * 949 * We ignore any request for non-block. That 950 * should be handled elsewhere. (If the request 951 * has come this far, it has already blocked.) 952 * 953 * Returns: 1 if in-sync, 0 otherwise 954 */ 955 static int clog_in_sync(struct dm_ulog_request *rq) 956 { 957 int64_t *rtn = (int64_t *)rq->data; 958 uint64_t region = *((uint64_t *)(rq->data)); 959 struct log_c *lc = get_log(rq->uuid, rq->luid); 960 961 if (!lc) 962 return -EINVAL; 963 964 if (region > lc->region_count) 965 return -EINVAL; 966 967 *rtn = log_test_bit(lc->sync_bits, region); 968 if (*rtn) 969 LOG_DBG("[%s] Region is in-sync: %llu", 970 SHORT_UUID(lc->uuid), (unsigned long long)region); 971 else 972 LOG_DBG("[%s] Region is not in-sync: %llu", 973 SHORT_UUID(lc->uuid), (unsigned long long)region); 974 975 rq->data_size = sizeof(*rtn); 976 977 return 0; 978 } 979 980 /* 981 * clog_flush 982 * @rq 983 * 984 */ 985 static int clog_flush(struct dm_ulog_request *rq, int server) 986 { 987 int r = 0; 988 struct log_c *lc = get_log(rq->uuid, rq->luid); 989 990 if (!lc) 991 return -EINVAL; 992 993 if (!lc->touched) 994 return 0; 995 996 /* 997 * Do the actual flushing of the log only 998 * if we are the server. 999 */ 1000 if (server && (lc->disk_fd >= 0)) { 1001 r = rq->error = write_log(lc); 1002 if (r) 1003 LOG_ERROR("[%s] Error writing to disk log", 1004 SHORT_UUID(lc->uuid)); 1005 else 1006 LOG_DBG("[%s] Disk log written", SHORT_UUID(lc->uuid)); 1007 } 1008 1009 lc->touched = 0; 1010 1011 return r; 1012 1013 } 1014 1015 /* 1016 * mark_region 1017 * @lc 1018 * @region 1019 * @who 1020 * 1021 * Put a mark region request in the tree for tracking. 
1022 * 1023 * Returns: 0 on success, -EXXX on error 1024 */ 1025 static int mark_region(struct log_c *lc, uint64_t region, uint32_t who) 1026 { 1027 int found = 0; 1028 struct mark_entry *m; 1029 1030 dm_list_iterate_items(m, &lc->mark_list) 1031 if (m->region == region) { 1032 found = 1; 1033 if (m->nodeid == who) 1034 return 0; 1035 } 1036 1037 if (!found) 1038 log_clear_bit(lc, lc->clean_bits, region); 1039 1040 /* 1041 * Save allocation until here - if there is a failure, 1042 * at least we have cleared the bit. 1043 */ 1044 m = malloc(sizeof(*m)); 1045 if (!m) { 1046 LOG_ERROR("Unable to allocate space for mark_entry: %llu/%u", 1047 (unsigned long long)region, who); 1048 return -ENOMEM; 1049 } 1050 1051 m->nodeid = who; 1052 m->region = region; 1053 dm_list_add(&lc->mark_list, &m->list); 1054 1055 return 0; 1056 } 1057 1058 /* 1059 * clog_mark_region 1060 * @rq 1061 * 1062 * rq may contain more than one mark request. We 1063 * can determine the number from the 'data_size' field. 1064 * 1065 * Returns: 0 on success, -EXXX on failure 1066 */ 1067 static int clog_mark_region(struct dm_ulog_request *rq, uint32_t originator) 1068 { 1069 int r; 1070 int count; 1071 uint64_t *region; 1072 struct log_c *lc = get_log(rq->uuid, rq->luid); 1073 1074 if (!lc) 1075 return -EINVAL; 1076 1077 if (rq->data_size % sizeof(uint64_t)) { 1078 LOG_ERROR("Bad data size given for mark_region request"); 1079 return -EINVAL; 1080 } 1081 1082 count = rq->data_size / sizeof(uint64_t); 1083 region = (uint64_t *)&rq->data; 1084 1085 for (; count > 0; count--, region++) { 1086 r = mark_region(lc, *region, originator); 1087 if (r) 1088 return r; 1089 } 1090 1091 rq->data_size = 0; 1092 1093 return 0; 1094 } 1095 1096 static int clear_region(struct log_c *lc, uint64_t region, uint32_t who) 1097 { 1098 int other_matches = 0; 1099 struct mark_entry *m, *n; 1100 1101 dm_list_iterate_items_safe(m, n, &lc->mark_list) 1102 if (m->region == region) { 1103 if (m->nodeid == who) { 1104 
dm_list_del(&m->list); 1105 free(m); 1106 } else 1107 other_matches = 1; 1108 } 1109 1110 /* 1111 * Clear region if: 1112 * 1) It is in-sync 1113 * 2) There are no other machines that have it marked 1114 */ 1115 if (!other_matches && log_test_bit(lc->sync_bits, region)) 1116 log_set_bit(lc, lc->clean_bits, region); 1117 1118 return 0; 1119 } 1120 1121 /* 1122 * clog_clear_region 1123 * @rq 1124 * 1125 * rq may contain more than one clear request. We 1126 * can determine the number from the 'data_size' field. 1127 * 1128 * Returns: 0 on success, -EXXX on failure 1129 */ 1130 static int clog_clear_region(struct dm_ulog_request *rq, uint32_t originator) 1131 { 1132 int r; 1133 int count; 1134 uint64_t *region; 1135 struct log_c *lc = get_log(rq->uuid, rq->luid); 1136 1137 if (!lc) 1138 return -EINVAL; 1139 1140 if (rq->data_size % sizeof(uint64_t)) { 1141 LOG_ERROR("Bad data size given for clear_region request"); 1142 return -EINVAL; 1143 } 1144 1145 count = rq->data_size / sizeof(uint64_t); 1146 region = (uint64_t *)&rq->data; 1147 1148 for (; count > 0; count--, region++) { 1149 r = clear_region(lc, *region, originator); 1150 if (r) 1151 return r; 1152 } 1153 1154 rq->data_size = 0; 1155 1156 return 0; 1157 } 1158 1159 /* 1160 * clog_get_resync_work 1161 * @rq 1162 * 1163 */ 1164 static int clog_get_resync_work(struct dm_ulog_request *rq, uint32_t originator) 1165 { 1166 struct { 1167 int64_t i; 1168 uint64_t r; 1169 } *pkg = (void *)rq->data; 1170 struct log_c *lc = get_log(rq->uuid, rq->luid); 1171 1172 if (!lc) 1173 return -EINVAL; 1174 1175 rq->data_size = sizeof(*pkg); 1176 pkg->i = 0; 1177 1178 if (lc->sync_search >= lc->region_count) { 1179 /* 1180 * FIXME: handle intermittent errors during recovery 1181 * by resetting sync_search... but not to many times. 
1182 */ 1183 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1184 "Recovery finished", 1185 rq->seq, SHORT_UUID(lc->uuid), originator); 1186 return 0; 1187 } 1188 1189 if (lc->recovering_region != (uint64_t)-1) { 1190 if (lc->recoverer == originator) { 1191 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1192 "Re-requesting work (%llu)", 1193 rq->seq, SHORT_UUID(lc->uuid), originator, 1194 (unsigned long long)lc->recovering_region); 1195 pkg->r = lc->recovering_region; 1196 pkg->i = 1; 1197 LOG_COND(log_resend_requests, "***** RE-REQUEST *****"); 1198 } else { 1199 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1200 "Someone already recovering (%llu)", 1201 rq->seq, SHORT_UUID(lc->uuid), originator, 1202 (unsigned long long)lc->recovering_region); 1203 } 1204 1205 return 0; 1206 } 1207 1208 while (lc->recovery_request_list) { 1209 struct recovery_request *del; 1210 1211 del = lc->recovery_request_list; 1212 lc->recovery_request_list = del->next; 1213 1214 pkg->r = del->region; 1215 free(del); 1216 1217 if (!log_test_bit(lc->sync_bits, pkg->r)) { 1218 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1219 "Assigning priority resync work (%llu)", 1220 rq->seq, SHORT_UUID(lc->uuid), originator, 1221 (unsigned long long)pkg->r); 1222 pkg->i = 1; 1223 lc->recovering_region = pkg->r; 1224 lc->recoverer = originator; 1225 return 0; 1226 } 1227 } 1228 1229 pkg->r = find_next_zero_bit(lc->sync_bits, 1230 lc->sync_search); 1231 1232 if (pkg->r >= lc->region_count) { 1233 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1234 "Resync work complete.", 1235 rq->seq, SHORT_UUID(lc->uuid), originator); 1236 return 0; 1237 } 1238 1239 lc->sync_search = pkg->r + 1; 1240 1241 LOG_SPRINT(lc, "GET - SEQ#=%u, UUID=%s, nodeid = %u:: " 1242 "Assigning resync work (%llu)", 1243 rq->seq, SHORT_UUID(lc->uuid), originator, 1244 (unsigned long long)pkg->r); 1245 pkg->i = 1; 1246 lc->recovering_region = pkg->r; 1247 lc->recoverer = originator; 1248 1249 
	return 0;
}

/*
 * clog_set_region_sync
 * @rq
 *
 * Payload (rq->data): { uint64_t region; int64_t in_sync; }
 * Record the outcome of a recovery attempt: mark or clear 'region'
 * in sync_bits and keep lc->sync_count consistent with the bitmap.
 * Clears the "currently recovering" marker unconditionally.
 */
static int clog_set_region_sync(struct dm_ulog_request *rq, uint32_t originator)
{
	struct {
		uint64_t region;
		int64_t in_sync;
	} *pkg = (void *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	/* (uint64_t)-1 == "not recovering" (see struct log_c) */
	lc->recovering_region = (uint64_t)-1;

	if (pkg->in_sync) {
		if (log_test_bit(lc->sync_bits, pkg->region)) {
			LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Region already set (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)pkg->region);
		} else {
			log_set_bit(lc, lc->sync_bits, pkg->region);
			lc->sync_count++;

			/* The rest of this section is all for debugging */
			LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
				   "Setting region (%llu)",
				   rq->seq, SHORT_UUID(lc->uuid), originator,
				   (unsigned long long)pkg->region);

			/* A previously-skipped region finally came in */
			if (pkg->region == lc->skip_bit_warning)
				lc->skip_bit_warning = lc->region_count;

			if (pkg->region > (lc->skip_bit_warning + 5)) {
				LOG_ERROR("*** Region #%llu skipped during recovery ***",
					  (unsigned long long)lc->skip_bit_warning);
				lc->skip_bit_warning = lc->region_count;
#ifdef DEBUG
				kill(getpid(), SIGUSR1);
#endif
			}

			/* Regions are expected to come in roughly in order;
			   remember a gap so it can be reported if never filled */
			if (!log_test_bit(lc->sync_bits,
					  (pkg->region) ? pkg->region - 1 : 0)) {
				LOG_SPRINT(lc, "*** Previous bit not set ***");
				lc->skip_bit_warning = (pkg->region) ?
					pkg->region - 1 : 0;
			}
		}
	} else if (log_test_bit(lc->sync_bits, pkg->region)) {
		/* in_sync == 0: region has fallen out of sync */
		lc->sync_count--;
		log_clear_bit(lc, lc->sync_bits, pkg->region);
		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "Unsetting region (%llu)",
			   rq->seq, SHORT_UUID(lc->uuid), originator,
			   (unsigned long long)pkg->region);
	}

	/* Self-check: counter must match the bitmap; if they ever
	   diverge, resynchronize the counter from the bitmap. */
	if (lc->sync_count != count_bits32(lc->sync_bits)) {
		unsigned long long reset = count_bits32(lc->sync_bits);

		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "sync_count(%llu) != bitmap count(%llu)",
			   rq->seq, SHORT_UUID(lc->uuid), originator,
			   (unsigned long long)lc->sync_count, reset);
#ifdef DEBUG
		kill(getpid(), SIGUSR1);
#endif
		lc->sync_count = reset;
	}

	if (lc->sync_count > lc->region_count)
		LOG_SPRINT(lc, "SET - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "(lc->sync_count > lc->region_count) - this is bad",
			   rq->seq, SHORT_UUID(lc->uuid), originator);

	/* No reply payload for this request */
	rq->data_size = 0;
	return 0;
}

/*
 * clog_get_sync_count
 * @rq
 *
 * Return the number of in-sync regions via rq->data (uint64_t).
 */
static int clog_get_sync_count(struct dm_ulog_request *rq, uint32_t originator)
{
	uint64_t *sync_count = (uint64_t *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	/*
	 * FIXME: Mirror requires us to be able to ask for
	 * the sync count while pending... but I don't like
	 * it because other machines may not be suspended and
	 * the stored value may not be accurate.
	 */
	if (!lc)
		lc = get_pending_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	*sync_count = lc->sync_count;

	rq->data_size = sizeof(*sync_count);

	/* Same counter/bitmap consistency self-check as above */
	if (lc->sync_count != count_bits32(lc->sync_bits)) {
		unsigned long long reset = count_bits32(lc->sync_bits);

		LOG_SPRINT(lc, "get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: "
			   "sync_count(%llu) != bitmap count(%llu)",
			   rq->seq, SHORT_UUID(lc->uuid), originator,
			   (unsigned long long)lc->sync_count, reset);
#ifdef DEBUG
		kill(getpid(), SIGUSR1);
#endif
		lc->sync_count = reset;
	}

	return 0;
}
1348 */ 1349 if (!lc) 1350 lc = get_pending_log(rq->uuid, rq->luid); 1351 1352 if (!lc) 1353 return -EINVAL; 1354 1355 *sync_count = lc->sync_count; 1356 1357 rq->data_size = sizeof(*sync_count); 1358 1359 if (lc->sync_count != count_bits32(lc->sync_bits)) { 1360 unsigned long long reset = count_bits32(lc->sync_bits); 1361 1362 LOG_SPRINT(lc, "get_sync_count - SEQ#=%u, UUID=%s, nodeid = %u:: " 1363 "sync_count(%llu) != bitmap count(%llu)", 1364 rq->seq, SHORT_UUID(lc->uuid), originator, 1365 (unsigned long long)lc->sync_count, reset); 1366 #ifdef DEBUG 1367 kill(getpid(), SIGUSR1); 1368 #endif 1369 lc->sync_count = reset; 1370 } 1371 1372 return 0; 1373 } 1374 1375 static int core_status_info(struct log_c *lc, struct dm_ulog_request *rq) 1376 { 1377 char *data = (char *)rq->data; 1378 1379 rq->data_size = sprintf(data, "1 clustered_core"); 1380 1381 return 0; 1382 } 1383 1384 static int disk_status_info(struct log_c *lc, struct dm_ulog_request *rq) 1385 { 1386 char *data = (char *)rq->data; 1387 struct stat statbuf; 1388 1389 if(fstat(lc->disk_fd, &statbuf)) { 1390 rq->error = -errno; 1391 return -errno; 1392 } 1393 1394 rq->data_size = sprintf(data, "3 clustered_disk %d:%d %c", 1395 major(statbuf.st_rdev), minor(statbuf.st_rdev), 1396 (lc->log_dev_failed) ? 
'D' : 'A'); 1397 1398 return 0; 1399 } 1400 1401 /* 1402 * clog_status_info 1403 * @rq 1404 * 1405 */ 1406 static int clog_status_info(struct dm_ulog_request *rq) 1407 { 1408 int r; 1409 struct log_c *lc = get_log(rq->uuid, rq->luid); 1410 1411 if (!lc) 1412 lc = get_pending_log(rq->uuid, rq->luid); 1413 1414 if (!lc) 1415 return -EINVAL; 1416 1417 if (lc->disk_fd == -1) 1418 r = core_status_info(lc, rq); 1419 else 1420 r = disk_status_info(lc, rq); 1421 1422 return r; 1423 } 1424 1425 static int core_status_table(struct log_c *lc, struct dm_ulog_request *rq) 1426 { 1427 char *data = (char *)rq->data; 1428 1429 rq->data_size = sprintf(data, "clustered_core %u %s%s ", 1430 lc->region_size, 1431 (lc->sync == DEFAULTSYNC) ? "" : 1432 (lc->sync == NOSYNC) ? "nosync " : "sync ", 1433 (lc->block_on_error) ? "block_on_error" : ""); 1434 return 0; 1435 } 1436 1437 static int disk_status_table(struct log_c *lc, struct dm_ulog_request *rq) 1438 { 1439 char *data = (char *)rq->data; 1440 struct stat statbuf; 1441 1442 if(fstat(lc->disk_fd, &statbuf)) { 1443 rq->error = -errno; 1444 return -errno; 1445 } 1446 1447 rq->data_size = sprintf(data, "clustered_disk %d:%d %u %s%s ", 1448 major(statbuf.st_rdev), minor(statbuf.st_rdev), 1449 lc->region_size, 1450 (lc->sync == DEFAULTSYNC) ? "" : 1451 (lc->sync == NOSYNC) ? "nosync " : "sync ", 1452 (lc->block_on_error) ? 
/*
 * clog_status_table
 * @rq
 *
 * Dispatch STATUS_TABLE to the core or disk variant; consult the
 * pending list as well as the official one.
 */
static int clog_status_table(struct dm_ulog_request *rq)
{
	int r;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		lc = get_pending_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (lc->disk_fd == -1)
		r = core_status_table(lc, rq);
	else
		r = disk_status_table(lc, rq);

	return r;
}

/*
 * clog_is_remote_recovering
 * @rq
 *
 * Payload in: uint64_t region.
 * Payload out: { int64_t is_recovering; uint64_t in_sync_hint; }
 * Reports whether 'region' still needs (or is under) recovery and,
 * if pending, queues it on the priority recovery_request_list so
 * clog_get_resync_work hands it out first.
 */
static int clog_is_remote_recovering(struct dm_ulog_request *rq)
{
	uint64_t region = *((uint64_t *)(rq->data));
	struct {
		int64_t is_recovering;
		uint64_t in_sync_hint;
	} *pkg = (void *)rq->data;
	struct log_c *lc = get_log(rq->uuid, rq->luid);

	if (!lc)
		return -EINVAL;

	if (region > lc->region_count)
		return -EINVAL;

	if (lc->recovery_halted) {
		LOG_DBG("[%s] Recovery halted... [not remote recovering]: %llu",
			SHORT_UUID(lc->uuid), (unsigned long long)region);
		pkg->is_recovering = 0;
		pkg->in_sync_hint = lc->region_count; /* none are recovering */
	} else {
		pkg->is_recovering = !log_test_bit(lc->sync_bits, region);

		/*
		 * Remember, 'lc->sync_search' is 1 plus the region
		 * currently being recovered.  So, we must take off 1
		 * to account for that; but only if 'sync_search > 1'.
		 */
		pkg->in_sync_hint = lc->sync_search ? (lc->sync_search - 1) : 0;
		LOG_DBG("[%s] Region is %s: %llu",
			SHORT_UUID(lc->uuid),
			(region == lc->recovering_region) ?
			"currently remote recovering" :
			(pkg->is_recovering) ? "pending remote recovery" :
			"not remote recovering", (unsigned long long)region);
	}

	if (pkg->is_recovering &&
	    (region != lc->recovering_region)) {
		struct recovery_request *rr;

		/* Already in the list? */
		for (rr = lc->recovery_request_list; rr; rr = rr->next)
			if (rr->region == region)
				goto out;

		/* Failure to allocate simply means we can't prioritize it */
		rr = malloc(sizeof(*rr));
		if (!rr)
			goto out;

		LOG_DBG("[%s] Adding region to priority list: %llu",
			SHORT_UUID(lc->uuid), (unsigned long long)region);
		rr->region = region;
		rr->next = lc->recovery_request_list;
		lc->recovery_request_list = rr;
	}

out:

	rq->data_size = sizeof(*pkg);

	return 0;
}


/*
 * do_request
 * @rq: the request
 * @server: is this request performed by the server
 *
 * An inability to perform this function will return an error
 * from this function.  However, an inability to successfully
 * perform the request will fill in the 'rq->error' field.
 *
 * Returns: 0 on success, -EXXX on error
 */
int do_request(struct clog_request *rq, int server)
{
	int r;

	if (!rq)
		return 0;

	if (rq->u_rq.error)
		LOG_DBG("Programmer error: rq struct has error set");

	/* Dispatch on the dm-log-userspace request type */
	switch (rq->u_rq.request_type) {
	case DM_ULOG_CTR:
		r = clog_ctr(&rq->u_rq);
		break;
	case DM_ULOG_DTR:
		r = clog_dtr(&rq->u_rq);
		break;
	case DM_ULOG_PRESUSPEND:
		r = clog_presuspend(&rq->u_rq);
		break;
	case DM_ULOG_POSTSUSPEND:
		r = clog_postsuspend(&rq->u_rq);
		break;
	case DM_ULOG_RESUME:
		r = clog_resume(&rq->u_rq);
		break;
	case DM_ULOG_GET_REGION_SIZE:
		r = clog_get_region_size(&rq->u_rq);
		break;
	case DM_ULOG_IS_CLEAN:
		r = clog_is_clean(&rq->u_rq);
		break;
	case DM_ULOG_IN_SYNC:
		r = clog_in_sync(&rq->u_rq);
		break;
	case DM_ULOG_FLUSH:
		r = clog_flush(&rq->u_rq, server);
		break;
	case DM_ULOG_MARK_REGION:
		r = clog_mark_region(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_CLEAR_REGION:
		r = clog_clear_region(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_GET_RESYNC_WORK:
		r = clog_get_resync_work(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_SET_REGION_SYNC:
		r = clog_set_region_sync(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_GET_SYNC_COUNT:
		r = clog_get_sync_count(&rq->u_rq, rq->originator);
		break;
	case DM_ULOG_STATUS_INFO:
		r = clog_status_info(&rq->u_rq);
		break;
	case DM_ULOG_STATUS_TABLE:
		r = clog_status_table(&rq->u_rq);
		break;
	case DM_ULOG_IS_REMOTE_RECOVERING:
		r = clog_is_remote_recovering(&rq->u_rq);
		break;
	default:
		LOG_ERROR("Unknown request");
		r = rq->u_rq.error = -EINVAL;
		break;
	}

	/* Propagate the handler's return into rq->error if unset */
	if (r && !rq->u_rq.error)
		rq->u_rq.error = r;
	else if (r != rq->u_rq.error)
		LOG_DBG("Warning: error from function != rq->u_rq.error");

	if (rq->u_rq.error && rq->u_rq.data_size) {
		/* Make sure I'm handling errors correctly above */
		LOG_DBG("Programmer error: rq->u_rq.error && rq->u_rq.data_size");
		rq->u_rq.data_size = 0;
	}

	return 0;
}
/*
 * print_bits
 * @buf: raw bytes to dump
 * @size: number of bytes
 * @print: non-zero => LOG_PRINT, zero => LOG_DBG
 *
 * Hex-dump 'buf', 16 bytes per output line, each line prefixed
 * with its "[lo - hi]" byte-index range.
 */
static void print_bits(char *buf, int size, int print)
{
	int i;
	char outbuf[128];

	memset(outbuf, 0, sizeof(outbuf));

	for (i = 0; i < size; i++) {
		if (!(i % 16)) {
			/* Flush the previous (complete) line, if any */
			if (outbuf[0] != '\0') {
				if (print)
					LOG_PRINT("%s", outbuf);
				else
					LOG_DBG("%s", outbuf);
			}
			memset(outbuf, 0, sizeof(outbuf));
			sprintf(outbuf, "[%3d - %3d]", i, i+15);
		}
		sprintf(outbuf + strlen(outbuf), " %.2X", (unsigned char)buf[i]);
	}
	/* Flush the final, possibly partial, line */
	if (outbuf[0] != '\0') {
		if (print)
			LOG_PRINT("%s", outbuf);
		else
			LOG_DBG("%s", outbuf);
	}
}

/* int store_bits(const char *uuid, const char *which, char **buf)*/
/*
 * push_state
 *
 * Serialize a piece of log state ("recovering_region", "sync_bits"
 * or "clean_bits") into a freshly malloc'd *buf for checkpointing
 * to another node.  Caller owns and must free *buf.
 *
 * Returns: number of bytes written to *buf, or -EXXX on error
 *
 * NOTE(review): bitset sizing below reads lc->clean_bits[0] as the
 * bit count — assumes word 0 of a dm_bitset_t holds the number of
 * bits (libdevmapper convention); verify against libdevmapper.
 */
int push_state(const char *uuid, uint64_t luid,
	       const char *which, char **buf, uint32_t debug_who)
{
	int bitset_size;
	struct log_c *lc;

	if (*buf)
		LOG_ERROR("store_bits: *buf != NULL");

	lc = get_log(uuid, luid);
	if (!lc) {
		LOG_ERROR("store_bits: No log found for %s", uuid);
		return -EINVAL;
	}

	if (!strcmp(which, "recovering_region")) {
		*buf = malloc(64); /* easily handles the 2 written numbers */
		if (!*buf)
			return -ENOMEM;
		sprintf(*buf, "%llu %u", (unsigned long long)lc->recovering_region,
			lc->recoverer);

		LOG_SPRINT(lc, "CKPT SEND - SEQ#=X, UUID=%s, nodeid = %u:: "
			   "recovering_region=%llu, recoverer=%u, sync_count=%llu",
			   SHORT_UUID(lc->uuid), debug_who,
			   (unsigned long long)lc->recovering_region,
			   lc->recoverer,
			   (unsigned long long)count_bits32(lc->sync_bits));
		return 64;
	}

	/* Size in 'int's */
	bitset_size = ((int)lc->clean_bits[0]/DM_BITS_PER_INT) + 1;

	/* Size in bytes */
	bitset_size *= 4;

	*buf = malloc(bitset_size);

	if (!*buf) {
		LOG_ERROR("store_bits: Unable to allocate memory");
		return -ENOMEM;
	}

	/* '+ 1' skips the bitset's leading count word; only the
	   payload words are checkpointed */
	if (!strncmp(which, "sync_bits", 9)) {
		memcpy(*buf, lc->sync_bits + 1, bitset_size);
		LOG_DBG("[%s] storing sync_bits (sync_count = %llu):",
			SHORT_UUID(uuid), (unsigned long long)
			count_bits32(lc->sync_bits));
		print_bits(*buf, bitset_size, 0);
	} else if (!strncmp(which, "clean_bits", 9)) {
		memcpy(*buf, lc->clean_bits + 1, bitset_size);
		LOG_DBG("[%s] storing clean_bits:", SHORT_UUID(lc->uuid));
		print_bits(*buf, bitset_size, 0);
	}

	return bitset_size;
}

/*int load_bits(const char *uuid, const char *which, char *buf, int size)*/
/*
 * pull_state
 *
 * Inverse of push_state: load checkpointed state ("recovering_region",
 * "sync_bits" or "clean_bits") from 'buf' into the named log.
 * Bumps lc->resume_override (+1 sync, +2 clean) so resume knows
 * which bitsets arrived via checkpoint.
 *
 * Returns: 0 on success, -EXXX on error
 */
int pull_state(const char *uuid, uint64_t luid,
	       const char *which, char *buf, int size)
{
	int bitset_size;
	struct log_c *lc;

	if (!buf)
		LOG_ERROR("pull_state: buf == NULL");

	lc = get_log(uuid, luid);
	if (!lc) {
		LOG_ERROR("pull_state: No log found for %s", uuid);
		return -EINVAL;
	}

	if (!strncmp(which, "recovering_region", 17)) {
		sscanf(buf, "%llu %u", (unsigned long long *)&lc->recovering_region,
		       &lc->recoverer);
		LOG_SPRINT(lc, "CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: "
			   "recovering_region=%llu, recoverer=%u",
			   SHORT_UUID(lc->uuid),
			   (unsigned long long)lc->recovering_region, lc->recoverer);
		return 0;
	}

	/* Size in 'int's */
	bitset_size = ((int)lc->clean_bits[0]/DM_BITS_PER_INT) + 1;

	/* Size in bytes */
	bitset_size *= 4;

	/* Sender and receiver must agree on the bitset size */
	if (bitset_size != size) {
		LOG_ERROR("pull_state(%s): bad bitset_size (%d vs %d)",
			  which, size, bitset_size);
		return -EINVAL;
	}

	/* '+ 1' skips the bitset's leading count word, as in push_state */
	if (!strncmp(which, "sync_bits", 9)) {
		lc->resume_override += 1;
		memcpy(lc->sync_bits + 1, buf, bitset_size);
		LOG_DBG("[%s] loading sync_bits (sync_count = %llu):",
			SHORT_UUID(lc->uuid), (unsigned long long)
			count_bits32(lc->sync_bits));
		print_bits((char *)lc->sync_bits, bitset_size, 0);
	} else if (!strncmp(which, "clean_bits", 9)) {
		lc->resume_override += 2;
		memcpy(lc->clean_bits + 1, buf, bitset_size);
		LOG_DBG("[%s] loading clean_bits:", SHORT_UUID(lc->uuid));
		print_bits((char *)lc->clean_bits, bitset_size, 0);
	}

	return 0;
}
found for %s", uuid); 1745 return -EINVAL; 1746 } 1747 1748 if (!strncmp(which, "recovering_region", 17)) { 1749 sscanf(buf, "%llu %u", (unsigned long long *)&lc->recovering_region, 1750 &lc->recoverer); 1751 LOG_SPRINT(lc, "CKPT INIT - SEQ#=X, UUID=%s, nodeid = X:: " 1752 "recovering_region=%llu, recoverer=%u", 1753 SHORT_UUID(lc->uuid), 1754 (unsigned long long)lc->recovering_region, lc->recoverer); 1755 return 0; 1756 } 1757 1758 /* Size in 'int's */ 1759 bitset_size = ((int)lc->clean_bits[0]/DM_BITS_PER_INT) + 1; 1760 1761 /* Size in bytes */ 1762 bitset_size *= 4; 1763 1764 if (bitset_size != size) { 1765 LOG_ERROR("pull_state(%s): bad bitset_size (%d vs %d)", 1766 which, size, bitset_size); 1767 return -EINVAL; 1768 } 1769 1770 if (!strncmp(which, "sync_bits", 9)) { 1771 lc->resume_override += 1; 1772 memcpy(lc->sync_bits + 1, buf, bitset_size); 1773 LOG_DBG("[%s] loading sync_bits (sync_count = %llu):", 1774 SHORT_UUID(lc->uuid),(unsigned long long) 1775 count_bits32(lc->sync_bits)); 1776 print_bits((char *)lc->sync_bits, bitset_size, 0); 1777 } else if (!strncmp(which, "clean_bits", 9)) { 1778 lc->resume_override += 2; 1779 memcpy(lc->clean_bits + 1, buf, bitset_size); 1780 LOG_DBG("[%s] loading clean_bits:", SHORT_UUID(lc->uuid)); 1781 print_bits((char *)lc->clean_bits, bitset_size, 0); 1782 } 1783 1784 return 0; 1785 } 1786 1787 int log_get_state(struct dm_ulog_request *rq) 1788 { 1789 struct log_c *lc; 1790 1791 lc = get_log(rq->uuid, rq->luid); 1792 if (!lc) 1793 return -EINVAL; 1794 1795 return lc->state; 1796 } 1797 1798 /* 1799 * log_status 1800 * 1801 * Returns: 1 if logs are still present, 0 otherwise 1802 */ 1803 int log_status(void) 1804 { 1805 if (!dm_list_empty(&log_list) || !dm_list_empty(&log_pending_list)) 1806 return 1; 1807 1808 return 0; 1809 } 1810 1811 void log_debug(void) 1812 { 1813 struct log_c *lc; 1814 uint64_t r; 1815 int i; 1816 1817 LOG_ERROR(""); 1818 LOG_ERROR("LOG COMPONENT DEBUGGING::"); 1819 LOG_ERROR("Official log list:"); 
1820 LOG_ERROR("Pending log list:"); 1821 dm_list_iterate_items(lc, &log_pending_list) { 1822 LOG_ERROR("%s", lc->uuid); 1823 LOG_ERROR("sync_bits:"); 1824 print_bits((char *)lc->sync_bits, (int)lc->sync_bits[0], 1); 1825 LOG_ERROR("clean_bits:"); 1826 print_bits((char *)lc->clean_bits, (int)lc->sync_bits[0], 1); 1827 } 1828 1829 dm_list_iterate_items(lc, &log_list) { 1830 LOG_ERROR("%s", lc->uuid); 1831 LOG_ERROR(" recoverer : %u", lc->recoverer); 1832 LOG_ERROR(" recovering_region: %llu", 1833 (unsigned long long)lc->recovering_region); 1834 LOG_ERROR(" recovery_halted : %s", (lc->recovery_halted) ? 1835 "YES" : "NO"); 1836 LOG_ERROR("sync_bits:"); 1837 print_bits((char *)lc->sync_bits, (int)lc->sync_bits[0], 1); 1838 LOG_ERROR("clean_bits:"); 1839 print_bits((char *)lc->clean_bits, (int)lc->sync_bits[0], 1); 1840 1841 LOG_ERROR("Validating %s::", SHORT_UUID(lc->uuid)); 1842 r = find_next_zero_bit(lc->sync_bits, 0); 1843 LOG_ERROR(" lc->region_count = %llu", 1844 (unsigned long long)lc->region_count); 1845 LOG_ERROR(" lc->sync_count = %llu", 1846 (unsigned long long)lc->sync_count); 1847 LOG_ERROR(" next zero bit = %llu", 1848 (unsigned long long)r); 1849 if ((r > lc->region_count) || 1850 ((r == lc->region_count) && (lc->sync_count > lc->region_count))) { 1851 LOG_ERROR("ADJUSTING SYNC_COUNT"); 1852 lc->sync_count = lc->region_count; 1853 } 1854 1855 LOG_ERROR("Resync request history:"); 1856 for (i = 0; i < RESYNC_HISTORY; i++) { 1857 lc->idx++; 1858 lc->idx = lc->idx % RESYNC_HISTORY; 1859 if (lc->resync_history[lc->idx][0] == '\0') 1860 continue; 1861 LOG_ERROR("%d:%d) %s", i, lc->idx, 1862 lc->resync_history[lc->idx]); 1863 } 1864 } 1865 } 1866