1 /* 2 * Copyright (c) 2006 The DragonFly Project. All rights reserved. 3 * 4 * This code is derived from software contributed to The DragonFly Project 5 * by Matthew Dillon <dillon@backplane.com> 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in 15 * the documentation and/or other materials provided with the 16 * distribution. 17 * 3. Neither the name of The DragonFly Project nor the names of its 18 * contributors may be used to endorse or promote products derived 19 * from this software without specific, prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, 27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED 29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT 31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 32 * SUCH DAMAGE. 33 * 34 * $DragonFly: src/sys/platform/vkernel/platform/init.c,v 1.56 2008/05/27 07:48:00 dillon Exp $ 35 */ 36 37 #include <sys/types.h> 38 #include <sys/systm.h> 39 #include <sys/kernel.h> 40 #include <sys/stat.h> 41 #include <sys/mman.h> 42 #include <sys/cons.h> 43 #include <sys/random.h> 44 #include <sys/vkernel.h> 45 #include <sys/tls.h> 46 #include <sys/reboot.h> 47 #include <sys/proc.h> 48 #include <sys/msgbuf.h> 49 #include <sys/vmspace.h> 50 #include <sys/socket.h> 51 #include <sys/sockio.h> 52 #include <sys/sysctl.h> 53 #include <sys/un.h> 54 #include <vm/vm_page.h> 55 56 #include <machine/cpu.h> 57 #include <machine/globaldata.h> 58 #include <machine/tls.h> 59 #include <machine/md_var.h> 60 #include <machine/vmparam.h> 61 #include <cpu/specialreg.h> 62 63 #include <net/if.h> 64 #include <net/if_arp.h> 65 #include <net/ethernet.h> 66 #include <net/bridge/if_bridgevar.h> 67 #include <netinet/in.h> 68 #include <arpa/inet.h> 69 70 #include <stdio.h> 71 #include <stdlib.h> 72 #include <stdarg.h> 73 #include <unistd.h> 74 #include <fcntl.h> 75 #include <string.h> 76 #include <err.h> 77 #include <errno.h> 78 #include <assert.h> 79 80 vm_paddr_t phys_avail[16]; 81 vm_paddr_t Maxmem; 82 vm_paddr_t Maxmem_bytes; 83 int MemImageFd = -1; 84 struct vkdisk_info DiskInfo[VKDISK_MAX]; 85 int DiskNum; 86 struct vknetif_info NetifInfo[VKNETIF_MAX]; 87 int NetifNum; 88 char *pid_file; 89 vm_offset_t KvaStart; 90 vm_offset_t KvaEnd; 91 vm_offset_t KvaSize; 92 vm_offset_t virtual_start; 93 vm_offset_t virtual_end; 94 vm_offset_t virtual2_start; 95 vm_offset_t virtual2_end; 96 vm_offset_t kernel_vm_end; 97 vm_offset_t crashdumpmap; 98 vm_offset_t clean_sva; 99 vm_offset_t clean_eva; 100 struct msgbuf *msgbufp; 101 caddr_t ptvmmap; 102 vpte_t *KernelPTD; 103 vpte_t *KernelPTA; /* Warning: Offset for direct VA translation */ 104 void *dmap_min_address; 105 u_int cpu_feature; /* XXX */ 106 int tsc_present; 107 int64_t tsc_frequency; 108 int optcpus; /* number of cpus - see mp_start() */ 109 int lwp_cpu_lock; /* if/how to lock virtual CPUs to real CPUs */ 110 int real_ncpus; /* number of real CPUs */ 111 int next_cpu; /* next real CPU to lock a virtual CPU to */ 112 113 struct privatespace *CPU_prvspace; 114 115 static struct trapframe proc0_tf; 116 static void *proc0paddr; 117 118 static void init_sys_memory(char *imageFile); 119 static void init_kern_memory(void); 120 static void init_globaldata(void); 121 static void init_vkernel(void); 122 static void init_disk(char *diskExp[], int diskFileNum, enum vkdisk_type type); 123 static void init_netif(char *netifExp[], int netifFileNum); 124 static void writepid( void ); 125 static void cleanpid( void ); 126 static int unix_connect(const char *path); 127 static void usage(const char *ctl, ...); 128 129 static int save_ac; 130 static char **save_av; 131 132 /* 133 * Kernel startup for virtual kernels - standard main() 134 */ 135 int 136 main(int ac, char **av) 137 { 138 char *memImageFile = NULL; 139 char *netifFile[VKNETIF_MAX]; 140 char *diskFile[VKDISK_MAX]; 141 char *cdFile[VKDISK_MAX]; 142 char *suffix; 143 char *endp; 144 int netifFileNum = 0; 145 int diskFileNum = 0; 146 int cdFileNum = 0; 147 int bootOnDisk = -1; /* set below to vcd (0) or vkd (1) */ 148 int c; 149 int i; 150 int j; 151 int n; 152 int isq; 153 int real_vkernel_enable; 154 int supports_sse; 155 size_t vsize; 156 157 save_ac = ac; 158 save_av = av; 159 160 /* 161 * Process options 162 */ 163 kernel_mem_readonly = 1; 164 #ifdef SMP 165 optcpus = 2; 166 #endif 167 lwp_cpu_lock = LCL_NONE; 168 169 real_vkernel_enable = 0; 170 vsize = sizeof(real_vkernel_enable); 171 sysctlbyname("vm.vkernel_enable", &real_vkernel_enable, &vsize, NULL,0); 172 173 if (real_vkernel_enable == 0) { 174 errx(1, "vm.vkernel_enable is 0, must be set " 175 "to 1 to execute a vkernel!"); 176 } 177 178 real_ncpus = 1; 179 vsize = sizeof(real_ncpus); 180 sysctlbyname("hw.ncpu", &real_ncpus, &vsize, NULL, 0); 181 182 while ((c = getopt(ac, av, "c:svl:m:n:r:e:i:p:I:U")) != -1) { 183 switch(c) { 184 case 'e': 185 /* 186 * name=value:name=value:name=value... 187 * name="value"... 188 * 189 * Allow values to be quoted but note that shells 190 * may remove the quotes, so using this feature 191 * to embed colons may require a backslash. 192 */ 193 n = strlen(optarg); 194 isq = 0; 195 kern_envp = malloc(n + 2); 196 for (i = j = 0; i < n; ++i) { 197 if (optarg[i] == '"') 198 isq ^= 1; 199 else if (optarg[i] == '\'') 200 isq ^= 2; 201 else if (isq == 0 && optarg[i] == ':') 202 kern_envp[j++] = 0; 203 else 204 kern_envp[j++] = optarg[i]; 205 } 206 kern_envp[j++] = 0; 207 kern_envp[j++] = 0; 208 break; 209 case 's': 210 boothowto |= RB_SINGLE; 211 break; 212 case 'v': 213 bootverbose = 1; 214 break; 215 case 'i': 216 memImageFile = optarg; 217 break; 218 case 'I': 219 if (netifFileNum < VKNETIF_MAX) 220 netifFile[netifFileNum++] = strdup(optarg); 221 break; 222 case 'r': 223 if (bootOnDisk < 0) 224 bootOnDisk = 1; 225 if (diskFileNum + cdFileNum < VKDISK_MAX) 226 diskFile[diskFileNum++] = strdup(optarg); 227 break; 228 case 'c': 229 if (bootOnDisk < 0) 230 bootOnDisk = 0; 231 if (diskFileNum + cdFileNum < VKDISK_MAX) 232 cdFile[cdFileNum++] = strdup(optarg); 233 break; 234 case 'm': 235 Maxmem_bytes = strtoull(optarg, &suffix, 0); 236 if (suffix) { 237 switch(*suffix) { 238 case 'g': 239 case 'G': 240 Maxmem_bytes <<= 30; 241 break; 242 case 'm': 243 case 'M': 244 Maxmem_bytes <<= 20; 245 break; 246 case 'k': 247 case 'K': 248 Maxmem_bytes <<= 10; 249 break; 250 default: 251 Maxmem_bytes = 0; 252 usage("Bad maxmem option"); 253 /* NOT REACHED */ 254 break; 255 } 256 } 257 break; 258 case 'l': 259 next_cpu = -1; 260 if (strncmp("map", optarg, 3) == 0) { 261 lwp_cpu_lock = LCL_PER_CPU; 262 if (optarg[3] == ',') { 263 next_cpu = strtol(optarg+4, &endp, 0); 264 if (*endp != '\0') 265 usage("Bad target CPU number at '%s'", endp); 266 } else { 267 next_cpu = 0; 268 } 269 if (next_cpu < 0 || next_cpu > real_ncpus - 1) 270 usage("Bad target CPU, valid range is 0-%d", real_ncpus - 1); 271 } else if (strncmp("any", optarg, 3) == 0) { 272 lwp_cpu_lock = LCL_NONE; 273 } else { 274 lwp_cpu_lock = LCL_SINGLE_CPU; 275 next_cpu = strtol(optarg, &endp, 0); 276 if (*endp != '\0') 277 usage("Bad target CPU number at '%s'", endp); 278 if (next_cpu < 0 || next_cpu > real_ncpus - 1) 279 usage("Bad target CPU, valid range is 0-%d", real_ncpus - 1); 280 } 281 break; 282 case 'n': 283 /* 284 * This value is set up by mp_start(), don't just 285 * set ncpus here. 286 */ 287 #ifdef SMP 288 optcpus = strtol(optarg, NULL, 0); 289 if (optcpus < 1 || optcpus > MAXCPU) 290 usage("Bad ncpus, valid range is 1-%d", MAXCPU); 291 #else 292 if (strtol(optarg, NULL, 0) != 1) { 293 usage("You built a UP vkernel, only 1 cpu!"); 294 } 295 #endif 296 297 break; 298 case 'p': 299 pid_file = optarg; 300 break; 301 case 'U': 302 kernel_mem_readonly = 0; 303 break; 304 } 305 } 306 307 writepid(); 308 cpu_disable_intr(); 309 init_sys_memory(memImageFile); 310 init_kern_memory(); 311 init_globaldata(); 312 init_vkernel(); 313 setrealcpu(); 314 init_kqueue(); 315 316 /* 317 * Check TSC 318 */ 319 vsize = sizeof(tsc_present); 320 sysctlbyname("hw.tsc_present", &tsc_present, &vsize, NULL, 0); 321 vsize = sizeof(tsc_frequency); 322 sysctlbyname("hw.tsc_frequency", &tsc_frequency, &vsize, NULL, 0); 323 if (tsc_present) 324 cpu_feature |= CPUID_TSC; 325 326 /* 327 * Check SSE 328 */ 329 vsize = sizeof(supports_sse); 330 supports_sse = 0; 331 sysctlbyname("hw.instruction_sse", &supports_sse, &vsize, NULL, 0); 332 init_fpu(supports_sse); 333 if (supports_sse) 334 cpu_feature |= CPUID_SSE | CPUID_FXSR; 335 336 /* 337 * We boot from the first installed disk. 338 */ 339 if (bootOnDisk == 1) { 340 init_disk(diskFile, diskFileNum, VKD_DISK); 341 init_disk(cdFile, cdFileNum, VKD_CD); 342 } else { 343 init_disk(cdFile, cdFileNum, VKD_CD); 344 init_disk(diskFile, diskFileNum, VKD_DISK); 345 } 346 init_netif(netifFile, netifFileNum); 347 init_exceptions(); 348 mi_startup(); 349 /* NOT REACHED */ 350 exit(1); 351 } 352 353 /* 354 * Initialize system memory. This is the virtual kernel's 'RAM'. 355 */ 356 static 357 void 358 init_sys_memory(char *imageFile) 359 { 360 struct stat st; 361 int i; 362 int fd; 363 364 /* 365 * Figure out the system memory image size. If an image file was 366 * specified and -m was not specified, use the image file's size. 367 */ 368 369 if (imageFile && stat(imageFile, &st) == 0 && Maxmem_bytes == 0) 370 Maxmem_bytes = (vm_paddr_t)st.st_size; 371 if ((imageFile == NULL || stat(imageFile, &st) < 0) && 372 Maxmem_bytes == 0) { 373 err(1, "Cannot create new memory file %s unless " 374 "system memory size is specified with -m", 375 imageFile); 376 /* NOT REACHED */ 377 } 378 379 /* 380 * Maxmem must be known at this time 381 */ 382 if (Maxmem_bytes < 32 * 1024 * 1024 || (Maxmem_bytes & SEG_MASK)) { 383 err(1, "Bad maxmem specification: 32MB minimum, " 384 "multiples of %dMB only", 385 SEG_SIZE / 1024 / 1024); 386 /* NOT REACHED */ 387 } 388 389 /* 390 * Generate an image file name if necessary, then open/create the 391 * file exclusively locked. Do not allow multiple virtual kernels 392 * to use the same image file. 393 */ 394 if (imageFile == NULL) { 395 for (i = 0; i < 1000000; ++i) { 396 asprintf(&imageFile, "/var/vkernel/memimg.%06d", i); 397 fd = open(imageFile, 398 O_RDWR|O_CREAT|O_EXLOCK|O_NONBLOCK, 0644); 399 if (fd < 0 && errno == EWOULDBLOCK) { 400 free(imageFile); 401 continue; 402 } 403 break; 404 } 405 } else { 406 fd = open(imageFile, O_RDWR|O_CREAT|O_EXLOCK|O_NONBLOCK, 0644); 407 } 408 fprintf(stderr, "Using memory file: %s\n", imageFile); 409 if (fd < 0 || fstat(fd, &st) < 0) { 410 err(1, "Unable to open/create %s", imageFile); 411 /* NOT REACHED */ 412 } 413 414 /* 415 * Truncate or extend the file as necessary. 416 */ 417 if (st.st_size > Maxmem_bytes) { 418 ftruncate(fd, Maxmem_bytes); 419 } else if (st.st_size < Maxmem_bytes) { 420 char *zmem; 421 off_t off = st.st_size & ~SEG_MASK; 422 423 kprintf("%s: Reserving blocks for memory image\n", imageFile); 424 zmem = malloc(SEG_SIZE); 425 bzero(zmem, SEG_SIZE); 426 lseek(fd, off, SEEK_SET); 427 while (off < Maxmem_bytes) { 428 if (write(fd, zmem, SEG_SIZE) != SEG_SIZE) { 429 err(1, "Unable to reserve blocks for memory image"); 430 /* NOT REACHED */ 431 } 432 off += SEG_SIZE; 433 } 434 if (fsync(fd) < 0) 435 err(1, "Unable to reserve blocks for memory image"); 436 free(zmem); 437 } 438 MemImageFd = fd; 439 Maxmem = Maxmem_bytes >> PAGE_SHIFT; 440 } 441 442 /* 443 * Initialize kernel memory. This reserves kernel virtual memory by using 444 * MAP_VPAGETABLE 445 */ 446 447 static 448 void 449 init_kern_memory(void) 450 { 451 void *base; 452 void *try; 453 char dummy; 454 char *topofstack = &dummy; 455 int i; 456 void *firstfree; 457 458 /* 459 * Memory map our kernel virtual memory space. Note that the 460 * kernel image itself is not made part of this memory for the 461 * moment. 462 * 463 * The memory map must be segment-aligned so we can properly 464 * offset KernelPTD. 465 * 466 * If the system kernel has a different MAXDSIZ, it might not 467 * be possible to map kernel memory in its prefered location. 468 * Try a number of different locations. 469 */ 470 try = (void *)(512UL << 30); 471 base = NULL; 472 while ((char *)try + KERNEL_KVA_SIZE < topofstack) { 473 base = mmap(try, KERNEL_KVA_SIZE, PROT_READ|PROT_WRITE, 474 MAP_FILE|MAP_SHARED|MAP_VPAGETABLE, 475 MemImageFd, (off_t)try); 476 if (base == try) 477 break; 478 if (base != MAP_FAILED) 479 munmap(base, KERNEL_KVA_SIZE); 480 try = (char *)try + (512UL << 30); 481 } 482 if (base != try) { 483 err(1, "Unable to mmap() kernel virtual memory!"); 484 /* NOT REACHED */ 485 } 486 madvise(base, KERNEL_KVA_SIZE, MADV_NOSYNC); 487 KvaStart = (vm_offset_t)base; 488 KvaSize = KERNEL_KVA_SIZE; 489 KvaEnd = KvaStart + KvaSize; 490 printf("KVM mapped at %p-%p\n", (void *)KvaStart, (void *)KvaEnd); 491 492 /* MAP_FILE? */ 493 dmap_min_address = mmap(0, DMAP_SIZE, PROT_READ|PROT_WRITE, 494 MAP_NOCORE|MAP_NOSYNC|MAP_SHARED, 495 MemImageFd, 0); 496 if (dmap_min_address == MAP_FAILED) { 497 err(1, "Unable to mmap() kernel DMAP region!"); 498 /* NOT REACHED */ 499 } 500 501 firstfree = 0; 502 pmap_bootstrap((vm_paddr_t *)&firstfree, (int64_t)base); 503 504 mcontrol(base, KERNEL_KVA_SIZE, MADV_SETMAP, 505 0 | VPTE_R | VPTE_W | VPTE_V); 506 507 /* 508 * phys_avail[] represents unallocated physical memory. MI code 509 * will use phys_avail[] to create the vm_page array. 510 */ 511 phys_avail[0] = (vm_paddr_t)firstfree; 512 phys_avail[0] = (phys_avail[0] + PAGE_MASK) & ~(vm_paddr_t)PAGE_MASK; 513 phys_avail[1] = Maxmem_bytes; 514 515 #if JGV 516 /* 517 * (virtual_start, virtual_end) represent unallocated kernel virtual 518 * memory. MI code will create kernel_map using these parameters. 519 */ 520 virtual_start = KvaStart + (long)firstfree; 521 virtual_start = (virtual_start + PAGE_MASK) & ~(vm_offset_t)PAGE_MASK; 522 virtual_end = KvaStart + KERNEL_KVA_SIZE; 523 #endif 524 525 /* 526 * pmap_growkernel() will set the correct value. 527 */ 528 kernel_vm_end = 0; 529 530 /* 531 * Allocate space for process 0's UAREA. 532 */ 533 proc0paddr = (void *)virtual_start; 534 for (i = 0; i < UPAGES; ++i) { 535 pmap_kenter_quick(virtual_start, phys_avail[0]); 536 virtual_start += PAGE_SIZE; 537 phys_avail[0] += PAGE_SIZE; 538 } 539 540 /* 541 * crashdumpmap 542 */ 543 crashdumpmap = virtual_start; 544 virtual_start += MAXDUMPPGS * PAGE_SIZE; 545 546 /* 547 * msgbufp maps the system message buffer 548 */ 549 assert((MSGBUF_SIZE & PAGE_MASK) == 0); 550 msgbufp = (void *)virtual_start; 551 for (i = 0; i < (MSGBUF_SIZE >> PAGE_SHIFT); ++i) { 552 pmap_kenter_quick(virtual_start, phys_avail[0]); 553 virtual_start += PAGE_SIZE; 554 phys_avail[0] += PAGE_SIZE; 555 } 556 msgbufinit(msgbufp, MSGBUF_SIZE); 557 558 /* 559 * used by kern_memio for /dev/mem access 560 */ 561 ptvmmap = (caddr_t)virtual_start; 562 virtual_start += PAGE_SIZE; 563 564 /* 565 * Bootstrap the kernel_pmap 566 */ 567 #if JGV 568 pmap_bootstrap(); 569 #endif 570 } 571 572 /* 573 * Map the per-cpu globaldata for cpu #0. Allocate the space using 574 * virtual_start and phys_avail[0] 575 */ 576 static 577 void 578 init_globaldata(void) 579 { 580 int i; 581 vm_paddr_t pa; 582 vm_offset_t va; 583 584 /* 585 * Reserve enough KVA to cover possible cpus. This is a considerable 586 * amount of KVA since the privatespace structure includes two 587 * whole page table mappings. 588 */ 589 virtual_start = (virtual_start + SEG_MASK) & ~(vm_offset_t)SEG_MASK; 590 CPU_prvspace = (void *)virtual_start; 591 virtual_start += sizeof(struct privatespace) * SMP_MAXCPU; 592 593 /* 594 * Allocate enough physical memory to cover the mdglobaldata 595 * portion of the space and the idle stack and map the pages 596 * into KVA. For cpu #0 only. 597 */ 598 for (i = 0; i < sizeof(struct mdglobaldata); i += PAGE_SIZE) { 599 pa = phys_avail[0]; 600 va = (vm_offset_t)&CPU_prvspace[0].mdglobaldata + i; 601 pmap_kenter_quick(va, pa); 602 phys_avail[0] += PAGE_SIZE; 603 } 604 for (i = 0; i < sizeof(CPU_prvspace[0].idlestack); i += PAGE_SIZE) { 605 pa = phys_avail[0]; 606 va = (vm_offset_t)&CPU_prvspace[0].idlestack + i; 607 pmap_kenter_quick(va, pa); 608 phys_avail[0] += PAGE_SIZE; 609 } 610 611 /* 612 * Setup the %gs for cpu #0. The mycpu macro works after this 613 * point. Note that %fs is used by pthreads. 614 */ 615 tls_set_gs(&CPU_prvspace[0], sizeof(struct privatespace)); 616 } 617 618 /* 619 * Initialize very low level systems including thread0, proc0, etc. 620 */ 621 static 622 void 623 init_vkernel(void) 624 { 625 struct mdglobaldata *gd; 626 627 gd = &CPU_prvspace[0].mdglobaldata; 628 bzero(gd, sizeof(*gd)); 629 630 gd->mi.gd_curthread = &thread0; 631 thread0.td_gd = &gd->mi; 632 ncpus = 1; 633 ncpus2 = 1; /* rounded down power of 2 */ 634 ncpus_fit = 1; /* rounded up power of 2 */ 635 /* ncpus2_mask and ncpus_fit_mask are 0 */ 636 init_param1(); 637 gd->mi.gd_prvspace = &CPU_prvspace[0]; 638 mi_gdinit(&gd->mi, 0); 639 cpu_gdinit(gd, 0); 640 mi_proc0init(&gd->mi, proc0paddr); 641 lwp0.lwp_md.md_regs = &proc0_tf; 642 643 /*init_locks();*/ 644 cninit(); 645 rand_initialize(); 646 #if 0 /* #ifdef DDB */ 647 kdb_init(); 648 if (boothowto & RB_KDB) 649 Debugger("Boot flags requested debugger"); 650 #endif 651 identcpu(); 652 #if 0 653 initializecpu(); /* Initialize CPU registers */ 654 #endif 655 init_param2((phys_avail[1] - phys_avail[0]) / PAGE_SIZE); 656 657 #if 0 658 /* 659 * Map the message buffer 660 */ 661 for (off = 0; off < round_page(MSGBUF_SIZE); off += PAGE_SIZE) 662 pmap_kenter((vm_offset_t)msgbufp + off, avail_end + off); 663 msgbufinit(msgbufp, MSGBUF_SIZE); 664 #endif 665 #if 0 666 thread0.td_pcb_cr3 ... MMU 667 lwp0.lwp_md.md_regs = &proc0_tf; 668 #endif 669 } 670 671 /* 672 * Filesystem image paths for the virtual kernel are optional. 673 * If specified they each should point to a disk image, 674 * the first of which will become the root disk. 675 * 676 * The virtual kernel caches data from our 'disk' just like a normal kernel, 677 * so we do not really want the real kernel to cache the data too. Use 678 * O_DIRECT to remove the duplication. 679 */ 680 static 681 void 682 init_disk(char *diskExp[], int diskFileNum, enum vkdisk_type type) 683 { 684 int i; 685 686 if (diskFileNum == 0) 687 return; 688 689 for(i=0; i < diskFileNum; i++){ 690 char *fname; 691 fname = diskExp[i]; 692 693 if (fname == NULL) { 694 warnx("Invalid argument to '-r'"); 695 continue; 696 } 697 698 if (DiskNum < VKDISK_MAX) { 699 struct stat st; 700 struct vkdisk_info* info = NULL; 701 int fd; 702 size_t l = 0; 703 704 if (type == VKD_DISK) 705 fd = open(fname, O_RDWR|O_DIRECT|O_EXLOCK|O_NONBLOCK, 0644); 706 else 707 fd = open(fname, O_RDONLY|O_DIRECT, 0644); 708 if (fd < 0 || fstat(fd, &st) < 0) { 709 if (errno == EAGAIN) 710 fprintf(stderr, "You may already have a vkernel using this disk image!\n"); 711 err(1, "Unable to open/create %s", fname); 712 /* NOT REACHED */ 713 } 714 /* get rid of O_NONBLOCK, keep O_DIRECT */ 715 if (type == VKD_DISK) 716 fcntl(fd, F_SETFL, O_DIRECT); 717 718 info = &DiskInfo[DiskNum]; 719 l = strlen(fname); 720 721 info->unit = i; 722 info->fd = fd; 723 info->type = type; 724 memcpy(info->fname, fname, l); 725 726 if (DiskNum == 0) { 727 if (type == VKD_CD) { 728 rootdevnames[0] = "cd9660:vcd0a"; 729 } else if (type == VKD_DISK) { 730 rootdevnames[0] = "ufs:vkd0s0a"; 731 rootdevnames[1] = "ufs:vkd0s1a"; 732 } 733 } 734 735 DiskNum++; 736 } else { 737 warnx("vkd%d (%s) > VKDISK_MAX", DiskNum, fname); 738 continue; 739 } 740 } 741 } 742 743 static 744 int 745 netif_set_tapflags(int tap_unit, int f, int s) 746 { 747 struct ifreq ifr; 748 int flags; 749 750 bzero(&ifr, sizeof(ifr)); 751 752 snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "tap%d", tap_unit); 753 if (ioctl(s, SIOCGIFFLAGS, &ifr) < 0) { 754 warn("tap%d: ioctl(SIOCGIFFLAGS) failed", tap_unit); 755 return -1; 756 } 757 758 /* 759 * Adjust if_flags 760 * 761 * If the flags are already set/cleared, then we return 762 * immediately to avoid extra syscalls 763 */ 764 flags = (ifr.ifr_flags & 0xffff) | (ifr.ifr_flagshigh << 16); 765 if (f < 0) { 766 /* Turn off flags */ 767 f = -f; 768 if ((flags & f) == 0) 769 return 0; 770 flags &= ~f; 771 } else { 772 /* Turn on flags */ 773 if (flags & f) 774 return 0; 775 flags |= f; 776 } 777 778 /* 779 * Fix up ifreq.ifr_name, since it may be trashed 780 * in previous ioctl(SIOCGIFFLAGS) 781 */ 782 snprintf(ifr.ifr_name, sizeof(ifr.ifr_name), "tap%d", tap_unit); 783 784 ifr.ifr_flags = flags & 0xffff; 785 ifr.ifr_flagshigh = flags >> 16; 786 if (ioctl(s, SIOCSIFFLAGS, &ifr) < 0) { 787 warn("tap%d: ioctl(SIOCSIFFLAGS) failed", tap_unit); 788 return -1; 789 } 790 return 0; 791 } 792 793 static 794 int 795 netif_set_tapaddr(int tap_unit, in_addr_t addr, in_addr_t mask, int s) 796 { 797 struct ifaliasreq ifra; 798 struct sockaddr_in *in; 799 800 bzero(&ifra, sizeof(ifra)); 801 snprintf(ifra.ifra_name, sizeof(ifra.ifra_name), "tap%d", tap_unit); 802 803 /* Setup address */ 804 in = (struct sockaddr_in *)&ifra.ifra_addr; 805 in->sin_family = AF_INET; 806 in->sin_len = sizeof(*in); 807 in->sin_addr.s_addr = addr; 808 809 if (mask != 0) { 810 /* Setup netmask */ 811 in = (struct sockaddr_in *)&ifra.ifra_mask; 812 in->sin_len = sizeof(*in); 813 in->sin_addr.s_addr = mask; 814 } 815 816 if (ioctl(s, SIOCAIFADDR, &ifra) < 0) { 817 warn("tap%d: ioctl(SIOCAIFADDR) failed", tap_unit); 818 return -1; 819 } 820 return 0; 821 } 822 823 static 824 int 825 netif_add_tap2brg(int tap_unit, const char *ifbridge, int s) 826 { 827 struct ifbreq ifbr; 828 struct ifdrv ifd; 829 830 bzero(&ifbr, sizeof(ifbr)); 831 snprintf(ifbr.ifbr_ifsname, sizeof(ifbr.ifbr_ifsname), 832 "tap%d", tap_unit); 833 834 bzero(&ifd, sizeof(ifd)); 835 strlcpy(ifd.ifd_name, ifbridge, sizeof(ifd.ifd_name)); 836 ifd.ifd_cmd = BRDGADD; 837 ifd.ifd_len = sizeof(ifbr); 838 ifd.ifd_data = &ifbr; 839 840 if (ioctl(s, SIOCSDRVSPEC, &ifd) < 0) { 841 /* 842 * 'errno == EEXIST' means that the tap(4) is already 843 * a member of the bridge(4) 844 */ 845 if (errno != EEXIST) { 846 warn("ioctl(%s, SIOCSDRVSPEC) failed", ifbridge); 847 return -1; 848 } 849 } 850 return 0; 851 } 852 853 #define TAPDEV_OFLAGS (O_RDWR | O_NONBLOCK) 854 855 /* 856 * Locate the first unused tap(4) device file if auto mode is requested, 857 * or open the user supplied device file, and bring up the corresponding 858 * tap(4) interface. 859 * 860 * NOTE: Only tap(4) device file is supported currently 861 */ 862 static 863 int 864 netif_open_tap(const char *netif, int *tap_unit, int s) 865 { 866 char tap_dev[MAXPATHLEN]; 867 int tap_fd, failed; 868 struct stat st; 869 char *dname; 870 871 *tap_unit = -1; 872 873 if (strcmp(netif, "auto") == 0) { 874 /* 875 * Find first unused tap(4) device file 876 */ 877 tap_fd = open("/dev/tap", TAPDEV_OFLAGS); 878 if (tap_fd < 0) { 879 warnc(errno, "Unable to find a free tap(4)"); 880 return -1; 881 } 882 } else { 883 /* 884 * User supplied tap(4) device file or unix socket. 885 */ 886 if (netif[0] == '/') /* Absolute path */ 887 strlcpy(tap_dev, netif, sizeof(tap_dev)); 888 else 889 snprintf(tap_dev, sizeof(tap_dev), "/dev/%s", netif); 890 891 tap_fd = open(tap_dev, TAPDEV_OFLAGS); 892 893 /* 894 * If we cannot open normally try to connect to it. 895 */ 896 if (tap_fd < 0) 897 tap_fd = unix_connect(tap_dev); 898 899 if (tap_fd < 0) { 900 warn("Unable to open %s", tap_dev); 901 return -1; 902 } 903 } 904 905 /* 906 * Check whether the device file is a tap(4) 907 */ 908 if (fstat(tap_fd, &st) < 0) { 909 failed = 1; 910 } else if (S_ISCHR(st.st_mode)) { 911 dname = fdevname(tap_fd); 912 if (dname) 913 dname = strstr(dname, "tap"); 914 if (dname) { 915 /* 916 * Bring up the corresponding tap(4) interface 917 */ 918 *tap_unit = strtol(dname + 3, NULL, 10); 919 printf("TAP UNIT %d\n", *tap_unit); 920 if (netif_set_tapflags(*tap_unit, IFF_UP, s) == 0) 921 failed = 0; 922 else 923 failed = 1; 924 } else { 925 failed = 1; 926 } 927 } else if (S_ISSOCK(st.st_mode)) { 928 /* 929 * Special socket connection (typically to vknet). We 930 * do not have to do anything. 931 */ 932 failed = 0; 933 } else { 934 failed = 1; 935 } 936 937 if (failed) { 938 warnx("%s is not a tap(4) device or socket", tap_dev); 939 close(tap_fd); 940 tap_fd = -1; 941 *tap_unit = -1; 942 } 943 return tap_fd; 944 } 945 946 static int 947 unix_connect(const char *path) 948 { 949 struct sockaddr_un sunx; 950 int len; 951 int net_fd; 952 int sndbuf = 262144; 953 struct stat st; 954 955 snprintf(sunx.sun_path, sizeof(sunx.sun_path), "%s", path); 956 len = offsetof(struct sockaddr_un, sun_path[strlen(sunx.sun_path)]); 957 ++len; /* include nul */ 958 sunx.sun_family = AF_UNIX; 959 sunx.sun_len = len; 960 961 net_fd = socket(AF_UNIX, SOCK_SEQPACKET, 0); 962 if (net_fd < 0) 963 return(-1); 964 if (connect(net_fd, (void *)&sunx, len) < 0) { 965 close(net_fd); 966 return(-1); 967 } 968 setsockopt(net_fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)); 969 if (fstat(net_fd, &st) == 0) 970 printf("Network socket buffer: %d bytes\n", st.st_blksize); 971 fcntl(net_fd, F_SETFL, O_NONBLOCK); 972 return(net_fd); 973 } 974 975 #undef TAPDEV_MAJOR 976 #undef TAPDEV_MINOR 977 #undef TAPDEV_OFLAGS 978 979 /* 980 * Following syntax is supported, 981 * 1) x.x.x.x tap(4)'s address is x.x.x.x 982 * 983 * 2) x.x.x.x/z tap(4)'s address is x.x.x.x 984 * tap(4)'s netmask len is z 985 * 986 * 3) x.x.x.x:y.y.y.y tap(4)'s address is x.x.x.x 987 * pseudo netif's address is y.y.y.y 988 * 989 * 4) x.x.x.x:y.y.y.y/z tap(4)'s address is x.x.x.x 990 * pseudo netif's address is y.y.y.y 991 * tap(4) and pseudo netif's netmask len are z 992 * 993 * 5) bridgeX tap(4) will be added to bridgeX 994 * 995 * 6) bridgeX:y.y.y.y tap(4) will be added to bridgeX 996 * pseudo netif's address is y.y.y.y 997 * 998 * 7) bridgeX:y.y.y.y/z tap(4) will be added to bridgeX 999 * pseudo netif's address is y.y.y.y 1000 * pseudo netif's netmask len is z 1001 */ 1002 static 1003 int 1004 netif_init_tap(int tap_unit, in_addr_t *addr, in_addr_t *mask, int s) 1005 { 1006 in_addr_t tap_addr, netmask, netif_addr; 1007 int next_netif_addr; 1008 char *tok, *masklen_str, *ifbridge; 1009 1010 *addr = 0; 1011 *mask = 0; 1012 1013 tok = strtok(NULL, ":/"); 1014 if (tok == NULL) { 1015 /* 1016 * Nothing special, simply use tap(4) as backend 1017 */ 1018 return 0; 1019 } 1020 1021 if (inet_pton(AF_INET, tok, &tap_addr) > 0) { 1022 /* 1023 * tap(4)'s address is supplied 1024 */ 1025 ifbridge = NULL; 1026 1027 /* 1028 * If there is next token, then it may be pseudo 1029 * netif's address or netmask len for tap(4) 1030 */ 1031 next_netif_addr = 0; 1032 } else { 1033 /* 1034 * Not tap(4)'s address, assume it as a bridge(4) 1035 * iface name 1036 */ 1037 tap_addr = 0; 1038 ifbridge = tok; 1039 1040 /* 1041 * If there is next token, then it must be pseudo 1042 * netif's address 1043 */ 1044 next_netif_addr = 1; 1045 } 1046 1047 netmask = netif_addr = 0; 1048 1049 tok = strtok(NULL, ":/"); 1050 if (tok == NULL) 1051 goto back; 1052 1053 if (inet_pton(AF_INET, tok, &netif_addr) <= 0) { 1054 if (next_netif_addr) { 1055 warnx("Invalid pseudo netif address: %s", tok); 1056 return -1; 1057 } 1058 netif_addr = 0; 1059 1060 /* 1061 * Current token is not address, then it must be netmask len 1062 */ 1063 masklen_str = tok; 1064 } else { 1065 /* 1066 * Current token is pseudo netif address, if there is next token 1067 * it must be netmask len 1068 */ 1069 masklen_str = strtok(NULL, "/"); 1070 } 1071 1072 /* Calculate netmask */ 1073 if (masklen_str != NULL) { 1074 u_long masklen; 1075 1076 masklen = strtoul(masklen_str, NULL, 10); 1077 if (masklen < 32 && masklen > 0) { 1078 netmask = htonl(~((1LL << (32 - masklen)) - 1) 1079 & 0xffffffff); 1080 } else { 1081 warnx("Invalid netmask len: %lu", masklen); 1082 return -1; 1083 } 1084 } 1085 1086 /* Make sure there is no more token left */ 1087 if (strtok(NULL, ":/") != NULL) { 1088 warnx("Invalid argument to '-I'"); 1089 return -1; 1090 } 1091 1092 back: 1093 if (tap_unit < 0) { 1094 /* Do nothing */ 1095 } else if (ifbridge == NULL) { 1096 /* Set tap(4) address/netmask */ 1097 if (netif_set_tapaddr(tap_unit, tap_addr, netmask, s) < 0) 1098 return -1; 1099 } else { 1100 /* Tie tap(4) to bridge(4) */ 1101 if (netif_add_tap2brg(tap_unit, ifbridge, s) < 0) 1102 return -1; 1103 } 1104 1105 *addr = netif_addr; 1106 *mask = netmask; 1107 return 0; 1108 } 1109 1110 /* 1111 * NetifInfo[] will be filled for pseudo netif initialization. 1112 * NetifNum will be bumped to reflect the number of valid entries 1113 * in NetifInfo[]. 1114 */ 1115 static 1116 void 1117 init_netif(char *netifExp[], int netifExpNum) 1118 { 1119 int i, s; 1120 1121 if (netifExpNum == 0) 1122 return; 1123 1124 s = socket(AF_INET, SOCK_DGRAM, 0); /* for ioctl(SIOC) */ 1125 if (s < 0) 1126 return; 1127 1128 for (i = 0; i < netifExpNum; ++i) { 1129 struct vknetif_info *info; 1130 in_addr_t netif_addr, netif_mask; 1131 int tap_fd, tap_unit; 1132 char *netif; 1133 1134 netif = strtok(netifExp[i], ":"); 1135 if (netif == NULL) { 1136 warnx("Invalid argument to '-I'"); 1137 continue; 1138 } 1139 1140 /* 1141 * Open tap(4) device file and bring up the 1142 * corresponding interface 1143 */ 1144 tap_fd = netif_open_tap(netif, &tap_unit, s); 1145 if (tap_fd < 0) 1146 continue; 1147 1148 /* 1149 * Initialize tap(4) and get address/netmask 1150 * for pseudo netif 1151 * 1152 * NB: Rest part of netifExp[i] is passed 1153 * to netif_init_tap() implicitly. 1154 */ 1155 if (netif_init_tap(tap_unit, &netif_addr, &netif_mask, s) < 0) { 1156 /* 1157 * NB: Closing tap(4) device file will bring 1158 * down the corresponding interface 1159 */ 1160 close(tap_fd); 1161 continue; 1162 } 1163 1164 info = &NetifInfo[NetifNum]; 1165 info->tap_fd = tap_fd; 1166 info->tap_unit = tap_unit; 1167 info->netif_addr = netif_addr; 1168 info->netif_mask = netif_mask; 1169 1170 NetifNum++; 1171 if (NetifNum >= VKNETIF_MAX) /* XXX will this happen? */ 1172 break; 1173 } 1174 close(s); 1175 } 1176 1177 static 1178 void 1179 writepid( void ) 1180 { 1181 pid_t self; 1182 FILE *fp; 1183 1184 if (pid_file != NULL) { 1185 self = getpid(); 1186 fp = fopen(pid_file, "w"); 1187 1188 if (fp != NULL) { 1189 fprintf(fp, "%ld\n", (long)self); 1190 fclose(fp); 1191 } 1192 else { 1193 perror("Warning: couldn't open pidfile"); 1194 } 1195 } 1196 } 1197 1198 static 1199 void 1200 cleanpid( void ) 1201 { 1202 if (pid_file != NULL) { 1203 if ( unlink(pid_file) != 0 ) 1204 perror("Warning: couldn't remove pidfile"); 1205 } 1206 } 1207 1208 static 1209 void 1210 usage(const char *ctl, ...) 1211 { 1212 va_list va; 1213 1214 va_start(va, ctl); 1215 vfprintf(stderr, ctl, va); 1216 va_end(va); 1217 fprintf(stderr, "\n"); 1218 exit(1); 1219 } 1220 1221 void 1222 cpu_reset(void) 1223 { 1224 kprintf("cpu reset, rebooting vkernel\n"); 1225 closefrom(3); 1226 cleanpid(); 1227 execv(save_av[0], save_av); 1228 } 1229 1230 void 1231 cpu_halt(void) 1232 { 1233 kprintf("cpu halt, exiting vkernel\n"); 1234 cleanpid(); 1235 exit(0); 1236 } 1237 1238 void 1239 setrealcpu(void) 1240 { 1241 switch(lwp_cpu_lock) { 1242 case LCL_PER_CPU: 1243 if (bootverbose) 1244 kprintf("Locking CPU%d to real cpu %d\n", 1245 mycpuid, next_cpu); 1246 usched_set(getpid(), USCHED_SET_CPU, &next_cpu, sizeof(next_cpu)); 1247 next_cpu++; 1248 if (next_cpu >= real_ncpus) 1249 next_cpu = 0; 1250 break; 1251 case LCL_SINGLE_CPU: 1252 if (bootverbose) 1253 kprintf("Locking CPU%d to real cpu %d\n", 1254 mycpuid, next_cpu); 1255 usched_set(getpid(), USCHED_SET_CPU, &next_cpu, sizeof(next_cpu)); 1256 break; 1257 default: 1258 /* do not map virtual cpus to real cpus */ 1259 break; 1260 } 1261 } 1262