1package main 2 3import ( 4 "context" 5 "encoding/json" 6 "errors" 7 "fmt" 8 "os" 9 "path/filepath" 10 "strconv" 11 "strings" 12 "time" 13 14 winio "github.com/Microsoft/go-winio" 15 "github.com/Microsoft/go-winio/pkg/guid" 16 "github.com/Microsoft/hcsshim/internal/cni" 17 "github.com/Microsoft/hcsshim/internal/hcs" 18 "github.com/Microsoft/hcsshim/internal/hcsoci" 19 "github.com/Microsoft/hcsshim/internal/logfields" 20 "github.com/Microsoft/hcsshim/internal/oci" 21 "github.com/Microsoft/hcsshim/internal/regstate" 22 "github.com/Microsoft/hcsshim/internal/resources" 23 "github.com/Microsoft/hcsshim/internal/runhcs" 24 "github.com/Microsoft/hcsshim/internal/uvm" 25 "github.com/Microsoft/hcsshim/osversion" 26 specs "github.com/opencontainers/runtime-spec/specs-go" 27 "github.com/sirupsen/logrus" 28 "golang.org/x/sys/windows" 29) 30 31var errContainerStopped = errors.New("container is stopped") 32 33type persistedState struct { 34 // ID is the id of this container/UVM. 35 ID string `json:",omitempty"` 36 // Owner is the owner value passed into the runhcs command and may be `""`. 37 Owner string `json:",omitempty"` 38 // SandboxID is the sandbox identifer passed in via OCI specifications. This 39 // can either be the sandbox itself or the sandbox this container should run 40 // in. See `parseSandboxAnnotations`. 41 SandboxID string `json:",omitempty"` 42 // HostID will be VM ID hosting this container. If a sandbox is used it will 43 // match the `SandboxID`. 44 HostID string `json:",omitempty"` 45 // Bundle is the folder path on disk where the container state and spec files 46 // reside. 47 Bundle string `json:",omitempty"` 48 Created time.Time `json:",omitempty"` 49 Rootfs string `json:",omitempty"` 50 // Spec is the in memory deserialized values found on `Bundle\config.json`. 51 Spec *specs.Spec `json:",omitempty"` 52 RequestedNetNS string `json:",omitempty"` 53 // IsHost is `true` when this is a VM isolated config. 54 IsHost bool `json:",omitempty"` 55 // UniqueID is a unique ID generated per container config. 56 UniqueID guid.GUID `json:",omitempty"` 57 // HostUniqueID is the unique ID of the hosting VM if this container is 58 // hosted. 59 HostUniqueID guid.GUID `json:",omitempty"` 60} 61 62type containerStatus string 63 64const ( 65 containerRunning containerStatus = "running" 66 containerStopped containerStatus = "stopped" 67 containerCreated containerStatus = "created" 68 containerPaused containerStatus = "paused" 69 containerUnknown containerStatus = "unknown" 70 71 keyState = "state" 72 keyResources = "resources" 73 keyShimPid = "shim" 74 keyInitPid = "pid" 75 keyNetNS = "netns" 76 // keyPidMapFmt is the format to use when mapping a host OS pid to a guest 77 // pid. 78 keyPidMapFmt = "pid-%d" 79) 80 81type container struct { 82 persistedState 83 ShimPid int 84 hc *hcs.System 85 resources *resources.Resources 86} 87 88func startProcessShim(id, pidFile, logFile string, spec *specs.Process) (_ *os.Process, err error) { 89 // Ensure the stdio handles inherit to the child process. This isn't undone 90 // after the StartProcess call because the caller never launches another 91 // process before exiting. 92 for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} { 93 err = windows.SetHandleInformation(windows.Handle(f.Fd()), windows.HANDLE_FLAG_INHERIT, windows.HANDLE_FLAG_INHERIT) 94 if err != nil { 95 return nil, err 96 } 97 } 98 99 args := []string{ 100 "--stdin", strconv.Itoa(int(os.Stdin.Fd())), 101 "--stdout", strconv.Itoa(int(os.Stdout.Fd())), 102 "--stderr", strconv.Itoa(int(os.Stderr.Fd())), 103 } 104 if spec != nil { 105 args = append(args, "--exec") 106 } 107 if strings.HasPrefix(logFile, runhcs.SafePipePrefix) { 108 args = append(args, "--log-pipe", logFile) 109 } 110 args = append(args, id) 111 return launchShim("shim", pidFile, logFile, args, spec) 112} 113 114func launchShim(cmd, pidFile, logFile string, args []string, data interface{}) (_ *os.Process, err error) { 115 executable, err := os.Executable() 116 if err != nil { 117 return nil, err 118 } 119 120 // Create a pipe to use as stderr for the shim process. This is used to 121 // retrieve early error information, up to the point that the shim is ready 122 // to launch a process in the container. 123 rp, wp, err := os.Pipe() 124 if err != nil { 125 return nil, err 126 } 127 defer rp.Close() 128 defer wp.Close() 129 130 // Create a pipe to send the data, if one is provided. 131 var rdatap, wdatap *os.File 132 if data != nil { 133 rdatap, wdatap, err = os.Pipe() 134 if err != nil { 135 return nil, err 136 } 137 defer rdatap.Close() 138 defer wdatap.Close() 139 } 140 141 var log *os.File 142 fullargs := []string{os.Args[0]} 143 if logFile != "" { 144 if !strings.HasPrefix(logFile, runhcs.SafePipePrefix) { 145 log, err = os.OpenFile(logFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0666) 146 if err != nil { 147 return nil, err 148 } 149 defer log.Close() 150 } 151 152 fullargs = append(fullargs, "--log-format", logFormat) 153 if logrus.GetLevel() == logrus.DebugLevel { 154 fullargs = append(fullargs, "--debug") 155 } 156 } 157 fullargs = append(fullargs, cmd) 158 fullargs = append(fullargs, args...) 159 attr := &os.ProcAttr{ 160 Files: []*os.File{rdatap, wp, log}, 161 } 162 p, err := os.StartProcess(executable, fullargs, attr) 163 if err != nil { 164 return nil, err 165 } 166 defer func() { 167 if err != nil { 168 p.Kill() 169 } 170 }() 171 172 wp.Close() 173 174 // Write the data if provided. 175 if data != nil { 176 rdatap.Close() 177 dataj, err := json.Marshal(data) 178 if err != nil { 179 return nil, err 180 } 181 _, err = wdatap.Write(dataj) 182 if err != nil { 183 return nil, err 184 } 185 wdatap.Close() 186 } 187 188 err = runhcs.GetErrorFromPipe(rp, p) 189 if err != nil { 190 return nil, err 191 } 192 193 if pidFile != "" { 194 if err = createPidFile(pidFile, p.Pid); err != nil { 195 return nil, err 196 } 197 } 198 199 return p, nil 200} 201 202// parseSandboxAnnotations searches `a` for various annotations used by 203// different runtimes to represent a sandbox ID, and sandbox type. 204// 205// If found returns the tuple `(sandboxID, isSandbox)` where `isSandbox == true` 206// indicates the identifer is the sandbox itself; `isSandbox == false` indicates 207// the identifer is the sandbox in which to place this container. Otherwise 208// returns `("", false)`. 209func parseSandboxAnnotations(a map[string]string) (string, bool) { 210 var t, id string 211 if t = a["io.kubernetes.cri.container-type"]; t != "" { 212 id = a["io.kubernetes.cri.sandbox-id"] 213 } else if t = a["io.kubernetes.cri-o.ContainerType"]; t != "" { 214 id = a["io.kubernetes.cri-o.SandboxID"] 215 } else if t = a["io.kubernetes.docker.type"]; t != "" { 216 id = a["io.kubernetes.sandbox.id"] 217 if t == "podsandbox" { 218 t = "sandbox" 219 } 220 } 221 if t == "container" { 222 return id, false 223 } 224 if t == "sandbox" { 225 return id, true 226 } 227 return "", false 228} 229 230// startVMShim starts a vm-shim command with the specified `opts`. `opts` can be `uvm.OptionsWCOW` or `uvm.OptionsLCOW` 231func (c *container) startVMShim(logFile string, opts interface{}) (*os.Process, error) { 232 var os string 233 if _, ok := opts.(*uvm.OptionsLCOW); ok { 234 os = "linux" 235 } else { 236 os = "windows" 237 } 238 args := []string{"--os", os} 239 if strings.HasPrefix(logFile, runhcs.SafePipePrefix) { 240 args = append(args, "--log-pipe", logFile) 241 } 242 args = append(args, c.VMPipePath()) 243 return launchShim("vmshim", "", logFile, args, opts) 244} 245 246type containerConfig struct { 247 ID string 248 Owner string 249 HostID string 250 PidFile string 251 ShimLogFile, VMLogFile string 252 Spec *specs.Spec 253 VMConsolePipe string 254} 255 256func createContainer(cfg *containerConfig) (_ *container, err error) { 257 // Store the container information in a volatile registry key. 258 cwd, err := os.Getwd() 259 if err != nil { 260 return nil, err 261 } 262 263 vmisolated := cfg.Spec.Linux != nil || (cfg.Spec.Windows != nil && cfg.Spec.Windows.HyperV != nil) 264 265 sandboxID, isSandbox := parseSandboxAnnotations(cfg.Spec.Annotations) 266 hostID := cfg.HostID 267 if isSandbox { 268 if sandboxID != cfg.ID { 269 return nil, errors.New("sandbox ID must match ID") 270 } 271 } else if sandboxID != "" { 272 // Validate that the sandbox container exists. 273 sandbox, err := getContainer(sandboxID, false) 274 if err != nil { 275 return nil, err 276 } 277 defer sandbox.Close() 278 if sandbox.SandboxID != sandboxID { 279 return nil, fmt.Errorf("container %s is not a sandbox", sandboxID) 280 } 281 if hostID == "" { 282 // Use the sandbox's host. 283 hostID = sandbox.HostID 284 } else if sandbox.HostID == "" { 285 return nil, fmt.Errorf("sandbox container %s is not running in a VM host, but host %s was specified", sandboxID, hostID) 286 } else if hostID != sandbox.HostID { 287 return nil, fmt.Errorf("sandbox container %s has a different host %s from the requested host %s", sandboxID, sandbox.HostID, hostID) 288 } 289 if vmisolated && hostID == "" { 290 return nil, fmt.Errorf("container %s is not a VM isolated sandbox", sandboxID) 291 } 292 } 293 294 uniqueID, err := guid.NewV4() 295 if err != nil { 296 return nil, err 297 } 298 299 newvm := false 300 var hostUniqueID guid.GUID 301 if hostID != "" { 302 host, err := getContainer(hostID, false) 303 if err != nil { 304 return nil, err 305 } 306 defer host.Close() 307 if !host.IsHost { 308 return nil, fmt.Errorf("host container %s is not a VM host", hostID) 309 } 310 hostUniqueID = host.UniqueID 311 } else if vmisolated && (isSandbox || cfg.Spec.Linux != nil || osversion.Get().Build >= osversion.RS5) { 312 // This handles all LCOW, Pod Sandbox, and (Windows Xenon V2 for RS5+) 313 hostID = cfg.ID 314 newvm = true 315 hostUniqueID = uniqueID 316 } 317 318 // Make absolute the paths in Root.Path and Windows.LayerFolders. 319 rootfs := "" 320 if cfg.Spec.Root != nil { 321 rootfs = cfg.Spec.Root.Path 322 if rootfs != "" && !filepath.IsAbs(rootfs) && !strings.HasPrefix(rootfs, `\\?\`) { 323 rootfs = filepath.Join(cwd, rootfs) 324 cfg.Spec.Root.Path = rootfs 325 } 326 } 327 328 netNS := "" 329 if cfg.Spec.Windows != nil { 330 for i, f := range cfg.Spec.Windows.LayerFolders { 331 if !filepath.IsAbs(f) && !strings.HasPrefix(rootfs, `\\?\`) { 332 cfg.Spec.Windows.LayerFolders[i] = filepath.Join(cwd, f) 333 } 334 } 335 336 // Determine the network namespace to use. 337 if cfg.Spec.Windows.Network != nil { 338 if cfg.Spec.Windows.Network.NetworkSharedContainerName != "" { 339 // RS4 case 340 err = stateKey.Get(cfg.Spec.Windows.Network.NetworkSharedContainerName, keyNetNS, &netNS) 341 if err != nil { 342 if _, ok := err.(*regstate.NoStateError); !ok { 343 return nil, err 344 } 345 } 346 } else if cfg.Spec.Windows.Network.NetworkNamespace != "" { 347 // RS5 case 348 netNS = cfg.Spec.Windows.Network.NetworkNamespace 349 } 350 } 351 } 352 353 // Store the initial container state in the registry so that the delete 354 // command can clean everything up if something goes wrong. 355 c := &container{ 356 persistedState: persistedState{ 357 ID: cfg.ID, 358 Owner: cfg.Owner, 359 Bundle: cwd, 360 Rootfs: rootfs, 361 Created: time.Now(), 362 Spec: cfg.Spec, 363 SandboxID: sandboxID, 364 HostID: hostID, 365 IsHost: newvm, 366 RequestedNetNS: netNS, 367 UniqueID: uniqueID, 368 HostUniqueID: hostUniqueID, 369 }, 370 } 371 err = stateKey.Create(cfg.ID, keyState, &c.persistedState) 372 if err != nil { 373 return nil, err 374 } 375 defer func() { 376 if err != nil { 377 c.Remove() 378 } 379 }() 380 if isSandbox && vmisolated { 381 cnicfg := cni.NewPersistedNamespaceConfig(netNS, cfg.ID, hostUniqueID) 382 err = cnicfg.Store() 383 if err != nil { 384 return nil, err 385 } 386 defer func() { 387 if err != nil { 388 cnicfg.Remove() 389 } 390 }() 391 } 392 393 // Start a VM if necessary. 394 if newvm { 395 opts, err := oci.SpecToUVMCreateOpts(context.Background(), cfg.Spec, vmID(c.ID), cfg.Owner) 396 if err != nil { 397 return nil, err 398 } 399 switch opts.(type) { 400 case *uvm.OptionsLCOW: 401 lopts := opts.(*uvm.OptionsLCOW) 402 lopts.ConsolePipe = cfg.VMConsolePipe 403 case *uvm.OptionsWCOW: 404 wopts := opts.(*uvm.OptionsWCOW) 405 406 // In order for the UVM sandbox.vhdx not to collide with the actual 407 // nested Argon sandbox.vhdx we append the \vm folder to the last entry 408 // in the list. 409 layersLen := len(cfg.Spec.Windows.LayerFolders) 410 layers := make([]string, layersLen) 411 copy(layers, cfg.Spec.Windows.LayerFolders) 412 413 vmPath := filepath.Join(layers[layersLen-1], "vm") 414 err := os.MkdirAll(vmPath, 0) 415 if err != nil { 416 return nil, err 417 } 418 layers[layersLen-1] = vmPath 419 420 wopts.LayerFolders = layers 421 } 422 423 shim, err := c.startVMShim(cfg.VMLogFile, opts) 424 if err != nil { 425 return nil, err 426 } 427 shim.Release() 428 } 429 430 if c.HostID != "" { 431 // Call to the VM shim process to create the container. This is done so 432 // that the VM process can keep track of the VM's virtual hardware 433 // resource use. 434 err = c.issueVMRequest(runhcs.OpCreateContainer) 435 if err != nil { 436 return nil, err 437 } 438 c.hc, err = hcs.OpenComputeSystem(context.Background(), cfg.ID) 439 if err != nil { 440 return nil, err 441 } 442 } else { 443 // Create the container directly from this process. 444 err = createContainerInHost(c, nil) 445 if err != nil { 446 return nil, err 447 } 448 } 449 450 // Create the shim process for the container. 451 err = startContainerShim(c, cfg.PidFile, cfg.ShimLogFile) 452 if err != nil { 453 if e := c.Kill(); e == nil { 454 c.Remove() 455 } 456 return nil, err 457 } 458 459 return c, nil 460} 461 462func (c *container) ShimPipePath() string { 463 return runhcs.SafePipePath("runhcs-shim-" + c.UniqueID.String()) 464} 465 466func (c *container) VMPipePath() string { 467 return runhcs.VMPipePath(c.HostUniqueID) 468} 469 470func (c *container) VMIsolated() bool { 471 return c.HostID != "" 472} 473 474func (c *container) unmountInHost(vm *uvm.UtilityVM, all bool) error { 475 r := &resources.Resources{} 476 err := stateKey.Get(c.ID, keyResources, r) 477 if _, ok := err.(*regstate.NoStateError); ok { 478 return nil 479 } 480 if err != nil { 481 return err 482 } 483 err = resources.ReleaseResources(context.Background(), r, vm, all) 484 if err != nil { 485 stateKey.Set(c.ID, keyResources, r) 486 return err 487 } 488 489 err = stateKey.Clear(c.ID, keyResources) 490 if err != nil { 491 return err 492 } 493 return nil 494} 495 496func (c *container) Unmount(all bool) error { 497 if c.VMIsolated() { 498 op := runhcs.OpUnmountContainerDiskOnly 499 if all { 500 op = runhcs.OpUnmountContainer 501 } 502 err := c.issueVMRequest(op) 503 if err != nil { 504 if _, ok := err.(*noVMError); ok { 505 logrus.WithFields(logrus.Fields{ 506 logfields.ContainerID: c.ID, 507 logfields.UVMID: c.HostID, 508 logrus.ErrorKey: errors.New("failed to unmount container resources"), 509 }).Warning("VM shim could not be contacted") 510 } else { 511 return err 512 } 513 } 514 } else { 515 c.unmountInHost(nil, false) 516 } 517 return nil 518} 519 520func createContainerInHost(c *container, vm *uvm.UtilityVM) (err error) { 521 if c.hc != nil { 522 return errors.New("container already created") 523 } 524 525 // Create the container without starting it. 526 opts := &hcsoci.CreateOptions{ 527 ID: c.ID, 528 Owner: c.Owner, 529 Spec: c.Spec, 530 HostingSystem: vm, 531 NetworkNamespace: c.RequestedNetNS, 532 } 533 vmid := "" 534 if vm != nil { 535 vmid = vm.ID() 536 } 537 logrus.WithFields(logrus.Fields{ 538 logfields.ContainerID: c.ID, 539 logfields.UVMID: vmid, 540 }).Info("creating container in UVM") 541 hc, r, err := hcsoci.CreateContainer(context.Background(), opts) 542 if err != nil { 543 return err 544 } 545 defer func() { 546 if err != nil { 547 hc.Terminate(context.Background()) 548 hc.Wait() 549 resources.ReleaseResources(context.Background(), r, vm, true) 550 } 551 }() 552 553 // Record the network namespace to support namespace sharing by container ID. 554 if r.NetNS() != "" { 555 err = stateKey.Set(c.ID, keyNetNS, r.NetNS()) 556 if err != nil { 557 return err 558 } 559 } 560 561 err = stateKey.Set(c.ID, keyResources, r) 562 if err != nil { 563 return err 564 } 565 c.hc = hc.(*hcs.System) 566 return nil 567} 568 569func startContainerShim(c *container, pidFile, logFile string) error { 570 // Launch a shim process to later execute a process in the container. 571 shim, err := startProcessShim(c.ID, pidFile, logFile, nil) 572 if err != nil { 573 return err 574 } 575 defer shim.Release() 576 defer func() { 577 if err != nil { 578 shim.Kill() 579 } 580 }() 581 582 c.ShimPid = shim.Pid 583 err = stateKey.Set(c.ID, keyShimPid, shim.Pid) 584 if err != nil { 585 return err 586 } 587 588 if pidFile != "" { 589 if err = createPidFile(pidFile, shim.Pid); err != nil { 590 return err 591 } 592 } 593 594 return nil 595} 596 597func (c *container) Close() error { 598 if c.hc == nil { 599 return nil 600 } 601 return c.hc.Close() 602} 603 604func (c *container) Exec() error { 605 err := c.hc.Start(context.Background()) 606 if err != nil { 607 return err 608 } 609 610 if c.Spec.Process == nil { 611 return nil 612 } 613 614 // Alert the shim that the container is ready. 615 pipe, err := winio.DialPipe(c.ShimPipePath(), nil) 616 if err != nil { 617 return err 618 } 619 defer pipe.Close() 620 621 shim, err := os.FindProcess(c.ShimPid) 622 if err != nil { 623 return err 624 } 625 defer shim.Release() 626 627 err = runhcs.GetErrorFromPipe(pipe, shim) 628 if err != nil { 629 return err 630 } 631 632 return nil 633} 634 635func getContainer(id string, notStopped bool) (*container, error) { 636 var c container 637 err := stateKey.Get(id, keyState, &c.persistedState) 638 if err != nil { 639 return nil, err 640 } 641 err = stateKey.Get(id, keyShimPid, &c.ShimPid) 642 if err != nil { 643 if _, ok := err.(*regstate.NoStateError); !ok { 644 return nil, err 645 } 646 c.ShimPid = -1 647 } 648 if notStopped && c.ShimPid == 0 { 649 return nil, errContainerStopped 650 } 651 652 hc, err := hcs.OpenComputeSystem(context.Background(), c.ID) 653 if err == nil { 654 c.hc = hc 655 } else if !hcs.IsNotExist(err) { 656 return nil, err 657 } else if notStopped { 658 return nil, errContainerStopped 659 } 660 661 return &c, nil 662} 663 664func (c *container) Remove() error { 665 // Unmount any layers or mapped volumes. 666 err := c.Unmount(!c.IsHost) 667 if err != nil { 668 return err 669 } 670 671 // Follow kata's example and delay tearing down the VM until the owning 672 // container is removed. 673 if c.IsHost { 674 vm, err := hcs.OpenComputeSystem(context.Background(), vmID(c.ID)) 675 if err == nil { 676 vm.Terminate(context.Background()) 677 vm.Wait() 678 } 679 } 680 return stateKey.Remove(c.ID) 681} 682 683func (c *container) Kill() error { 684 if c.hc == nil { 685 return nil 686 } 687 c.hc.Terminate(context.Background()) 688 return c.hc.Wait() 689} 690 691func (c *container) Status() (containerStatus, error) { 692 if c.hc == nil || c.ShimPid == 0 { 693 return containerStopped, nil 694 } 695 props, err := c.hc.Properties(context.Background()) 696 if err != nil { 697 if !strings.Contains(err.Error(), "operation is not valid in the current state") { 698 return "", err 699 } 700 return containerUnknown, nil 701 } 702 state := containerUnknown 703 switch props.State { 704 case "", "Created": 705 state = containerCreated 706 case "Running": 707 state = containerRunning 708 case "Paused": 709 state = containerPaused 710 case "Stopped": 711 state = containerStopped 712 } 713 return state, nil 714} 715