1package main
2
3import (
4	"context"
5	"encoding/json"
6	"errors"
7	"fmt"
8	"os"
9	"path/filepath"
10	"strconv"
11	"strings"
12	"time"
13
14	winio "github.com/Microsoft/go-winio"
15	"github.com/Microsoft/go-winio/pkg/guid"
16	"github.com/Microsoft/hcsshim/internal/cni"
17	"github.com/Microsoft/hcsshim/internal/hcs"
18	"github.com/Microsoft/hcsshim/internal/hcsoci"
19	"github.com/Microsoft/hcsshim/internal/logfields"
20	"github.com/Microsoft/hcsshim/internal/oci"
21	"github.com/Microsoft/hcsshim/internal/regstate"
22	"github.com/Microsoft/hcsshim/internal/resources"
23	"github.com/Microsoft/hcsshim/internal/runhcs"
24	"github.com/Microsoft/hcsshim/internal/uvm"
25	"github.com/Microsoft/hcsshim/osversion"
26	specs "github.com/opencontainers/runtime-spec/specs-go"
27	"github.com/sirupsen/logrus"
28	"golang.org/x/sys/windows"
29)
30
// errContainerStopped is returned by getContainer when the caller asked for a
// container that is not stopped, but either the recorded shim pid is 0 or the
// compute system no longer exists.
var errContainerStopped = errors.New("container is stopped")
32
// persistedState is the part of a container's state that is serialized under
// the `keyState` state key. It is written by `createContainer` and read back
// by `getContainer`.
type persistedState struct {
	// ID is the id of this container/UVM.
	ID string `json:",omitempty"`
	// Owner is the owner value passed into the runhcs command and may be `""`.
	Owner string `json:",omitempty"`
	// SandboxID is the sandbox identifier passed in via OCI specifications.
	// This can either be the sandbox itself or the sandbox this container
	// should run in. See `parseSandboxAnnotations`.
	SandboxID string `json:",omitempty"`
	// HostID will be VM ID hosting this container. If a sandbox is used it will
	// match the `SandboxID`.
	HostID string `json:",omitempty"`
	// Bundle is the folder path on disk where the container state and spec files
	// reside (the working directory at the time `createContainer` ran).
	// Created is the time at which `createContainer` built this state.
	// Rootfs is the spec's `Root.Path`, made absolute relative to `Bundle`
	// when it was given as a relative path; it is `""` when the spec has no
	// root.
	Bundle  string    `json:",omitempty"`
	Created time.Time `json:",omitempty"`
	Rootfs  string    `json:",omitempty"`
	// Spec is the in memory deserialized values found on `Bundle\config.json`.
	// RequestedNetNS is the network namespace selected from the spec's
	// `Windows.Network` section by `createContainer`; it may be `""`.
	Spec           *specs.Spec `json:",omitempty"`
	RequestedNetNS string      `json:",omitempty"`
	// IsHost is `true` when this is a VM isolated config.
	IsHost bool `json:",omitempty"`
	// UniqueID is a unique ID generated per container config.
	UniqueID guid.GUID `json:",omitempty"`
	// HostUniqueID is the unique ID of the hosting VM if this container is
	// hosted.
	HostUniqueID guid.GUID `json:",omitempty"`
}
61
// containerStatus is the lifecycle state reported by `(*container).Status`.
type containerStatus string

const (
	// Status values returned by `(*container).Status`.
	containerRunning containerStatus = "running"
	containerStopped containerStatus = "stopped"
	containerCreated containerStatus = "created"
	containerPaused  containerStatus = "paused"
	containerUnknown containerStatus = "unknown"

	// State key names used with `stateKey`, per container ID:
	// keyState holds the `persistedState`, keyResources the container's
	// `resources.Resources`, keyShimPid the pid of the container's shim
	// process, and keyNetNS the network namespace recorded at creation.
	// keyInitPid is not used in this part of the file; presumably it holds
	// the init process pid -- confirm against its users.
	keyState     = "state"
	keyResources = "resources"
	keyShimPid   = "shim"
	keyInitPid   = "pid"
	keyNetNS     = "netns"
	// keyPidMapFmt is the format to use when mapping a host OS pid to a guest
	// pid.
	keyPidMapFmt = "pid-%d"
)
80
// container combines the persisted state of a container with the runtime
// handles opened by this process.
//
// ShimPid is the pid of the container's shim process; `getContainer` sets it
// to -1 when no shim pid has been recorded. hc is the open compute system
// handle and is nil when the compute system does not exist. resources is not
// assigned anywhere in this part of the file.
type container struct {
	persistedState
	ShimPid   int
	hc        *hcs.System
	resources *resources.Resources
}
87
88func startProcessShim(id, pidFile, logFile string, spec *specs.Process) (_ *os.Process, err error) {
89	// Ensure the stdio handles inherit to the child process. This isn't undone
90	// after the StartProcess call because the caller never launches another
91	// process before exiting.
92	for _, f := range []*os.File{os.Stdin, os.Stdout, os.Stderr} {
93		err = windows.SetHandleInformation(windows.Handle(f.Fd()), windows.HANDLE_FLAG_INHERIT, windows.HANDLE_FLAG_INHERIT)
94		if err != nil {
95			return nil, err
96		}
97	}
98
99	args := []string{
100		"--stdin", strconv.Itoa(int(os.Stdin.Fd())),
101		"--stdout", strconv.Itoa(int(os.Stdout.Fd())),
102		"--stderr", strconv.Itoa(int(os.Stderr.Fd())),
103	}
104	if spec != nil {
105		args = append(args, "--exec")
106	}
107	if strings.HasPrefix(logFile, runhcs.SafePipePrefix) {
108		args = append(args, "--log-pipe", logFile)
109	}
110	args = append(args, id)
111	return launchShim("shim", pidFile, logFile, args, spec)
112}
113
// launchShim re-executes the current executable with the sub-command `cmd`
// and `args`, and waits until the child reports through a pipe whether its
// early start-up succeeded.
//
// If `logFile` is non-empty, `--log-format` (and `--debug` when the logrus
// level is debug) are placed before `cmd`; unless `logFile` carries the safe
// pipe prefix it is also opened in append mode and handed to the child as its
// third standard file. If `data` is non-nil it is JSON encoded and written to
// a pipe whose read end is the child's first standard file. If `pidFile` is
// non-empty the child's pid is written to it via `createPidFile`.
//
// On any error after the child was started, the child is killed and a nil
// process is returned together with the error.
func launchShim(cmd, pidFile, logFile string, args []string, data interface{}) (_ *os.Process, err error) {
	executable, err := os.Executable()
	if err != nil {
		return nil, err
	}

	// Create a pipe whose write end is handed to the shim process as the
	// second entry of ProcAttr.Files (its standard output). This is used to
	// retrieve early error information, up to the point that the shim is ready
	// to launch a process in the container.
	rp, wp, err := os.Pipe()
	if err != nil {
		return nil, err
	}
	defer rp.Close()
	defer wp.Close()

	// Create a pipe to send the data, if one is provided.
	var rdatap, wdatap *os.File
	if data != nil {
		rdatap, wdatap, err = os.Pipe()
		if err != nil {
			return nil, err
		}
		defer rdatap.Close()
		defer wdatap.Close()
	}

	// log stays nil when no log file is requested or when logging goes to a
	// pipe; the child then gets no third standard file from this process.
	var log *os.File
	fullargs := []string{os.Args[0]}
	if logFile != "" {
		if !strings.HasPrefix(logFile, runhcs.SafePipePrefix) {
			log, err = os.OpenFile(logFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0666)
			if err != nil {
				return nil, err
			}
			defer log.Close()
		}

		fullargs = append(fullargs, "--log-format", logFormat)
		if logrus.GetLevel() == logrus.DebugLevel {
			fullargs = append(fullargs, "--debug")
		}
	}
	fullargs = append(fullargs, cmd)
	fullargs = append(fullargs, args...)
	// The entries of Files become the child's standard input, standard output
	// and standard error, in that order.
	attr := &os.ProcAttr{
		Files: []*os.File{rdatap, wp, log},
	}
	p, err := os.StartProcess(executable, fullargs, attr)
	if err != nil {
		return nil, err
	}
	// Kill the child if anything below fails; `err` is the named result, so it
	// reflects the value of every explicit return.
	defer func() {
		if err != nil {
			p.Kill()
		}
	}()

	// Close this process's copy of the write end so that reading from rp ends
	// once the child closes its own copy.
	wp.Close()

	// Write the data if provided.
	if data != nil {
		// The read end belongs to the child now; close this process's copy.
		rdatap.Close()
		dataj, err := json.Marshal(data)
		if err != nil {
			return nil, err
		}
		_, err = wdatap.Write(dataj)
		if err != nil {
			return nil, err
		}
		// Closing the write end signals end of data to the child.
		wdatap.Close()
	}

	// Wait for the child to report its start-up result through the pipe.
	err = runhcs.GetErrorFromPipe(rp, p)
	if err != nil {
		return nil, err
	}

	if pidFile != "" {
		if err = createPidFile(pidFile, p.Pid); err != nil {
			return nil, err
		}
	}

	return p, nil
}
201
// parseSandboxAnnotations inspects the annotation map `a` for the annotation
// pairs that different runtimes (CRI, CRI-O, dockershim) use to describe a
// sandbox ID and the container type.
//
// The first runtime whose type annotation is non-empty decides the result.
// It returns `(sandboxID, isSandbox)`: `isSandbox == true` means the
// identifier names the sandbox itself, `isSandbox == false` means the
// identifier names the sandbox this container should be placed in. If no type
// annotation is present, or its value is not recognized, `("", false)` is
// returned.
func parseSandboxAnnotations(a map[string]string) (string, bool) {
	// Annotation pairs in order of precedence. dockerAlias marks the runtime
	// that spells the sandbox type as "podsandbox".
	sources := [...]struct {
		typeKey     string
		idKey       string
		dockerAlias bool
	}{
		{typeKey: "io.kubernetes.cri.container-type", idKey: "io.kubernetes.cri.sandbox-id"},
		{typeKey: "io.kubernetes.cri-o.ContainerType", idKey: "io.kubernetes.cri-o.SandboxID"},
		{typeKey: "io.kubernetes.docker.type", idKey: "io.kubernetes.sandbox.id", dockerAlias: true},
	}

	for _, src := range sources {
		kind := a[src.typeKey]
		if kind == "" {
			continue
		}
		if src.dockerAlias && kind == "podsandbox" {
			kind = "sandbox"
		}
		switch kind {
		case "container":
			return a[src.idKey], false
		case "sandbox":
			return a[src.idKey], true
		}
		// The first non-empty type annotation wins even when its value is
		// unknown; later runtimes are not consulted.
		return "", false
	}
	return "", false
}
229
230// startVMShim starts a vm-shim command with the specified `opts`. `opts` can be `uvm.OptionsWCOW` or `uvm.OptionsLCOW`
231func (c *container) startVMShim(logFile string, opts interface{}) (*os.Process, error) {
232	var os string
233	if _, ok := opts.(*uvm.OptionsLCOW); ok {
234		os = "linux"
235	} else {
236		os = "windows"
237	}
238	args := []string{"--os", os}
239	if strings.HasPrefix(logFile, runhcs.SafePipePrefix) {
240		args = append(args, "--log-pipe", logFile)
241	}
242	args = append(args, c.VMPipePath())
243	return launchShim("vmshim", "", logFile, args, opts)
244}
245
// containerConfig carries the inputs of `createContainer`.
//
// ID and Owner are stored in the persisted state. HostID optionally names an
// existing VM host container to place the new container in. PidFile and
// ShimLogFile are passed to the container shim, VMLogFile to the VM shim.
// Spec is the OCI runtime spec; its root and layer folder paths may be
// rewritten to absolute paths by `createContainer`. VMConsolePipe is applied
// to the LCOW utility VM options as `ConsolePipe` when a new VM is started.
type containerConfig struct {
	ID                     string
	Owner                  string
	HostID                 string
	PidFile                string
	ShimLogFile, VMLogFile string
	Spec                   *specs.Spec
	VMConsolePipe          string
}
255
256func createContainer(cfg *containerConfig) (_ *container, err error) {
257	// Store the container information in a volatile registry key.
258	cwd, err := os.Getwd()
259	if err != nil {
260		return nil, err
261	}
262
263	vmisolated := cfg.Spec.Linux != nil || (cfg.Spec.Windows != nil && cfg.Spec.Windows.HyperV != nil)
264
265	sandboxID, isSandbox := parseSandboxAnnotations(cfg.Spec.Annotations)
266	hostID := cfg.HostID
267	if isSandbox {
268		if sandboxID != cfg.ID {
269			return nil, errors.New("sandbox ID must match ID")
270		}
271	} else if sandboxID != "" {
272		// Validate that the sandbox container exists.
273		sandbox, err := getContainer(sandboxID, false)
274		if err != nil {
275			return nil, err
276		}
277		defer sandbox.Close()
278		if sandbox.SandboxID != sandboxID {
279			return nil, fmt.Errorf("container %s is not a sandbox", sandboxID)
280		}
281		if hostID == "" {
282			// Use the sandbox's host.
283			hostID = sandbox.HostID
284		} else if sandbox.HostID == "" {
285			return nil, fmt.Errorf("sandbox container %s is not running in a VM host, but host %s was specified", sandboxID, hostID)
286		} else if hostID != sandbox.HostID {
287			return nil, fmt.Errorf("sandbox container %s has a different host %s from the requested host %s", sandboxID, sandbox.HostID, hostID)
288		}
289		if vmisolated && hostID == "" {
290			return nil, fmt.Errorf("container %s is not a VM isolated sandbox", sandboxID)
291		}
292	}
293
294	uniqueID, err := guid.NewV4()
295	if err != nil {
296		return nil, err
297	}
298
299	newvm := false
300	var hostUniqueID guid.GUID
301	if hostID != "" {
302		host, err := getContainer(hostID, false)
303		if err != nil {
304			return nil, err
305		}
306		defer host.Close()
307		if !host.IsHost {
308			return nil, fmt.Errorf("host container %s is not a VM host", hostID)
309		}
310		hostUniqueID = host.UniqueID
311	} else if vmisolated && (isSandbox || cfg.Spec.Linux != nil || osversion.Get().Build >= osversion.RS5) {
312		// This handles all LCOW, Pod Sandbox, and (Windows Xenon V2 for RS5+)
313		hostID = cfg.ID
314		newvm = true
315		hostUniqueID = uniqueID
316	}
317
318	// Make absolute the paths in Root.Path and Windows.LayerFolders.
319	rootfs := ""
320	if cfg.Spec.Root != nil {
321		rootfs = cfg.Spec.Root.Path
322		if rootfs != "" && !filepath.IsAbs(rootfs) && !strings.HasPrefix(rootfs, `\\?\`) {
323			rootfs = filepath.Join(cwd, rootfs)
324			cfg.Spec.Root.Path = rootfs
325		}
326	}
327
328	netNS := ""
329	if cfg.Spec.Windows != nil {
330		for i, f := range cfg.Spec.Windows.LayerFolders {
331			if !filepath.IsAbs(f) && !strings.HasPrefix(rootfs, `\\?\`) {
332				cfg.Spec.Windows.LayerFolders[i] = filepath.Join(cwd, f)
333			}
334		}
335
336		// Determine the network namespace to use.
337		if cfg.Spec.Windows.Network != nil {
338			if cfg.Spec.Windows.Network.NetworkSharedContainerName != "" {
339				// RS4 case
340				err = stateKey.Get(cfg.Spec.Windows.Network.NetworkSharedContainerName, keyNetNS, &netNS)
341				if err != nil {
342					if _, ok := err.(*regstate.NoStateError); !ok {
343						return nil, err
344					}
345				}
346			} else if cfg.Spec.Windows.Network.NetworkNamespace != "" {
347				// RS5 case
348				netNS = cfg.Spec.Windows.Network.NetworkNamespace
349			}
350		}
351	}
352
353	// Store the initial container state in the registry so that the delete
354	// command can clean everything up if something goes wrong.
355	c := &container{
356		persistedState: persistedState{
357			ID:             cfg.ID,
358			Owner:          cfg.Owner,
359			Bundle:         cwd,
360			Rootfs:         rootfs,
361			Created:        time.Now(),
362			Spec:           cfg.Spec,
363			SandboxID:      sandboxID,
364			HostID:         hostID,
365			IsHost:         newvm,
366			RequestedNetNS: netNS,
367			UniqueID:       uniqueID,
368			HostUniqueID:   hostUniqueID,
369		},
370	}
371	err = stateKey.Create(cfg.ID, keyState, &c.persistedState)
372	if err != nil {
373		return nil, err
374	}
375	defer func() {
376		if err != nil {
377			c.Remove()
378		}
379	}()
380	if isSandbox && vmisolated {
381		cnicfg := cni.NewPersistedNamespaceConfig(netNS, cfg.ID, hostUniqueID)
382		err = cnicfg.Store()
383		if err != nil {
384			return nil, err
385		}
386		defer func() {
387			if err != nil {
388				cnicfg.Remove()
389			}
390		}()
391	}
392
393	// Start a VM if necessary.
394	if newvm {
395		opts, err := oci.SpecToUVMCreateOpts(context.Background(), cfg.Spec, vmID(c.ID), cfg.Owner)
396		if err != nil {
397			return nil, err
398		}
399		switch opts.(type) {
400		case *uvm.OptionsLCOW:
401			lopts := opts.(*uvm.OptionsLCOW)
402			lopts.ConsolePipe = cfg.VMConsolePipe
403		case *uvm.OptionsWCOW:
404			wopts := opts.(*uvm.OptionsWCOW)
405
406			// In order for the UVM sandbox.vhdx not to collide with the actual
407			// nested Argon sandbox.vhdx we append the \vm folder to the last entry
408			// in the list.
409			layersLen := len(cfg.Spec.Windows.LayerFolders)
410			layers := make([]string, layersLen)
411			copy(layers, cfg.Spec.Windows.LayerFolders)
412
413			vmPath := filepath.Join(layers[layersLen-1], "vm")
414			err := os.MkdirAll(vmPath, 0)
415			if err != nil {
416				return nil, err
417			}
418			layers[layersLen-1] = vmPath
419
420			wopts.LayerFolders = layers
421		}
422
423		shim, err := c.startVMShim(cfg.VMLogFile, opts)
424		if err != nil {
425			return nil, err
426		}
427		shim.Release()
428	}
429
430	if c.HostID != "" {
431		// Call to the VM shim process to create the container. This is done so
432		// that the VM process can keep track of the VM's virtual hardware
433		// resource use.
434		err = c.issueVMRequest(runhcs.OpCreateContainer)
435		if err != nil {
436			return nil, err
437		}
438		c.hc, err = hcs.OpenComputeSystem(context.Background(), cfg.ID)
439		if err != nil {
440			return nil, err
441		}
442	} else {
443		// Create the container directly from this process.
444		err = createContainerInHost(c, nil)
445		if err != nil {
446			return nil, err
447		}
448	}
449
450	// Create the shim process for the container.
451	err = startContainerShim(c, cfg.PidFile, cfg.ShimLogFile)
452	if err != nil {
453		if e := c.Kill(); e == nil {
454			c.Remove()
455		}
456		return nil, err
457	}
458
459	return c, nil
460}
461
462func (c *container) ShimPipePath() string {
463	return runhcs.SafePipePath("runhcs-shim-" + c.UniqueID.String())
464}
465
466func (c *container) VMPipePath() string {
467	return runhcs.VMPipePath(c.HostUniqueID)
468}
469
470func (c *container) VMIsolated() bool {
471	return c.HostID != ""
472}
473
474func (c *container) unmountInHost(vm *uvm.UtilityVM, all bool) error {
475	r := &resources.Resources{}
476	err := stateKey.Get(c.ID, keyResources, r)
477	if _, ok := err.(*regstate.NoStateError); ok {
478		return nil
479	}
480	if err != nil {
481		return err
482	}
483	err = resources.ReleaseResources(context.Background(), r, vm, all)
484	if err != nil {
485		stateKey.Set(c.ID, keyResources, r)
486		return err
487	}
488
489	err = stateKey.Clear(c.ID, keyResources)
490	if err != nil {
491		return err
492	}
493	return nil
494}
495
496func (c *container) Unmount(all bool) error {
497	if c.VMIsolated() {
498		op := runhcs.OpUnmountContainerDiskOnly
499		if all {
500			op = runhcs.OpUnmountContainer
501		}
502		err := c.issueVMRequest(op)
503		if err != nil {
504			if _, ok := err.(*noVMError); ok {
505				logrus.WithFields(logrus.Fields{
506					logfields.ContainerID: c.ID,
507					logfields.UVMID:       c.HostID,
508					logrus.ErrorKey:       errors.New("failed to unmount container resources"),
509				}).Warning("VM shim could not be contacted")
510			} else {
511				return err
512			}
513		}
514	} else {
515		c.unmountInHost(nil, false)
516	}
517	return nil
518}
519
520func createContainerInHost(c *container, vm *uvm.UtilityVM) (err error) {
521	if c.hc != nil {
522		return errors.New("container already created")
523	}
524
525	// Create the container without starting it.
526	opts := &hcsoci.CreateOptions{
527		ID:               c.ID,
528		Owner:            c.Owner,
529		Spec:             c.Spec,
530		HostingSystem:    vm,
531		NetworkNamespace: c.RequestedNetNS,
532	}
533	vmid := ""
534	if vm != nil {
535		vmid = vm.ID()
536	}
537	logrus.WithFields(logrus.Fields{
538		logfields.ContainerID: c.ID,
539		logfields.UVMID:       vmid,
540	}).Info("creating container in UVM")
541	hc, r, err := hcsoci.CreateContainer(context.Background(), opts)
542	if err != nil {
543		return err
544	}
545	defer func() {
546		if err != nil {
547			hc.Terminate(context.Background())
548			hc.Wait()
549			resources.ReleaseResources(context.Background(), r, vm, true)
550		}
551	}()
552
553	// Record the network namespace to support namespace sharing by container ID.
554	if r.NetNS() != "" {
555		err = stateKey.Set(c.ID, keyNetNS, r.NetNS())
556		if err != nil {
557			return err
558		}
559	}
560
561	err = stateKey.Set(c.ID, keyResources, r)
562	if err != nil {
563		return err
564	}
565	c.hc = hc.(*hcs.System)
566	return nil
567}
568
569func startContainerShim(c *container, pidFile, logFile string) error {
570	// Launch a shim process to later execute a process in the container.
571	shim, err := startProcessShim(c.ID, pidFile, logFile, nil)
572	if err != nil {
573		return err
574	}
575	defer shim.Release()
576	defer func() {
577		if err != nil {
578			shim.Kill()
579		}
580	}()
581
582	c.ShimPid = shim.Pid
583	err = stateKey.Set(c.ID, keyShimPid, shim.Pid)
584	if err != nil {
585		return err
586	}
587
588	if pidFile != "" {
589		if err = createPidFile(pidFile, shim.Pid); err != nil {
590			return err
591		}
592	}
593
594	return nil
595}
596
597func (c *container) Close() error {
598	if c.hc == nil {
599		return nil
600	}
601	return c.hc.Close()
602}
603
604func (c *container) Exec() error {
605	err := c.hc.Start(context.Background())
606	if err != nil {
607		return err
608	}
609
610	if c.Spec.Process == nil {
611		return nil
612	}
613
614	// Alert the shim that the container is ready.
615	pipe, err := winio.DialPipe(c.ShimPipePath(), nil)
616	if err != nil {
617		return err
618	}
619	defer pipe.Close()
620
621	shim, err := os.FindProcess(c.ShimPid)
622	if err != nil {
623		return err
624	}
625	defer shim.Release()
626
627	err = runhcs.GetErrorFromPipe(pipe, shim)
628	if err != nil {
629		return err
630	}
631
632	return nil
633}
634
635func getContainer(id string, notStopped bool) (*container, error) {
636	var c container
637	err := stateKey.Get(id, keyState, &c.persistedState)
638	if err != nil {
639		return nil, err
640	}
641	err = stateKey.Get(id, keyShimPid, &c.ShimPid)
642	if err != nil {
643		if _, ok := err.(*regstate.NoStateError); !ok {
644			return nil, err
645		}
646		c.ShimPid = -1
647	}
648	if notStopped && c.ShimPid == 0 {
649		return nil, errContainerStopped
650	}
651
652	hc, err := hcs.OpenComputeSystem(context.Background(), c.ID)
653	if err == nil {
654		c.hc = hc
655	} else if !hcs.IsNotExist(err) {
656		return nil, err
657	} else if notStopped {
658		return nil, errContainerStopped
659	}
660
661	return &c, nil
662}
663
664func (c *container) Remove() error {
665	// Unmount any layers or mapped volumes.
666	err := c.Unmount(!c.IsHost)
667	if err != nil {
668		return err
669	}
670
671	// Follow kata's example and delay tearing down the VM until the owning
672	// container is removed.
673	if c.IsHost {
674		vm, err := hcs.OpenComputeSystem(context.Background(), vmID(c.ID))
675		if err == nil {
676			vm.Terminate(context.Background())
677			vm.Wait()
678		}
679	}
680	return stateKey.Remove(c.ID)
681}
682
683func (c *container) Kill() error {
684	if c.hc == nil {
685		return nil
686	}
687	c.hc.Terminate(context.Background())
688	return c.hc.Wait()
689}
690
691func (c *container) Status() (containerStatus, error) {
692	if c.hc == nil || c.ShimPid == 0 {
693		return containerStopped, nil
694	}
695	props, err := c.hc.Properties(context.Background())
696	if err != nil {
697		if !strings.Contains(err.Error(), "operation is not valid in the current state") {
698			return "", err
699		}
700		return containerUnknown, nil
701	}
702	state := containerUnknown
703	switch props.State {
704	case "", "Created":
705		state = containerCreated
706	case "Running":
707		state = containerRunning
708	case "Paused":
709		state = containerPaused
710	case "Stopped":
711		state = containerStopped
712	}
713	return state, nil
714}
715